""" Claim extraction pipeline for the evaluation stack. Extracts specific, concrete claims from LLM-generated analytical documents. Foundation module for Layers 1-3 of the evaluation stack (EXP-078). Claim types: NUMERICAL — sentences with specific numbers, percentages, dollar amounts CAUSAL — sentences asserting causal relationships HEADING — document section headings No LLM judgment required. Pure regex extraction. """ import re from collections import defaultdict # ============================================================ # Sentence Splitting # ============================================================ def split_sentences(text): """Split markdown document into (heading, sentence, paragraph_idx) tuples. Respects markdown structure: headings tracked as context, body text split on sentence boundaries. """ results = [] current_heading = "" para_idx = 0 for line in text.split('\n'): stripped = line.strip() if not stripped: continue # Detect headings if stripped.startswith('#'): current_heading = re.sub(r'^#+\s*', '', stripped) continue # Strip list markers for cleaner sentences cleaned = re.sub(r'^[-*•]\s+', '', stripped) cleaned = re.sub(r'^\d+\.\s+', '', cleaned) # Split on sentence boundaries parts = re.split(r'(?<=[.!?])\s+(?=[A-Z])', cleaned) for sent in parts: sent = sent.strip() if len(sent) > 30: # Skip fragments results.append((current_heading, sent, para_idx)) para_idx += 1 return results def extract_headings(text): """Extract all markdown headings from document.""" headings = [] for line in text.split('\n'): stripped = line.strip() if stripped.startswith('#'): level = len(stripped) - len(stripped.lstrip('#')) title = re.sub(r'^#+\s*', '', stripped) if title: headings.append({'level': level, 'text': title}) return headings # ============================================================ # Numerical Claims # ============================================================ NUMERICAL_PATTERNS = [ (r'(?:~|approximately |about |roughly |nearly |over |under )?(\d+(?:\.\d+)?)\s*%', 'percentage'), (r'(\d+(?:\.\d+)?)\s*[-–]\s*(\d+(?:\.\d+)?)\s*%', 'pct_range'), (r'\$\s*(\d+(?:[.,]\d+)*)\s*(M|B|K|million|billion|thousand|mn|bn)?', 'dollar'), (r'(\d+(?:\.\d+)?)\s*[x×](?:\s|$|,)', 'multiplier'), (r'(\d+(?:\.\d+)?)\s*-?\s*fold', 'multiplier'), (r'(\d+(?:,\d{3})*)\s+(?:companies|firms|teams|organizations|employees|' r'engineers|developers|users|customers|tools|platforms|products|projects|' r'systems|failures|incidents|outages|services|applications|repositories|' r'modules|microservices|endpoints|APIs?|databases?|clusters?|regions?)', 'entity_count'), (r'(\d+(?:\.\d+)?)\s*[-–]?\s*(?:\d+(?:\.\d+)?\s*)?' r'(?:days?|weeks?|months?|years?|hours?|minutes?|quarters?|sprints?)', 'duration'), ] def extract_numerical_claims(text): """Extract all sentences containing specific numbers. Returns list of dicts: sentence: full sentence text heading: section heading context numbers: list of {type, value, raw} found in sentence position: paragraph index """ sentences = split_sentences(text) claims = [] for heading, sent, pos in sentences: numbers = [] for pattern, num_type in NUMERICAL_PATTERNS: for m in re.finditer(pattern, sent, re.IGNORECASE): raw = m.group(0).strip() try: val = float(m.group(1).replace(',', '')) except (ValueError, IndexError): val = None numbers.append({ 'type': num_type, 'value': val, 'raw': raw, }) if numbers: claims.append({ 'sentence': sent, 'heading': heading, 'numbers': numbers, 'position': pos, }) return claims # ============================================================ # Causal Claims # ============================================================ CAUSAL_MARKERS = [ r'\bbecause\b', r'\bsince\b(?!\s+\d)', r'\bdue to\b', r'\bowing to\b', r'\bas a result of\b', r'\bcaused? by\b', r'\bdriven by\b', r'\bleads? to\b', r'\bresults? in\b', r'\bcauses?\b', r'\bproduces?\b', r'\bgenerates?\b', r'\btriggers?\b', r'\bconsequently\b', r'\btherefore\b', r'\bthus\b', r'\bhence\b', r'\bthe (?:primary|main|key|root|fundamental|core|underlying|central) ' r'(?:cause|reason|driver|factor|mechanism|force)\b', r'\b(?:directly|indirectly) (?:causes?|leads? to|results? in|drives?)\b', r'\bis responsible for\b', r'\baccounts? for\b', r'\benables?\b', r'\bprevents?\b', r'\binhibits?\b', r'\bfacilitates?\b', r'\bexacerbates?\b', r'\bcompounds?\b(?:\s+the)', r'\bamplifies?\b', r'\breinforces?\b', r'\bundermines?\b', r'\berodes?\b', ] def extract_causal_claims(text): """Extract sentences asserting causal relationships. Returns list of dicts: sentence: full sentence text heading: section heading context markers: list of causal markers found position: paragraph index """ sentences = split_sentences(text) combined = '|'.join(CAUSAL_MARKERS) claims = [] for heading, sent, pos in sentences: found = re.findall(combined, sent, re.IGNORECASE) if found: claims.append({ 'sentence': sent, 'heading': heading, 'markers': [m.strip().lower() for m in found if isinstance(m, str)], 'position': pos, }) return claims # ============================================================ # Combined Extraction # ============================================================ def extract_all(text): """Extract all claim types from document. Returns dict: headings: list of heading dicts numerical: list of numerical claim dicts causal: list of causal claim dicts stats: summary counts """ headings = extract_headings(text) numerical = extract_numerical_claims(text) causal = extract_causal_claims(text) pct_values = set() for claim in numerical: for n in claim['numbers']: if n['type'] == 'percentage' and n['value'] is not None: pct_values.add(n['value']) return { 'headings': headings, 'numerical': numerical, 'causal': causal, 'stats': { 'n_headings': len(headings), 'n_numerical': len(numerical), 'n_causal': len(causal), 'n_unique_percentages': len(pct_values), 'percentage_values': sorted(pct_values), } } # ============================================================ # Cross-version Comparison (for Temporal Consistency) # ============================================================ def compare_numerical_across_versions(version_claims_list): """Compare numerical claims across multiple generations of same document. Args: version_claims_list: list of extract_all() results, one per version Returns dict with stability metrics, fabrication rate, and examples. """ n_versions = len(version_claims_list) # --- Percentage stability --- all_pct_sets = [] pct_contexts = defaultdict(list) for v_idx, v_data in enumerate(version_claims_list): pct_set = set() for claim in v_data['numerical']: for n in claim['numbers']: if n['type'] == 'percentage' and n['value'] is not None: pct_set.add(n['value']) pct_contexts[n['value']].append({ 'version': v_idx, 'sentence': claim['sentence'][:250], 'heading': claim['heading'], }) all_pct_sets.append(pct_set) all_pct_values = set().union(*all_pct_sets) if all_pct_sets else set() stability = {} for val in all_pct_values: count = sum(val in s for s in all_pct_sets) stability[val] = count stable = sorted(v for v, c in stability.items() if c == n_versions) variable = sorted(v for v, c in stability.items() if c < n_versions) total = len(all_pct_values) # --- Dollar amount stability --- all_dollar_sets = [] dollar_contexts = defaultdict(list) for v_idx, v_data in enumerate(version_claims_list): dollar_set = set() for claim in v_data['numerical']: for n in claim['numbers']: if n['type'] == 'dollar' and n['value'] is not None: dollar_set.add(n['value']) dollar_contexts[n['value']].append({ 'version': v_idx, 'sentence': claim['sentence'][:250], }) all_dollar_sets.append(dollar_set) all_dollar_values = set().union(*all_dollar_sets) if all_dollar_sets else set() dollar_stable = sorted(v for v in all_dollar_values if sum(v in s for s in all_dollar_sets) == n_versions) dollar_variable = sorted(v for v in all_dollar_values if sum(v in s for s in all_dollar_sets) < n_versions) # --- All numbers (any type) --- all_num_sets = [] for v_data in version_claims_list: num_set = set() for claim in v_data['numerical']: for n in claim['numbers']: if n['value'] is not None: num_set.add((n['type'], n['value'])) all_num_sets.append(num_set) all_nums = set().union(*all_num_sets) if all_num_sets else set() all_nums_stable = sum( 1 for v in all_nums if sum(v in s for s in all_num_sets) == n_versions ) all_nums_variable = sum( 1 for v in all_nums if sum(v in s for s in all_num_sets) < n_versions ) # --- Heading overlap (Jaccard) --- heading_sets = [] for v_data in version_claims_list: h_set = set(h['text'].lower().strip() for h in v_data['headings']) heading_sets.append(h_set) if heading_sets: all_headings = set().union(*heading_sets) shared_headings = set.intersection(*heading_sets) if heading_sets else set() heading_jaccard = len(shared_headings) / len(all_headings) if all_headings else 1.0 else: heading_jaccard = 1.0 shared_headings = set() all_headings = set() # --- Claim count stability (coefficient of variation) --- num_counts = [v_data['stats']['n_numerical'] for v_data in version_claims_list] causal_counts = [v_data['stats']['n_causal'] for v_data in version_claims_list] def cv(vals): if not vals or max(vals) == 0: return 0.0 mean = sum(vals) / len(vals) if mean == 0: return 0.0 variance = sum((v - mean) ** 2 for v in vals) / len(vals) return (variance ** 0.5) / mean # --- Examples --- examples = {'stable': [], 'variable': []} for val in stable[:3]: examples['stable'].append({ 'value': f'{val}%', 'contexts': pct_contexts[val][:n_versions] }) for val in variable[:5]: examples['variable'].append({ 'value': f'{val}%', 'in_versions': [ctx['version'] for ctx in pct_contexts[val]], 'contexts': pct_contexts[val] }) return { 'percentage_stability': {str(v): c for v, c in sorted(stability.items())}, 'stable_percentages': stable, 'variable_percentages': variable, 'n_pct_stable': len(stable), 'n_pct_variable': len(variable), 'n_pct_total': total, 'fabrication_rate_pct': len(variable) / total if total > 0 else None, 'dollar_stable': dollar_stable, 'dollar_variable': dollar_variable, 'n_dollar_stable': len(dollar_stable), 'n_dollar_variable': len(dollar_variable), 'all_numbers_stable': all_nums_stable, 'all_numbers_variable': all_nums_variable, 'all_numbers_total': len(all_nums), 'all_numbers_fab_rate': all_nums_variable / len(all_nums) if all_nums else None, 'heading_jaccard': round(heading_jaccard, 3), 'n_shared_headings': len(shared_headings), 'n_total_headings': len(all_headings), 'shared_heading_texts': sorted(shared_headings), 'numerical_counts': num_counts, 'causal_counts': causal_counts, 'numerical_count_cv': round(cv(num_counts), 3), 'causal_count_cv': round(cv(causal_counts), 3), 'examples': examples, } # ============================================================ # Self-test # ============================================================ if __name__ == '__main__': test_doc = """# Why API Design Decisions Become Permanent ## Backward compatibility multiplies constraints Every deployed client is a standing invariant. Approximately 70% of API breaking changes originate from semantic drift rather than syntactic changes. The cost of maintaining backward compatibility grows 3x with each major version, reaching $2M annually for mid-size platforms. ## Implicit assumptions create fragility Clients rely on behaviors not present in schemas. Because response ordering is often assumed but never guaranteed, this leads to subtle failures that cascade across 15-20 dependent services. ## Testing costs grow nonlinearly Proving compatibility requires testing across combinations. Therefore, teams that maintain 5+ API versions spend roughly 40% of engineering time on compatibility testing alone. """ result = extract_all(test_doc) print(f"Headings: {len(result['headings'])}") for h in result['headings']: print(f" [{h['level']}] {h['text']}") print(f"\nNumerical claims: {len(result['numerical'])}") for c in result['numerical']: nums = ', '.join(f"{n['type']}={n['raw']}" for n in c['numbers']) print(f" [{c['heading'][:40]}] {nums}") print(f" \"{c['sentence'][:120]}...\"") print(f"\nCausal claims: {len(result['causal'])}") for c in result['causal']: print(f" markers: {c['markers']}") print(f" \"{c['sentence'][:120]}...\"") print(f"\nStats: {result['stats']}") print("\nSelf-test PASSED.")