""" EXP-081 Programmatic Number Matching: Basin-Breaking Measurement Zero LLM involvement. Extracts every number from generated documents, searches for each in the source text. Classifies: IN_SOURCE — exact number appears in source document NOT_IN_SOURCE — number absent from source (parametric/fabricated/derived) No judgment calls, no verifier bias, no EXTENDS ambiguity. Measures what percentage of numerical claims come from source vs elsewhere. """ import json import os import re from collections import defaultdict SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources") def load_source(filename): path = os.path.join(SOURCE_DIR, filename) with open(path, "r") as f: return f.read() SOURCES = { "remote_work": load_source("source_1_remote_work.md"), "communication": load_source("source_3_communication.md"), "ai_workflows": load_source("source_4_ai_workflows.md"), } def extract_numbers(text): """Extract all numerical values from text with context. Returns list of dicts: {value, raw, context, type} Types: percentage, dollar, decimal, integer """ numbers = [] seen = set() # avoid duplicates from overlapping patterns # Percentages: 43%, 43.5%, 43 percent for m in re.finditer(r'(\d+(?:\.\d+)?)\s*%', text): val = m.group(1) ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "percentage", m.start()) if key not in seen: seen.add(key) numbers.append({ "value": val, "raw": m.group(0), "context": context, "type": "percentage", }) # Dollar amounts: $55, $15, $61 for m in re.finditer(r'\$(\d+(?:\.\d+)?(?:,\d{3})*)', text): val = m.group(1).replace(",", "") ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "dollar", m.start()) if key not in seen: seen.add(key) numbers.append({ "value": val, "raw": m.group(0), "context": context, "type": "dollar", }) # Decimal numbers not already caught (e.g., 2.6) for m in re.finditer(r'(? 0 else 0 all_results.append({ "tier": tier, "topic": topic, "version": version, "total_numbers": total, "in_source": len(in_source), "not_in_source": len(not_in_source), "years_excluded": len(years), "word_counts_excluded": len(word_counts), "unsourced_rate": round(unsourced_rate, 3), "unsourced_details": [ {"value": n["value"], "type": n["type"], "raw": n["raw"], "context": n["context"]} for n in not_in_source ], }) print(f" {tier} | {topic} v{version}: " f"{total} numbers ({len(in_source)} sourced, " f"{len(not_in_source)} unsourced [{unsourced_rate:.1%}], " f"excl {len(years)}yr {len(word_counts)}wc)") # === Summary by Tier === print(f"\n{'='*60}") print(f" BY TIER") print(f"{'='*60}") for tier_name in ["T1_BASIC", "T2_STANDARD", "T3_REFINED", "T4_AGENTIC"]: tier_r = [r for r in all_results if r["tier"] == tier_name] if tier_r: total = sum(r["total_numbers"] for r in tier_r) sourced = sum(r["in_source"] for r in tier_r) unsourced = sum(r["not_in_source"] for r in tier_r) rate = unsourced / total if total > 0 else 0 print(f" {tier_name}: {total} numbers — " f"{sourced} sourced, " f"{unsourced} unsourced [{rate:.1%}]") # === Summary by Topic === print(f"\n{'='*60}") print(f" BY TOPIC") print(f"{'='*60}") for topic_key in SOURCES: topic_r = [r for r in all_results if r["topic"] == topic_key] if topic_r: total = sum(r["total_numbers"] for r in topic_r) sourced = sum(r["in_source"] for r in topic_r) unsourced = sum(r["not_in_source"] for r in topic_r) rate = unsourced / total if total > 0 else 0 print(f" {topic_key}: {total} numbers — " f"{sourced} sourced, " f"{unsourced} unsourced [{rate:.1%}]") # === Tier × Topic Matrix === print(f"\n{'='*60}") print(f" UNSOURCED RATE MATRIX (Tier × Topic)") print(f"{'='*60}") print(f" {'':20s} {'remote_work':>14s} {'communication':>14s} {'ai_workflows':>14s}") for tier_name in ["T1_BASIC", "T2_STANDARD", "T3_REFINED", "T4_AGENTIC"]: row = f" {tier_name:20s}" for topic_key in ["remote_work", "communication", "ai_workflows"]: cell = [r for r in all_results if r["tier"] == tier_name and r["topic"] == topic_key] if cell: total = sum(r["total_numbers"] for r in cell) unsourced = sum(r["not_in_source"] for r in cell) rate = unsourced / total if total > 0 else 0 row += f" {unsourced}/{total} ({rate:.0%})".rjust(14) print(row) # === Unsourced numbers detail === print(f"\n{'='*60}") print(f" UNSOURCED NUMBERS (all)") print(f"{'='*60}") for r in all_results: if r["unsourced_details"]: print(f"\n {r['tier']} | {r['topic']} v{r['version']}:") for detail in r["unsourced_details"]: print(f" {detail['raw']:>8s} ({detail['type']:>10s}) " f" ...{detail['context']}...") # === Save === output_path = os.path.join(os.path.dirname(__file__), "exp081_number_match_results.json") output = { "experiment": "EXP-081-number-match", "description": ( "Programmatic number extraction and source matching. " "Zero LLM involvement. Extracts every number from generated " "documents, checks if it appears in source. " "Measures source-grounding rate without verifier bias." ), "method": "regex extraction + string matching", "numbers_excluded": "years (2019-2030), single digits", "results": all_results, } with open(output_path, "w") as f: json.dump(output, f, indent=2) print(f"\n Saved: {output_path}") if __name__ == "__main__": main()