"""
EXP-078: Temporal Consistency Kill-Test (Evaluation Stack Layer 2)

Question: Do numerical claims vary across independent generations of the
same prompt?

Data: fpr_estimation_docs.json
  20 topics × 3 STANDARD versions (same prompt, independent runs,
  gemini-3.1-flash-lite-preview)
  5 topics × 2 BASIC versions (for condition comparison)

Method: Extract all numerical claims (percentages, dollar amounts, counts)
from each version. Measure cross-version stability: which specific numbers
appear in all 3, only 2, or only 1 version.
  Variable numbers = fabricated specificity candidates
  Stable numbers = knowledge retrieval candidates

Kill signals:
  KS1: <3 numerical claims per document → insufficient signal density
  KS2: No discrimination between STANDARD and BASIC → doesn't separate conditions
  KS3: All claims stable (<10% fabrication) → no fabrication to detect
  KS4: All claims variable (>90% fabrication) → no gradient to exploit
"""

import json
import statistics
from pathlib import Path
from collections import defaultdict

from claim_extraction import extract_all, compare_numerical_across_versions

DATA_DIR = Path(__file__).parent


def _format_rate(rate):
    """Format a fractional rate as a whole percentage, or "N/A" when it is None."""
    return f"{rate:.0%}" if rate is not None else "N/A"


def _rounded_mean(values, ndigits):
    """Return the mean of *values* rounded to *ndigits*, or None when empty."""
    return round(statistics.mean(values), ndigits) if values else None


def main() -> None:
    """Run the EXP-078 kill-test: report to stdout and save a JSON summary.

    Reads ``fpr_estimation_docs.json`` from the script directory, groups the
    documents by ``(topic_id, condition)``, runs ``extract_all`` on every
    document and ``compare_numerical_across_versions`` on every group, prints
    per-topic, aggregate and kill-signal sections, and writes
    ``exp078_temporal_consistency_results.json`` next to the script.
    """
    # === Load data ===
    # Explicit encoding: the documents are free text and must not depend on
    # the platform's locale default.
    with open(DATA_DIR / 'fpr_estimation_docs.json', encoding='utf-8') as f:
        docs = json.load(f)

    # Group by (topic_id, condition)
    groups = defaultdict(list)
    for doc in docs:
        groups[(doc['topic_id'], doc['condition'])].append(doc)

    triple_groups = {
        k: sorted(v, key=lambda d: d.get('sample_idx', 0))
        for k, v in groups.items()
        if len(v) >= 3
    }
    double_groups = {
        k: sorted(v, key=lambda d: d.get('sample_idx', 0))
        for k, v in groups.items()
        if len(v) == 2
    }

    print("=" * 70)
    print("EXP-078: TEMPORAL CONSISTENCY KILL-TEST")
    print("=" * 70)
    print(f"Data: fpr_estimation_docs.json ({len(docs)} docs)")
    print("Generator: gemini-3.1-flash-lite-preview")
    print(f"3-version groups: {len(triple_groups)} (STANDARD)")
    print(f"2-version groups: {len(double_groups)} (BASIC)")
    print()

    # === Extract and compare each topic ===
    all_results = {}
    print("-" * 70)
    print("PER-TOPIC RESULTS (3 versions, STANDARD)")
    print("-" * 70)

    for (topic_id, condition), doc_group in sorted(triple_groups.items()):
        version_data = [extract_all(doc['text']) for doc in doc_group]
        comparison = compare_numerical_across_versions(version_data)
        all_results[f"{topic_id}_{condition}"] = comparison

        # Groups may hold more than three versions (len >= 3), so report the
        # real denominator instead of a literal 3.
        n_versions = len(doc_group)
        fab_str = _format_rate(comparison['fabrication_rate_pct'])
        all_fab_str = _format_rate(comparison['all_numbers_fab_rate'])

        print(f"\n{topic_id}:")
        print(f" Numerical claims/version: {comparison['numerical_counts']}")
        print(f" Causal claims/version: {comparison['causal_counts']}")
        print(f" Heading Jaccard: {comparison['heading_jaccard']}")
        print(f" Percentages — total: {comparison['n_pct_total']}, "
              f"stable: {comparison['n_pct_stable']}, "
              f"variable: {comparison['n_pct_variable']} "
              f"(fab rate: {fab_str})")
        print(f" All numbers — total: {comparison['all_numbers_total']}, "
              f"stable: {comparison['all_numbers_stable']}, "
              f"variable: {comparison['all_numbers_variable']} "
              f"(fab rate: {all_fab_str})")

        # Show examples of variable claims
        if comparison['examples']['variable']:
            print(" Examples of VARIABLE percentages:")
            for ex in comparison['examples']['variable'][:3]:
                versions_in = ex.get('in_versions', [])
                print(f" {ex['value']} (in {len(versions_in)}/{n_versions} versions):")
                for ctx in ex['contexts'][:2]:
                    snippet = ctx['sentence'][:130]
                    print(f" v{ctx['version']}: \"{snippet}...\"")

        if comparison['examples']['stable']:
            print(" Examples of STABLE percentages:")
            for ex in comparison['examples']['stable'][:2]:
                print(f" {ex['value']} — consistent across all {n_versions} versions")

    # === BASIC comparison ===
    basic_results = {}
    # Maps str(topic_id) -> key in basic_results, so the matched comparison
    # never has to re-derive the topic or assume the condition's name.
    basic_key_by_topic = {}
    if double_groups:
        print()
        print("-" * 70)
        print("BASIC CONDITION (2 versions)")
        print("-" * 70)

        for (topic_id, condition), doc_group in sorted(double_groups.items()):
            version_data = [extract_all(doc['text']) for doc in doc_group]
            comparison = compare_numerical_across_versions(version_data)
            key = f"{topic_id}_{condition}"
            basic_results[key] = comparison
            basic_key_by_topic[str(topic_id)] = key

            fab_str = _format_rate(comparison['fabrication_rate_pct'])

            print(f"\n{topic_id} [BASIC]:")
            print(f" Numerical claims/version: {comparison['numerical_counts']}")
            print(f" Percentages — total: {comparison['n_pct_total']}, "
                  f"stable: {comparison['n_pct_stable']}, "
                  f"variable: {comparison['n_pct_variable']} "
                  f"(fab rate: {fab_str})")
            print(f" All numbers — total: {comparison['all_numbers_total']}, "
                  f"stable: {comparison['all_numbers_stable']}, "
                  f"variable: {comparison['all_numbers_variable']}")

    # ================================================================
    # AGGREGATE ANALYSIS
    # ================================================================
    print()
    print("=" * 70)
    print("AGGREGATE ANALYSIS")
    print("=" * 70)

    # --- Percentage fabrication rates ---
    std_results = list(all_results.values())
    std_pct_rates = [r['fabrication_rate_pct'] for r in std_results
                     if r['fabrication_rate_pct'] is not None]
    std_all_rates = [r['all_numbers_fab_rate'] for r in std_results
                     if r['all_numbers_fab_rate'] is not None]
    std_pct_totals = [r['n_pct_total'] for r in std_results]
    std_all_totals = [r['all_numbers_total'] for r in std_results]
    std_num_counts = [statistics.mean(r['numerical_counts']) for r in std_results]
    std_causal_counts = [statistics.mean(r['causal_counts']) for r in std_results]
    std_heading_j = [r['heading_jaccard'] for r in std_results]

    if std_pct_rates:
        print(f"\nSTANDARD (n={len(std_pct_rates)} topics):")
        print(" Percentage fabrication rate:")
        print(f" Mean: {statistics.mean(std_pct_rates):.1%}")
        print(f" Median: {statistics.median(std_pct_rates):.1%}")
        print(f" Range: {min(std_pct_rates):.1%} — {max(std_pct_rates):.1%}")
        # stdev needs at least two data points; skip the line entirely
        # rather than printing an empty one.
        if len(std_pct_rates) > 1:
            print(f" Stdev: {statistics.stdev(std_pct_rates):.1%}")
        # statistics.mean raises StatisticsError on empty input.
        if std_all_rates:
            print(" All-number fabrication rate:")
            print(f" Mean: {statistics.mean(std_all_rates):.1%}")
            print(f" Median: {statistics.median(std_all_rates):.1%}")
            print(f" Range: {min(std_all_rates):.1%} — {max(std_all_rates):.1%}")
        print(" Signal density:")
        print(f" Mean percentages per topic: {statistics.mean(std_pct_totals):.1f}")
        print(f" Mean all numbers per topic: {statistics.mean(std_all_totals):.1f}")
        print(f" Mean numerical claims/doc: {statistics.mean(std_num_counts):.1f}")
        print(f" Mean causal claims/doc: {statistics.mean(std_causal_counts):.1f}")
        print(" Structure stability:")
        print(f" Mean heading Jaccard: {statistics.mean(std_heading_j):.3f}")

    # --- BASIC rates ---
    bas_pct_rates = [r['fabrication_rate_pct'] for r in basic_results.values()
                     if r['fabrication_rate_pct'] is not None]
    bas_all_rates = [r['all_numbers_fab_rate'] for r in basic_results.values()
                     if r['all_numbers_fab_rate'] is not None]

    if bas_pct_rates:
        print(f"\nBASIC (n={len(bas_pct_rates)} topics):")
        print(" Percentage fabrication rate:")
        print(f" Mean: {statistics.mean(bas_pct_rates):.1%}")
        print(f" Range: {min(bas_pct_rates):.1%} — {max(bas_pct_rates):.1%}")
        if bas_all_rates:
            print(" All-number fabrication rate:")
            print(f" Mean: {statistics.mean(bas_all_rates):.1%}")

    # --- STANDARD vs BASIC on matched topics ---
    print()
    print("-" * 70)
    print("STANDARD vs BASIC (matched topics)")
    print("-" * 70)

    matched_topics = sorted(
        topic for topic in basic_key_by_topic
        if f"{topic}_STANDARD" in all_results
    )

    matched_std_rates = []
    matched_bas_rates = []
    for topic in matched_topics:
        std_r = all_results[f"{topic}_STANDARD"]
        # Use the recorded key: the 2-version group's condition is not
        # guaranteed to be literally "BASIC".
        bas_r = basic_results[basic_key_by_topic[topic]]
        std_fab = std_r['fabrication_rate_pct']
        bas_fab = bas_r['fabrication_rate_pct']

        print(f" {topic}:")
        if std_fab is not None:
            print(f" STANDARD: fab={std_fab:.0%} "
                  f"({std_r['n_pct_stable']}S/{std_r['n_pct_variable']}V "
                  f"of {std_r['n_pct_total']} pcts)")
            matched_std_rates.append(std_fab)
        else:
            print(" STANDARD: no percentages found")

        if bas_fab is not None:
            print(f" BASIC: fab={bas_fab:.0%} "
                  f"({bas_r['n_pct_stable']}S/{bas_r['n_pct_variable']}V "
                  f"of {bas_r['n_pct_total']} pcts)")
            matched_bas_rates.append(bas_fab)
        else:
            print(" BASIC: no percentages found")

    # Difference of matched means; None when either side has no usable rate.
    condition_diff = None
    if matched_std_rates and matched_bas_rates:
        std_mean = statistics.mean(matched_std_rates)
        bas_mean = statistics.mean(matched_bas_rates)
        condition_diff = std_mean - bas_mean
        print("\n Matched comparison:")
        print(f" STANDARD mean fab rate: {std_mean:.1%}")
        print(f" BASIC mean fab rate: {bas_mean:.1%}")
        print(f" Difference: {condition_diff:+.1%}")

    # ================================================================
    # KILL SIGNAL ASSESSMENT
    # ================================================================
    print()
    print("=" * 70)
    print("KILL SIGNAL ASSESSMENT")
    print("=" * 70)

    # KS1: Signal density
    n_topics = len(std_pct_totals)
    low_signal = sum(1 for t in std_pct_totals if t < 3)
    if std_pct_totals:
        print("\nKS1 — Signal density:")
        print(f" Topics with <3 unique percentages: {low_signal}/{n_topics}")
        print(f" Mean unique percentages per topic: {statistics.mean(std_pct_totals):.1f}")
        print(f" Mean all-type numbers per topic: {statistics.mean(std_all_totals):.1f}")
        if low_signal > n_topics * 0.5:
            print(" >>> KILL SIGNAL: >50% of topics have insufficient percentage claims")
            print(" >>> BUT: check all-number totals — broader extraction may save it")
        else:
            print(f" >>> PASS: {n_topics - low_signal}/{n_topics} "
                  f"topics have sufficient data")

    # KS2: Discrimination
    if condition_diff is not None:
        print("\nKS2 — Condition discrimination:")
        print(f" STANDARD-BASIC fab rate difference: {condition_diff:+.1%}")
        if abs(condition_diff) < 0.05:
            print(" >>> WARNING: <5% difference — may not discriminate conditions")
        else:
            direction = "more" if condition_diff > 0 else "less"
            print(f" >>> STANDARD is {direction} fabricated than BASIC "
                  f"by {abs(condition_diff):.1%}")

    # KS3: All stable
    if std_pct_rates:
        mean_fab = statistics.mean(std_pct_rates)
        print("\nKS3 — Fabrication detectable?")
        print(f" Mean fabrication rate: {mean_fab:.1%}")
        if mean_fab < 0.10:
            print(" >>> KILL SIGNAL: <10% fabrication — insufficient variance to exploit")
        elif mean_fab < 0.20:
            print(" >>> WEAK: 10-20% fabrication — detectable but thin signal")
        else:
            print(f" >>> PASS: {mean_fab:.0%} fabrication rate — clear signal")

    # KS4: All variable
    if std_pct_rates:
        print("\nKS4 — Gradient exists?")
        print(f" Range: {min(std_pct_rates):.0%} — {max(std_pct_rates):.0%}")
        if min(std_pct_rates) > 0.90:
            print(" >>> KILL SIGNAL: All topics >90% — no gradient, everything fabricated")
        else:
            print(" >>> PASS: range of fabrication rates — gradient exists across topics")

    # ================================================================
    # HEADING STABILITY (secondary signal)
    # ================================================================
    print()
    print("-" * 70)
    print("SECONDARY: HEADING STABILITY ACROSS VERSIONS")
    print("-" * 70)

    heading_jaccards = []
    for topic_id, condition in sorted(triple_groups):
        # Look up by the group's real condition; a literal "_STANDARD"
        # suffix raises KeyError for any other 3+-version condition.
        r = all_results[f"{topic_id}_{condition}"]
        heading_jaccards.append(r['heading_jaccard'])
        print(f" {topic_id}: Jaccard={r['heading_jaccard']:.3f} "
              f"({r['n_shared_headings']}/{r['n_total_headings']} shared)")

    if heading_jaccards:
        mean_jaccard = statistics.mean(heading_jaccards)
        print(f"\n Mean heading Jaccard: {mean_jaccard:.3f}")
        print(f" Interpretation: {'HIGH' if mean_jaccard > 0.5 else 'LOW'} "
              f"structural stability across generations")

    # ================================================================
    # CAUSAL CLAIM STABILITY (secondary signal)
    # ================================================================
    print()
    print("-" * 70)
    print("SECONDARY: CAUSAL CLAIM COUNT STABILITY")
    print("-" * 70)

    causal_cvs = [r['causal_count_cv'] for r in std_results]
    numerical_cvs = [r['numerical_count_cv'] for r in std_results]
    # Both lists have one entry per 3-version group; with no such groups
    # statistics.mean would raise StatisticsError.
    if std_results:
        print(f" Mean CV of causal claim count: {statistics.mean(causal_cvs):.3f}")
        print(f" Mean CV of numerical claim count: {statistics.mean(numerical_cvs):.3f}")
    print(" Interpretation: CV<0.15 = stable counts, CV>0.30 = unstable")

    # ================================================================
    # SAVE RESULTS
    # ================================================================
    output = {
        'experiment': 'EXP-078_temporal_consistency_kill_test',
        'method': 'Cross-generation numerical claim variance',
        'data_source': 'fpr_estimation_docs.json',
        'n_topics_3ver': len(triple_groups),
        'n_topics_2ver': len(double_groups),
        'generator': 'gemini-3.1-flash-lite-preview',
        'standard_results': all_results,
        'basic_results': basic_results,
        'aggregate': {
            'standard_pct_fab_rate_mean': _rounded_mean(std_pct_rates, 4),
            'standard_pct_fab_rate_median': (
                round(statistics.median(std_pct_rates), 4)
                if std_pct_rates else None
            ),
            'standard_all_fab_rate_mean': _rounded_mean(std_all_rates, 4),
            'basic_pct_fab_rate_mean': _rounded_mean(bas_pct_rates, 4),
            'mean_heading_jaccard': _rounded_mean(std_heading_j, 4),
            'mean_pct_per_topic': _rounded_mean(std_pct_totals, 1),
            'mean_all_nums_per_topic': _rounded_mean(std_all_totals, 1),
        },
        'kill_signals': {
            'ks1_signal_density': (
                low_signal / n_topics if std_pct_totals else None
            ),
            'ks2_condition_diff': (
                round(condition_diff, 4) if condition_diff is not None else None
            ),
            'ks3_mean_fab_rate': _rounded_mean(std_pct_rates, 4),
            'ks4_fab_rate_range': (
                [round(min(std_pct_rates), 4), round(max(std_pct_rates), 4)]
                if std_pct_rates else None
            ),
        },
    }

    results_path = DATA_DIR / 'exp078_temporal_consistency_results.json'
    with open(results_path, 'w', encoding='utf-8') as f:
        # default=str is a fallback for values json cannot serialize natively.
        json.dump(output, f, indent=2, default=str)
    print(f"\nResults saved to {results_path.name}")


if __name__ == '__main__':
    main()