"""
EXP-078: Temporal Consistency Kill-Test (Evaluation Stack Layer 2)

Question: Do numerical claims vary across independent generations
of the same prompt?

Data: fpr_estimation_docs.json
  20 topics × 3 STANDARD versions (same prompt, independent runs, gemini-3.1-flash-lite-preview)
  5 topics × 2 BASIC versions (for condition comparison)

Method:
  Extract all numerical claims (percentages, dollar amounts, counts)
  from each version. Measure cross-version stability: which specific
  numbers appear in all 3, only 2, or only 1 version.

  Variable numbers = fabricated specificity candidates
  Stable numbers = knowledge retrieval candidates

Kill signals:
  KS1: <3 numerical claims per document → insufficient signal density
       (checked as: >50% of 3-version topics have <3 unique percentages)
  KS2: No discrimination between STANDARD and BASIC → doesn't separate conditions
  KS3: All claims stable (<10% fabrication) → no fabrication to detect
  KS4: All claims variable (>90% fabrication) → no gradient to exploit
"""

import json
import statistics
from pathlib import Path
from collections import defaultdict

from claim_extraction import extract_all, compare_numerical_across_versions

# Directory containing this script; main() reads the input JSON from here
# and writes the results JSON next to it.
DATA_DIR = Path(__file__).parent


def _fmt_rate(rate):
    """Format a fractional rate as a whole-number percentage, or "N/A" for None."""
    return f"{rate:.0%}" if rate is not None else "N/A"


def main():
    """Run the EXP-078 temporal-consistency kill-test and save a JSON report.

    Steps:
      1. Load ``fpr_estimation_docs.json`` from ``DATA_DIR`` and group the
         documents by ``(topic_id, condition)``.
      2. For every group with >=3 documents (reported as STANDARD) and every
         group with exactly 2 documents (reported as BASIC), run
         ``extract_all`` on each document's text and feed the per-version
         extractions to ``compare_numerical_across_versions``.
      3. Print per-topic results, aggregate statistics, the four kill-signal
         assessments and two secondary stability signals.
      4. Write everything to ``exp078_temporal_consistency_results.json`` in
         ``DATA_DIR``.

    Fabrication "rates" are fractions in [0, 1] (they are compared against
    0.10 / 0.90 and rendered with ``%`` format specs), despite the ``_pct``
    suffix on some keys.

    Returns:
        None. All results are printed and written to disk.
    """
    data_file = 'fpr_estimation_docs.json'
    generator = 'gemini-3.1-flash-lite-preview'

    # === Load data ===
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(DATA_DIR / data_file, encoding='utf-8') as f:
        docs = json.load(f)

    # Group by (topic_id, condition)
    groups = defaultdict(list)
    for doc in docs:
        groups[(doc['topic_id'], doc['condition'])].append(doc)

    # Groups are split purely by how many versions they contain; the
    # STANDARD / BASIC labels below follow the dataset layout described in
    # the module docstring (3 STANDARD runs, 2 BASIC runs per topic).
    triple_groups = {
        k: sorted(v, key=lambda d: d.get('sample_idx', 0))
        for k, v in groups.items() if len(v) >= 3
    }
    double_groups = {
        k: sorted(v, key=lambda d: d.get('sample_idx', 0))
        for k, v in groups.items() if len(v) == 2
    }

    print("=" * 70)
    print("EXP-078: TEMPORAL CONSISTENCY KILL-TEST")
    print("=" * 70)
    print(f"Data: {data_file} ({len(docs)} docs)")
    print(f"Generator: {generator}")
    print(f"3-version groups: {len(triple_groups)} (STANDARD)")
    print(f"2-version groups: {len(double_groups)} (BASIC)")
    print()

    # === Extract and compare each topic ===
    # Keyed by "<topic_id>_<condition>"; this key format is part of the
    # saved JSON schema, so it is kept as a string.
    all_results = {}

    print("-" * 70)
    print("PER-TOPIC RESULTS (3 versions, STANDARD)")
    print("-" * 70)

    for (topic_id, condition), doc_group in sorted(triple_groups.items()):
        version_data = [extract_all(doc['text']) for doc in doc_group]
        comparison = compare_numerical_across_versions(version_data)
        # A group may hold more than 3 versions (selection is len >= 3), so
        # report the real count instead of a hard-coded 3.
        n_versions = len(doc_group)

        key = f"{topic_id}_{condition}"
        all_results[key] = comparison

        fab_str = _fmt_rate(comparison['fabrication_rate_pct'])
        all_fab_str = _fmt_rate(comparison['all_numbers_fab_rate'])

        print(f"\n{topic_id}:")
        print(f"  Numerical claims/version: {comparison['numerical_counts']}")
        print(f"  Causal claims/version:    {comparison['causal_counts']}")
        print(f"  Heading Jaccard:          {comparison['heading_jaccard']}")
        print(f"  Percentages — total: {comparison['n_pct_total']}, "
              f"stable: {comparison['n_pct_stable']}, "
              f"variable: {comparison['n_pct_variable']} "
              f"(fab rate: {fab_str})")
        print(f"  All numbers — total: {comparison['all_numbers_total']}, "
              f"stable: {comparison['all_numbers_stable']}, "
              f"variable: {comparison['all_numbers_variable']} "
              f"(fab rate: {all_fab_str})")

        # Show examples of variable claims (at most 3, two contexts each)
        if comparison['examples']['variable']:
            print(f"  Examples of VARIABLE percentages:")
            for ex in comparison['examples']['variable'][:3]:
                versions_in = ex.get('in_versions', [])
                print(f"    {ex['value']} (in {len(versions_in)}/{n_versions} versions):")
                for ctx in ex['contexts'][:2]:
                    snippet = ctx['sentence'][:130]
                    print(f"      v{ctx['version']}: \"{snippet}...\"")

        if comparison['examples']['stable']:
            print(f"  Examples of STABLE percentages:")
            for ex in comparison['examples']['stable'][:2]:
                print(f"    {ex['value']} — consistent across all {n_versions} versions")

    # === BASIC comparison ===
    basic_results = {}

    if double_groups:
        print()
        print("-" * 70)
        print("BASIC CONDITION (2 versions)")
        print("-" * 70)

        for (topic_id, condition), doc_group in sorted(double_groups.items()):
            version_data = [extract_all(doc['text']) for doc in doc_group]
            comparison = compare_numerical_across_versions(version_data)

            key = f"{topic_id}_{condition}"
            basic_results[key] = comparison

            fab_str = _fmt_rate(comparison['fabrication_rate_pct'])

            print(f"\n{topic_id} [BASIC]:")
            print(f"  Numerical claims/version: {comparison['numerical_counts']}")
            print(f"  Percentages — total: {comparison['n_pct_total']}, "
                  f"stable: {comparison['n_pct_stable']}, "
                  f"variable: {comparison['n_pct_variable']} "
                  f"(fab rate: {fab_str})")
            print(f"  All numbers — total: {comparison['all_numbers_total']}, "
                  f"stable: {comparison['all_numbers_stable']}, "
                  f"variable: {comparison['all_numbers_variable']}")

    # ================================================================
    # AGGREGATE ANALYSIS
    # ================================================================
    print()
    print("=" * 70)
    print("AGGREGATE ANALYSIS")
    print("=" * 70)

    # --- Percentage fabrication rates ---
    # Topics whose rate is None (nothing to compare) are left out of the
    # rate statistics but still counted in the totals below.
    std_pct_rates = [r['fabrication_rate_pct'] for r in all_results.values()
                     if r['fabrication_rate_pct'] is not None]
    std_all_rates = [r['all_numbers_fab_rate'] for r in all_results.values()
                     if r['all_numbers_fab_rate'] is not None]
    std_pct_totals = [r['n_pct_total'] for r in all_results.values()]
    std_all_totals = [r['all_numbers_total'] for r in all_results.values()]
    std_num_counts = [
        statistics.mean(r['numerical_counts']) for r in all_results.values()
    ]
    std_causal_counts = [
        statistics.mean(r['causal_counts']) for r in all_results.values()
    ]
    std_heading_j = [r['heading_jaccard'] for r in all_results.values()]

    if std_pct_rates:
        print(f"\nSTANDARD (n={len(std_pct_rates)} topics):")
        print(f"  Percentage fabrication rate:")
        print(f"    Mean:   {statistics.mean(std_pct_rates):.1%}")
        print(f"    Median: {statistics.median(std_pct_rates):.1%}")
        print(f"    Range:  {min(std_pct_rates):.1%} — {max(std_pct_rates):.1%}")
        # stdev needs at least two data points; skip the line otherwise
        # rather than printing an empty one.
        if len(std_pct_rates) > 1:
            print(f"    Stdev:  {statistics.stdev(std_pct_rates):.1%}")
        # Guarded separately (as in the BASIC section): mean() raises
        # StatisticsError on an empty list.
        if std_all_rates:
            print(f"  All-number fabrication rate:")
            print(f"    Mean:   {statistics.mean(std_all_rates):.1%}")
            print(f"    Median: {statistics.median(std_all_rates):.1%}")
            print(f"    Range:  {min(std_all_rates):.1%} — {max(std_all_rates):.1%}")
        print(f"  Signal density:")
        print(f"    Mean percentages per topic: {statistics.mean(std_pct_totals):.1f}")
        print(f"    Mean all numbers per topic: {statistics.mean(std_all_totals):.1f}")
        print(f"    Mean numerical claims/doc:  {statistics.mean(std_num_counts):.1f}")
        print(f"    Mean causal claims/doc:     {statistics.mean(std_causal_counts):.1f}")
        print(f"  Structure stability:")
        print(f"    Mean heading Jaccard: {statistics.mean(std_heading_j):.3f}")

    # --- BASIC rates ---
    bas_pct_rates = [r['fabrication_rate_pct'] for r in basic_results.values()
                     if r['fabrication_rate_pct'] is not None]
    bas_all_rates = [r['all_numbers_fab_rate'] for r in basic_results.values()
                     if r['all_numbers_fab_rate'] is not None]

    if bas_pct_rates:
        print(f"\nBASIC (n={len(bas_pct_rates)} topics):")
        print(f"  Percentage fabrication rate:")
        print(f"    Mean:   {statistics.mean(bas_pct_rates):.1%}")
        print(f"    Range:  {min(bas_pct_rates):.1%} — {max(bas_pct_rates):.1%}")
        if bas_all_rates:
            print(f"  All-number fabrication rate:")
            print(f"    Mean:   {statistics.mean(bas_all_rates):.1%}")

    # --- STANDARD vs BASIC on matched topics ---
    print()
    print("-" * 70)
    print("STANDARD vs BASIC (matched topics)")
    print("-" * 70)

    # A topic is matched only when BOTH its STANDARD and its BASIC result
    # exist; a 2-version group with some other condition must not be looked
    # up under "<topic>_BASIC".
    matched_topics = set()
    for key in basic_results:
        topic = key.rsplit('_', 1)[0]
        if (f"{topic}_STANDARD" in all_results
                and f"{topic}_BASIC" in basic_results):
            matched_topics.add(topic)

    matched_std_rates = []
    matched_bas_rates = []

    for topic in sorted(matched_topics):
        std_r = all_results[f"{topic}_STANDARD"]
        bas_r = basic_results[f"{topic}_BASIC"]
        std_fab = std_r['fabrication_rate_pct']
        bas_fab = bas_r['fabrication_rate_pct']

        print(f"  {topic}:")
        if std_fab is not None:
            print(f"    STANDARD: fab={std_fab:.0%} "
                  f"({std_r['n_pct_stable']}S/{std_r['n_pct_variable']}V "
                  f"of {std_r['n_pct_total']} pcts)")
        else:
            print(f"    STANDARD: no percentages found")
        if bas_fab is not None:
            print(f"    BASIC:    fab={bas_fab:.0%} "
                  f"({bas_r['n_pct_stable']}S/{bas_r['n_pct_variable']}V "
                  f"of {bas_r['n_pct_total']} pcts)")
        else:
            print(f"    BASIC:    no percentages found")

        if std_fab is not None:
            matched_std_rates.append(std_fab)
        if bas_fab is not None:
            matched_bas_rates.append(bas_fab)

    if matched_std_rates and matched_bas_rates:
        std_mean = statistics.mean(matched_std_rates)
        bas_mean = statistics.mean(matched_bas_rates)
        diff = std_mean - bas_mean
        print(f"\n  Matched comparison:")
        print(f"    STANDARD mean fab rate: {std_mean:.1%}")
        print(f"    BASIC mean fab rate:    {bas_mean:.1%}")
        print(f"    Difference:             {diff:+.1%}")

    # ================================================================
    # KILL SIGNAL ASSESSMENT
    # ================================================================
    print()
    print("=" * 70)
    print("KILL SIGNAL ASSESSMENT")
    print("=" * 70)

    # KS1: Signal density — kill if more than half of the topics have fewer
    # than 3 unique percentages.
    if std_pct_totals:
        low_signal = sum(1 for t in std_pct_totals if t < 3)
        print(f"\nKS1 — Signal density:")
        print(f"  Topics with <3 unique percentages: {low_signal}/{len(std_pct_totals)}")
        print(f"  Mean unique percentages per topic: {statistics.mean(std_pct_totals):.1f}")
        print(f"  Mean all-type numbers per topic:   {statistics.mean(std_all_totals):.1f}")
        if low_signal > len(std_pct_totals) * 0.5:
            print("  >>> KILL SIGNAL: >50% of topics have insufficient percentage claims")
            print("  >>> BUT: check all-number totals — broader extraction may save it")
        else:
            print(f"  >>> PASS: {len(std_pct_totals) - low_signal}/{len(std_pct_totals)} "
                  f"topics have sufficient data")

    # KS2: Discrimination — warn when the matched-topic mean rates differ by
    # less than 5 percentage points.
    if matched_std_rates and matched_bas_rates:
        diff = statistics.mean(matched_std_rates) - statistics.mean(matched_bas_rates)
        print(f"\nKS2 — Condition discrimination:")
        print(f"  STANDARD-BASIC fab rate difference: {diff:+.1%}")
        if abs(diff) < 0.05:
            print("  >>> WARNING: <5% difference — may not discriminate conditions")
        else:
            direction = "more" if diff > 0 else "less"
            print(f"  >>> STANDARD is {direction} fabricated than BASIC by {abs(diff):.1%}")

    # KS3: All stable — kill below 10% mean fabrication, weak below 20%.
    if std_pct_rates:
        mean_fab = statistics.mean(std_pct_rates)
        print(f"\nKS3 — Fabrication detectable?")
        print(f"  Mean fabrication rate: {mean_fab:.1%}")
        if mean_fab < 0.10:
            print("  >>> KILL SIGNAL: <10% fabrication — insufficient variance to exploit")
        elif mean_fab < 0.20:
            print("  >>> WEAK: 10-20% fabrication — detectable but thin signal")
        else:
            print(f"  >>> PASS: {mean_fab:.0%} fabrication rate — clear signal")

    # KS4: All variable — kill only if even the lowest topic exceeds 90%.
    if std_pct_rates:
        print(f"\nKS4 — Gradient exists?")
        print(f"  Range: {min(std_pct_rates):.0%} — {max(std_pct_rates):.0%}")
        if min(std_pct_rates) > 0.90:
            print("  >>> KILL SIGNAL: All topics >90% — no gradient, everything fabricated")
        else:
            print("  >>> PASS: range of fabrication rates — gradient exists across topics")

    # ================================================================
    # HEADING STABILITY (secondary signal)
    # ================================================================
    print()
    print("-" * 70)
    print("SECONDARY: HEADING STABILITY ACROSS VERSIONS")
    print("-" * 70)

    heading_jaccards = []
    # Look results up under the same "<topic_id>_<condition>" key they were
    # stored with, rather than assuming the condition is STANDARD.
    for topic_id, condition in sorted(triple_groups):
        r = all_results[f"{topic_id}_{condition}"]
        heading_jaccards.append(r['heading_jaccard'])
        print(f"  {topic_id}: Jaccard={r['heading_jaccard']:.3f} "
              f"({r['n_shared_headings']}/{r['n_total_headings']} shared)")

    if heading_jaccards:
        mean_heading_j = statistics.mean(heading_jaccards)
        print(f"\n  Mean heading Jaccard: {mean_heading_j:.3f}")
        print(f"  Interpretation: {'HIGH' if mean_heading_j > 0.5 else 'LOW'} "
              f"structural stability across generations")

    # ================================================================
    # CAUSAL CLAIM STABILITY (secondary signal)
    # ================================================================
    print()
    print("-" * 70)
    print("SECONDARY: CAUSAL CLAIM COUNT STABILITY")
    print("-" * 70)

    # NOTE(review): None CVs are skipped on the assumption that the helper
    # can report None here as it does for fabrication rates — confirm against
    # claim_extraction. The filter is a no-op if it never does.
    causal_cvs = [r['causal_count_cv'] for r in all_results.values()
                  if r['causal_count_cv'] is not None]
    numerical_cvs = [r['numerical_count_cv'] for r in all_results.values()
                     if r['numerical_count_cv'] is not None]

    # mean() raises StatisticsError on empty input (e.g. no 3-version
    # groups at all); report N/A instead of aborting before results are saved.
    causal_cv_str = f"{statistics.mean(causal_cvs):.3f}" if causal_cvs else "N/A"
    numerical_cv_str = (
        f"{statistics.mean(numerical_cvs):.3f}" if numerical_cvs else "N/A"
    )

    print(f"  Mean CV of causal claim count:    {causal_cv_str}")
    print(f"  Mean CV of numerical claim count: {numerical_cv_str}")
    print(f"  Interpretation: CV<0.15 = stable counts, CV>0.30 = unstable")

    # ================================================================
    # SAVE RESULTS
    # ================================================================
    output = {
        'experiment': 'EXP-078_temporal_consistency_kill_test',
        'method': 'Cross-generation numerical claim variance',
        'data_source': data_file,
        'n_topics_3ver': len(triple_groups),
        'n_topics_2ver': len(double_groups),
        'generator': generator,
        'standard_results': all_results,
        'basic_results': basic_results,
        'aggregate': {
            'standard_pct_fab_rate_mean': (
                round(statistics.mean(std_pct_rates), 4) if std_pct_rates else None
            ),
            'standard_pct_fab_rate_median': (
                round(statistics.median(std_pct_rates), 4) if std_pct_rates else None
            ),
            'standard_all_fab_rate_mean': (
                round(statistics.mean(std_all_rates), 4) if std_all_rates else None
            ),
            'basic_pct_fab_rate_mean': (
                round(statistics.mean(bas_pct_rates), 4) if bas_pct_rates else None
            ),
            'mean_heading_jaccard': (
                round(statistics.mean(std_heading_j), 4) if std_heading_j else None
            ),
            'mean_pct_per_topic': (
                round(statistics.mean(std_pct_totals), 1) if std_pct_totals else None
            ),
            'mean_all_nums_per_topic': (
                round(statistics.mean(std_all_totals), 1) if std_all_totals else None
            ),
        },
        'kill_signals': {
            'ks1_signal_density': (
                sum(1 for t in std_pct_totals if t < 3) / len(std_pct_totals)
                if std_pct_totals else None
            ),
            'ks2_condition_diff': (
                round(statistics.mean(matched_std_rates) - statistics.mean(matched_bas_rates), 4)
                if matched_std_rates and matched_bas_rates else None
            ),
            'ks3_mean_fab_rate': (
                round(statistics.mean(std_pct_rates), 4) if std_pct_rates else None
            ),
            'ks4_fab_rate_range': (
                [round(min(std_pct_rates), 4), round(max(std_pct_rates), 4)]
                if std_pct_rates else None
            ),
        },
    }

    results_path = DATA_DIR / 'exp078_temporal_consistency_results.json'
    with open(results_path, 'w', encoding='utf-8') as f:
        # default=str stringifies any value json cannot encode natively
        # instead of raising; kept so the saved schema stays as it was.
        json.dump(output, f, indent=2, default=str)

    print(f"\nResults saved to {results_path.name}")


# Run the kill-test only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()