#!/usr/bin/env python3
"""
EXP-078b: Cross-Generator Temporal Consistency Replication

Replicates the Layer 2 temporal consistency test across two additional
generator families to test whether the 100% fabrication finding is universal.

Original finding (gemini-3.1-flash-lite-preview, 20 topics):
  100% fabrication rate on model-generated percentages
  0.000 heading Jaccard across versions

This replication adds:
  - xAI (grok-4-1-fast) — vault's current default generator
  - Gemini 3 Flash (gemini-3-flash-preview) — stronger Gemini model (within-family test)

Design:
  10 topics × 3 STANDARD versions × 2 generators = 60 documents
  5 topics × 2 BASIC versions × 2 generators = 20 documents
  Total: 80 generations

  Topics: matched subset from cross_gen_docs.json (overlap with original 20)

Same prompts as fpr_estimation.py. Same analysis pipeline (claim_extraction.py).
No LLM evaluation at any point — pure regex extraction + set comparison.

Kill signals (pre-registered):
  KS1: <3 numerical claims per document → insufficient signal density
  KS2: No discrimination between STANDARD and BASIC → doesn't separate conditions
  KS3: All claims stable (<10% fabrication) → no fabrication to detect
  KS4: All claims variable (>90% fabrication) → no gradient to exploit
  KS5: Generator fabrication rates within 5% of each other → no gradient across generators
"""

import json
import os
import statistics
import sys
import time
from collections import defaultdict
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))
from _config import (
    get_xai_client, get_gemini_client,
    GENERATOR_MODEL,
)

from claim_extraction import extract_all, compare_numerical_across_versions

# All experiment artifacts are read from / written to this script's directory.
DATA_DIR = Path(__file__).parent
# Cache of generated documents; loaded on start-up so a run can resume.
DOCS_CACHE = DATA_DIR / "exp078b_cross_gen_temporal_docs.json"
# Where this replication's analysis output is written.
RESULTS_PATH = DATA_DIR / "exp078b_cross_gen_temporal_results.json"
# Results file of the original EXP-078 run; read for comparison when present.
ORIGINAL_RESULTS = DATA_DIR / "exp078_temporal_consistency_results.json"

# Number of independent generations ("versions") per topic, per condition.
N_SAMPLES_STANDARD = 3
N_SAMPLES_BASIC = 2

# 10 topics matching cross_gen_docs.json for direct comparison.
# Each entry has:
#   "id"     — key used in cache entries and result names (f"{id}_{condition}")
#   "domain" — interpolated into the STANDARD prompt
#   "task"   — the topic statement interpolated into both prompts
# Order matters: the first five entries (TOPICS[:5]) are the ones reused for
# the BASIC condition.
TOPICS = [
    {"id": "business_strategy", "domain": "business strategy",
     "task": "Analyze why mid-stage B2B SaaS companies (Series B, $10-30M ARR) "
             "consistently fail to transition from founder-led sales to a scalable "
             "sales organization, and what the structural causes are."},
    {"id": "technical_architecture", "domain": "technical architecture",
     "task": "Analyze why microservice architectures degrade into distributed "
             "monoliths within 18-24 months at companies with fewer than 50 engineers, "
             "and what architectural decisions cause this."},
    {"id": "research_analysis", "domain": "research analysis",
     "task": "Analyze why randomized controlled trials in educational technology "
             "consistently show smaller effect sizes than quasi-experimental studies, "
             "and what this reveals about the mechanisms of ed-tech interventions."},
    {"id": "platform_migration", "domain": "engineering management",
     "task": "Why platform migrations consistently take 3x longer than estimated and "
             "what structural factors make them resistant to better planning"},
    {"id": "pricing_psychology", "domain": "product strategy",
     "task": "How B2B SaaS companies systematically underprice their products and the "
             "cognitive biases that prevent correction"},
    {"id": "ml_deployment", "domain": "machine learning engineering",
     "task": "Why most ML models that work in notebooks fail in production and the "
             "specific engineering gaps responsible"},
    {"id": "remote_culture", "domain": "organizational design",
     "task": "How remote-first companies develop distinct failure modes in decision-making "
             "that co-located teams don't experience"},
    {"id": "api_design", "domain": "software architecture",
     "task": "Why backward-compatible API evolution is structurally harder than breaking "
             "changes and the design principles that make it tractable"},
    {"id": "data_governance", "domain": "data engineering",
     "task": "Why data quality initiatives fail even with executive support and what "
             "makes the problem structurally different from code quality"},
    {"id": "supply_chain_resilience", "domain": "operations management",
     "task": "Why lean supply chains that optimized for efficiency pre-2020 proved "
             "catastrophically fragile and what structural trade-offs between efficiency "
             "and resilience are genuinely unavoidable"},
]


# ================================================================
# PROMPTS — identical to fpr_estimation.py
# ================================================================

def build_standard_prompt(topic):
    """Return the STANDARD-condition prompt for *topic*.

    The prompt asks for a ~1000-word analytical document on ``topic['task']``
    and adds three constraints (SHAPED, COMMITTED, DENSE); the SHAPED
    constraint mentions ``topic['domain']``.

    Args:
        topic: mapping with at least the keys ``"task"`` and ``"domain"``.

    Returns:
        The prompt as a single string.
    """
    task = topic['task']
    domain = topic['domain']

    # One bullet per requirement; they are joined with newlines below.
    shaped = (
        "- SHAPED: Your document structure must be shaped by THIS specific analysis. "
        "FAIL: your headings survive topic swap. If someone could paste these headings "
        f"onto a different {domain} document and they'd still fit, you wrote "
        "generic structure, not shaped analysis. Each heading should name the specific "
        "mechanism, finding, or claim it covers."
    )
    committed = (
        "- COMMITTED: Take a clear position. Don't hedge with 'it depends' or 'there are "
        "many factors.' Name the specific causes and commit to your diagnosis."
    )
    dense = (
        "- DENSE: Every section must earn its place. If removing a paragraph doesn't change "
        "the argument, that paragraph shouldn't exist."
    )
    requirements = "\n".join([shaped, committed, dense])

    intro = "Write an analytical document (~1000 words) on the following topic:"
    closing = "Write the analysis now. Use markdown with ## headings."
    return f"{intro}\n\n{task}\n\nRequirements:\n{requirements}\n\n{closing}"


def build_basic_prompt(topic):
    """Return the BASIC-condition prompt for *topic*.

    This is the unconstrained counterpart of the STANDARD prompt: it only
    states the task (``topic['task']``), the target length, and the markdown
    heading format.

    Args:
        topic: mapping with at least the key ``"task"``.

    Returns:
        The prompt as a single string.
    """
    header = "Write an analysis of the following topic (~1000 words):"
    footer = "Write the analysis now. Use markdown with ## headings."
    return f"{header}\n\n{topic['task']}\n\n{footer}"


# ================================================================
# GENERATION
# ================================================================

# Model id used for the Gemini generator in this replication.
GEMINI_GEN_MODEL = "gemini-3-flash-preview"


def generate_with_xai(client, model, prompt, max_retries=5):
    """Generate using xAI (OpenAI-compatible API).

    Sends *prompt* as a single user message via
    ``client.chat.completions.create`` (temperature 1.0, max 4096 tokens).
    Any exception, or a response of 100 characters or fewer after stripping,
    counts as a failed attempt and is retried.

    Args:
        client: object exposing ``chat.completions.create(...)``.
        model: model id passed through to the API.
        prompt: user prompt text.
        max_retries: maximum number of attempts.

    Returns:
        The stripped response text, or ``None`` if every attempt failed.
    """
    import re  # local import: ``re`` is not imported at file level

    # Match "rate" only as a standalone word (optionally "rate limit…",
    # "rate_limit…", "ratelimit…").  A plain substring test also fires on
    # words such as "generate" or "moderate" and would misclassify ordinary
    # errors as rate limiting, applying the much longer back-off.
    rate_limit_re = re.compile(r"(?<![a-z])rate(?:[\s_-]*limit\w*)?(?![a-z])")

    messages = [{"role": "user", "content": prompt}]

    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=4096,
                temperature=1.0,
            )
            content = (resp.choices[0].message.content or "").strip()
            if len(content) > 100:
                return content
            # Routed through the except-branch so short answers are retried.
            raise ValueError("Empty or too short response")
        except Exception as e:
            err_str = str(e)
            is_rate = ("429" in err_str
                       or rate_limit_re.search(err_str.lower()) is not None)
            # Linear back-off; rate limits get a longer base wait.
            wait = (60 if is_rate else 10) * (attempt + 1)
            print(f"    attempt {attempt+1}/{max_retries} failed: "
                  f"{'RATE LIMIT' if is_rate else 'ERROR'} — {err_str[:80]}")
            if attempt < max_retries - 1:
                print(f"    waiting {wait}s...")
                time.sleep(wait)

    print(f"    FAILED after {max_retries} attempts")
    return None


def generate_with_gemini(client, model, prompt, max_retries=5):
    """Generate using Gemini API.

    Calls ``client.models.generate_content`` with temperature 1.0 and a
    4096-token output cap.  Any exception, or a response of 100 characters
    or fewer after stripping, counts as a failed attempt and is retried.

    Args:
        client: object exposing ``models.generate_content(...)``.
        model: model id passed through to the API.
        prompt: prompt text, sent as ``contents``.
        max_retries: maximum number of attempts.

    Returns:
        The stripped response text, or ``None`` if every attempt failed.
    """
    from google.genai import types
    import re  # local import: ``re`` is not imported at file level

    # Match "rate" only as a standalone word (optionally "rate limit…",
    # "rate_limit…", "ratelimit…").  A plain substring test also fires on
    # any message that merely mentions "generate", which would misclassify
    # ordinary errors as rate limiting and apply the longer back-off.
    rate_limit_re = re.compile(r"(?<![a-z])rate(?:[\s_-]*limit\w*)?(?![a-z])")

    for attempt in range(max_retries):
        try:
            resp = client.models.generate_content(
                model=model,
                contents=prompt,
                config=types.GenerateContentConfig(
                    temperature=1.0,
                    max_output_tokens=4096,
                ),
            )
            text = (resp.text or "").strip()
            if len(text) > 100:
                return text
            # Routed through the except-branch so short answers are retried.
            raise ValueError("Empty or too short response")
        except Exception as e:
            err_str = str(e)
            is_rate = ("429" in err_str
                       or rate_limit_re.search(err_str.lower()) is not None)
            # Linear back-off; rate limits get a longer base wait.
            wait = (30 if is_rate else 10) * (attempt + 1)
            print(f"    attempt {attempt+1}/{max_retries} failed: "
                  f"{'RATE LIMIT' if is_rate else 'ERROR'} — {err_str[:80]}")
            if attempt < max_retries - 1:
                print(f"    waiting {wait}s...")
                time.sleep(wait)

    print(f"    FAILED after {max_retries} attempts")
    return None


def _save_docs(documents):
    """Save documents incrementally.

    Writes *documents* as indented JSON to a temporary file next to
    ``DOCS_CACHE`` and then moves it over the cache with ``os.replace``.
    The cache is what a later run resumes from, so an interruption in the
    middle of a write must not leave a truncated JSON file behind.

    Args:
        documents: JSON-serializable list of document dicts.
    """
    tmp_path = DOCS_CACHE.with_name(DOCS_CACHE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(documents, f, indent=2)
    os.replace(tmp_path, DOCS_CACHE)


def generate_all_documents():
    """Generate documents for all generators. Resumes from cache if available.

    For each generator, every STANDARD item (all ``TOPICS`` ×
    ``N_SAMPLES_STANDARD``) and every BASIC item (first five topics ×
    ``N_SAMPLES_BASIC``) is generated unless a document with the same
    (generator, condition, topic_id, sample_idx) key is already cached.
    The cache is written after every successful generation so that an
    interrupted run loses at most the document in flight.

    Returns:
        The list of all document dicts (cached plus newly generated).
        Items whose generation failed are simply absent.
    """
    documents = []
    if DOCS_CACHE.exists():
        with open(DOCS_CACHE, encoding="utf-8") as f:
            documents = json.load(f)
        print(f"Loaded {len(documents)} cached documents")

    # Keys of documents that already exist, so they are not regenerated.
    existing = {
        (d["generator"], d["condition"], d["topic_id"], d.get("sample_idx", 0))
        for d in documents
    }

    # Build clients once
    xai_client = get_xai_client()
    gemini_client = get_gemini_client()

    generators = [
        {
            "label": "xai",
            "model": GENERATOR_MODEL,  # grok-4-1-fast
            # Client/model are bound as defaults so each lambda keeps its own.
            "gen_fn": lambda prompt, c=xai_client, m=GENERATOR_MODEL: (
                generate_with_xai(c, m, prompt)),
            "delay": 3,
        },
        {
            "label": "gemini3f",
            "model": GEMINI_GEN_MODEL,  # gemini-3-flash-preview
            "gen_fn": lambda prompt, c=gemini_client, m=GEMINI_GEN_MODEL: (
                generate_with_gemini(c, m, prompt)),
            "delay": 1,
        },
    ]

    # (condition, prompt builder, topics, versions per topic), in run order.
    conditions = [
        ("STANDARD", build_standard_prompt, TOPICS, N_SAMPLES_STANDARD),
        ("BASIC", build_basic_prompt, TOPICS[:5], N_SAMPLES_BASIC),
    ]

    total_expected = len(generators) * sum(
        len(topics) * n_samples for _, _, topics, n_samples in conditions)
    # Position in the overall plan.  Starts at 0 because cached items are
    # counted as the loop reaches them; seeding it with the cache size as
    # well would count them twice and overshoot total_expected.
    gen_count = 0

    for gen_info in generators:
        label = gen_info["label"]
        model = gen_info["model"]
        gen_fn = gen_info["gen_fn"]
        delay = gen_info["delay"]

        print(f"\n{'='*60}")
        print(f"GENERATOR: {label} ({model})")
        print(f"{'='*60}")

        for condition, build_prompt, topics, n_samples in conditions:
            for topic in topics:
                for sample_idx in range(n_samples):
                    key = (label, condition, topic["id"], sample_idx)
                    gen_count += 1
                    progress = (f"  [{gen_count}/{total_expected}] "
                                f"{condition} {topic['id']} #{sample_idx+1}...")

                    if key in existing:
                        print(f"{progress} CACHED")
                        continue

                    print(progress, end=" ", flush=True)
                    text = gen_fn(build_prompt(topic))

                    if text:
                        wc = len(text.split())
                        hc = sum(1 for line in text.split('\n')
                                 if line.strip().startswith('##'))
                        print(f"OK ({wc}w, {hc}h)")
                        documents.append({
                            "topic_id": topic["id"],
                            "domain": topic["domain"],
                            "task": topic["task"],
                            "condition": condition,
                            "generator": label,
                            "generator_model": model,
                            "sample_idx": sample_idx,
                            "text": text,
                            "word_count": wc,
                            "heading_count": hc,
                        })
                        existing.add(key)
                        # Persist immediately: each document costs an API
                        # call (plus possible long back-off waits).
                        _save_docs(documents)
                    else:
                        print("FAILED")

                    # Pause between API calls, after failures too.
                    time.sleep(delay)

        # Also save once per generator so the cache file exists even when
        # nothing new was produced.
        _save_docs(documents)
        print(f"  Saved {len(documents)} documents (incremental)")

    print(f"\nTotal generated: {len(documents)}/{total_expected} documents")
    return documents


# ================================================================
# ANALYSIS — same pipeline as exp078_temporal_consistency.py
# ================================================================

def analyze_generator(docs, generator_label):
    """Run temporal consistency analysis for one generator's documents.

    Documents of *generator_label* are grouped by (topic_id, condition).
    Each sufficiently complete group is ordered by ``sample_idx``, passed
    through ``extract_all`` per document, and compared with
    ``compare_numerical_across_versions``.

    Groups are routed by their ``condition`` field, not by how many
    documents they contain: a STANDARD topic that lost a version to a failed
    generation must not be reported among the BASIC results.  Groups with
    too few versions (STANDARD < 3, BASIC < 2) are skipped with a warning;
    groups with any other condition are ignored.

    Args:
        docs: list of document dicts (all generators).
        generator_label: value of ``doc["generator"]`` to analyze.

    Returns:
        ``(standard_results, basic_results)`` — two dicts mapping
        ``f"{topic_id}_{condition}"`` to the comparison result.
    """
    # Minimum number of versions a group needs to be compared.
    min_versions = {"STANDARD": 3, "BASIC": 2}

    # Group by (topic_id, condition)
    groups = defaultdict(list)
    for doc in docs:
        if doc["generator"] == generator_label:
            groups[(doc["topic_id"], doc["condition"])].append(doc)

    results = {"STANDARD": {}, "BASIC": {}}
    for (topic_id, condition), doc_group in sorted(groups.items()):
        required = min_versions.get(condition)
        if required is None:
            continue
        if len(doc_group) < required:
            print(f"  WARNING: skipping {generator_label} {topic_id} {condition}: "
                  f"{len(doc_group)} version(s), need {required}")
            continue

        ordered = sorted(doc_group, key=lambda d: d.get("sample_idx", 0))
        version_data = [extract_all(doc["text"]) for doc in ordered]
        comparison = compare_numerical_across_versions(version_data)
        results[condition][f"{topic_id}_{condition}"] = comparison

    return results["STANDARD"], results["BASIC"]


def compute_aggregate(results):
    """Summarize per-topic comparison results into aggregate statistics.

    Args:
        results: mapping of result name to a per-topic dict holding
            ``fabrication_rate_pct``, ``all_numbers_fab_rate``,
            ``heading_jaccard``, ``n_pct_total`` and ``all_numbers_total``.
            The two rate fields may be ``None``; such entries are left out
            of the corresponding statistic.

    Returns:
        Dict of rounded statistics.  A statistic is present only when at
        least one value was available for it, so an empty *results* yields
        an empty dict.
    """
    rows = list(results.values())

    def column(field, skip_none=False):
        # Collect one field across all rows, optionally dropping None.
        if skip_none:
            return [row[field] for row in rows if row[field] is not None]
        return [row[field] for row in rows]

    pct_fab = column("fabrication_rate_pct", skip_none=True)
    all_fab = column("all_numbers_fab_rate", skip_none=True)
    jaccards = column("heading_jaccard")
    pct_counts = column("n_pct_total")
    all_counts = column("all_numbers_total")

    summary = {}
    if pct_fab:
        summary.update({
            "pct_fab_rate_mean": round(statistics.mean(pct_fab), 4),
            "pct_fab_rate_median": round(statistics.median(pct_fab), 4),
            "pct_fab_rate_range": [round(min(pct_fab), 4),
                                   round(max(pct_fab), 4)],
            "n_topics_with_pct": len(pct_fab),
        })
    if all_fab:
        summary["all_fab_rate_mean"] = round(statistics.mean(all_fab), 4)
    if jaccards:
        summary["mean_heading_jaccard"] = round(statistics.mean(jaccards), 3)
    if pct_counts:
        summary["mean_pct_per_topic"] = round(statistics.mean(pct_counts), 1)
    if all_counts:
        summary["mean_all_nums_per_topic"] = round(statistics.mean(all_counts), 1)

    return summary


def print_generator_results(gen_label, std_results, bas_results, agg):
    """Print the per-topic and aggregate results of one generator.

    Args:
        gen_label: generator label shown in the header.
        std_results: STANDARD per-topic results keyed by result name.
        bas_results: BASIC per-topic results keyed by result name; the
            BASIC section is omitted when this is empty.
        agg: aggregate statistics for the STANDARD results; the aggregate
            section is omitted when this is empty.
    """
    def as_percent(rate, placeholder):
        # Whole-percent rendering, or the placeholder for a missing rate.
        return placeholder if rate is None else f"{rate:.0%}"

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"GENERATOR: {gen_label}")
    print(rule)

    print(f"\nSTANDARD (n={len(std_results)} topics):")
    for name in sorted(std_results):
        row = std_results[name]
        pct_text = as_percent(row["fabrication_rate_pct"], "N/A (no pcts)")
        all_text = as_percent(row["all_numbers_fab_rate"], "N/A")
        jaccard = row["heading_jaccard"]
        counts = row["numerical_counts"]
        print(f"  {name}: pct_fab={pct_text}  all_fab={all_text}  "
              f"heading_J={jaccard:.3f}  nums={counts}")

    if agg:
        print("\n  Aggregate:")
        for stat_name, stat_value in agg.items():
            print(f"    {stat_name}: {stat_value}")

    if bas_results:
        print(f"\nBASIC (n={len(bas_results)} topics):")
        for name in sorted(bas_results):
            row = bas_results[name]
            pct_text = as_percent(row["fabrication_rate_pct"], "N/A")
            jaccard = row["heading_jaccard"]
            print(f"  {name}: pct_fab={pct_text}  heading_J={jaccard:.3f}")


def load_original_gemini_results():
    """Load the original run's results for comparison.

    Returns:
        ``None`` (after printing a notice) when ``ORIGINAL_RESULTS`` does
        not exist.  Otherwise a dict with:

        - ``"aggregate"``: the file's ``"aggregate"`` entry (``{}`` if absent)
        - ``"per_topic"``: topic id -> ``fabrication_rate_pct`` for every
          entry of ``"standard_results"`` whose rate is not ``None``; the
          topic id is the result key with ``"_STANDARD"`` removed.
    """
    if not ORIGINAL_RESULTS.exists():
        print("  (original results file not found — skipping comparison)")
        return None

    with open(ORIGINAL_RESULTS) as handle:
        payload = json.load(handle)

    # Keep only topics that actually have a percentage fabrication rate.
    per_topic = {
        result_key.replace("_STANDARD", ""): entry["fabrication_rate_pct"]
        for result_key, entry in payload.get("standard_results", {}).items()
        if entry["fabrication_rate_pct"] is not None
    }

    return {
        "aggregate": payload.get("aggregate", {}),
        "per_topic": per_topic,
    }


def _print_section(title):
    """Print a blank line, then *title* framed by 70-character rules."""
    print()
    print("=" * 70)
    print(title)
    print("=" * 70)


def _analyze_all_generators(docs, gen_labels):
    """Analyze and print each generator's documents; return results by label."""
    all_generator_results = {}
    for gen_label in gen_labels:
        gen_docs = [d for d in docs if d["generator"] == gen_label]
        if not gen_docs:
            print(f"\nWARNING: No documents for {gen_label}")
            continue

        gen_model = gen_docs[0]["generator_model"]
        std_results, bas_results = analyze_generator(docs, gen_label)
        std_agg = compute_aggregate(std_results)

        print_generator_results(gen_label, std_results, bas_results, std_agg)

        all_generator_results[gen_label] = {
            "generator_model": gen_model,
            "n_standard_topics": len(std_results),
            "n_basic_topics": len(bas_results),
            "standard_results": std_results,
            "basic_results": bas_results,
            "aggregate": std_agg,
        }
    return all_generator_results


def _build_comparison(gemini_orig, all_generator_results):
    """Build the per-generator summary rows, original run first if available."""
    generators_compared = {}

    if gemini_orig:
        gemini_agg = gemini_orig["aggregate"]
        generators_compared["gemini (original)"] = {
            "model": "gemini-3.1-flash-lite-preview",
            "pct_fab_mean": gemini_agg.get("standard_pct_fab_rate_mean"),
            "all_fab_mean": gemini_agg.get("standard_all_fab_rate_mean"),
            "heading_jaccard": gemini_agg.get("mean_heading_jaccard"),
            "n_topics": 20,
        }

    for gen_label, gen_data in all_generator_results.items():
        agg = gen_data["aggregate"]
        generators_compared[gen_label] = {
            "model": gen_data["generator_model"],
            "pct_fab_mean": agg.get("pct_fab_rate_mean"),
            "all_fab_mean": agg.get("all_fab_rate_mean"),
            "heading_jaccard": agg.get("mean_heading_jaccard"),
            "n_topics": gen_data["n_standard_topics"],
        }
    return generators_compared


def _print_comparison_table(generators_compared):
    """Print one table row per compared generator."""
    print(f"\n{'Generator':<25} {'Model':<30} {'Pct Fab':<10} "
          f"{'All Fab':<10} {'Head J':<10} {'N':<5}")
    print("-" * 90)
    for gen, data in generators_compared.items():
        pct = f"{data['pct_fab_mean']:.1%}" if data['pct_fab_mean'] is not None else "N/A"
        all_f = f"{data['all_fab_mean']:.1%}" if data['all_fab_mean'] is not None else "N/A"
        hj = f"{data['heading_jaccard']:.3f}" if data['heading_jaccard'] is not None else "N/A"
        print(f"{gen:<25} {data['model']:<30} {pct:<10} {all_f:<10} {hj:<10} {data['n_topics']}")


def _print_per_topic_rates(gemini_orig, all_generator_results):
    """Print STANDARD fabrication rates per topic for the original run, xAI and Gemini 3 Flash."""
    def standard_rate(gen_label, topic_id):
        # None when the generator or the topic has no STANDARD result.
        gen_data = all_generator_results.get(gen_label)
        if gen_data is None:
            return None
        result = gen_data["standard_results"].get(f"{topic_id}_STANDARD", {})
        return result.get("fabrication_rate_pct")

    def as_cell(rate):
        return f"{rate:.0%}" if rate is not None else "—"

    print("\nPer-topic fabrication rates (matched topics, STANDARD):")
    print(f"{'Topic':<25} {'Gemini(orig)':<12} {'xAI':<12} {'Gemini3F':<12}")
    print("-" * 60)

    for topic in TOPICS:
        tid = topic["id"]
        gem_str = as_cell(gemini_orig["per_topic"].get(tid))
        xai_str = as_cell(standard_rate("xai", tid))
        g3f_str = as_cell(standard_rate("gemini3f", tid))
        print(f"{tid:<25} {gem_str:<12} {xai_str:<12} {g3f_str:<12}")


def _print_kill_signals(all_generator_results, generators_compared):
    """Print the KS1/KS3/KS4 assessment per generator and KS5 across generators."""
    for gen_label, gen_data in all_generator_results.items():
        agg = gen_data["aggregate"]
        print(f"\n{gen_label} ({gen_data['generator_model']}):")

        # KS1: Signal density (evaluated on per-topic means)
        mean_pct = agg.get("mean_pct_per_topic", 0)
        mean_all = agg.get("mean_all_nums_per_topic", 0)
        ks1_status = "CONCERN" if (mean_pct < 3 and mean_all < 5) else "PASS"
        print(f"  KS1 — Signal density: {ks1_status} (mean pct/topic={mean_pct}, "
              f"all nums/topic={mean_all})")

        # KS3: Fabrication detectable?
        fab = agg.get("pct_fab_rate_mean")
        if fab is not None:
            if fab < 0.10:
                print(f"  KS3 — Fabrication: KILL SIGNAL (<10% fab rate: {fab:.1%})")
            elif fab < 0.20:
                print(f"  KS3 — Fabrication: WEAK (10-20% fab rate: {fab:.1%})")
            else:
                print(f"  KS3 — Fabrication: PASS ({fab:.0%} fabrication)")
        else:
            print("  KS3 — Fabrication: NO DATA (no percentage claims found)")

        # KS4: Gradient exists?  (kill when even the lowest topic is >90%)
        fab_range = agg.get("pct_fab_rate_range")
        if fab_range:
            if fab_range[0] > 0.90:
                print(f"  KS4 — Gradient: KILL (all >90%, range {fab_range[0]:.0%}-{fab_range[1]:.0%})")
            else:
                print(f"  KS4 — Gradient: PASS (range {fab_range[0]:.0%}-{fab_range[1]:.0%})")

    # KS5: Cross-generator gradient (includes the original run when loaded)
    fab_rates = {g: d.get("pct_fab_mean") for g, d in generators_compared.items()
                 if d.get("pct_fab_mean") is not None}
    if len(fab_rates) >= 2:
        vals = list(fab_rates.values())
        spread = max(vals) - min(vals)
        print("\nKS5 — Cross-generator gradient:")
        print(f"  Rates: {', '.join(f'{g}={v:.1%}' for g, v in fab_rates.items())}")
        print(f"  Spread: {spread:.1%}")
        if spread < 0.05:
            print("  >>> KILL: <5% spread — fabrication rate is generator-invariant")
            print("  >>> Interpretation: fabrication is universal (STRONG for convergence thesis)")
        elif spread < 0.20:
            print("  >>> MODERATE gradient — fabrication varies but is present across generators")
        else:
            print("  >>> LARGE gradient — fabrication is generator-specific (WEAKENS convergence)")


def _print_verdict(all_generator_results):
    """Print the per-generator 50% threshold check and the overall verdict."""
    if not all_generator_results:
        # Without this guard the loop below is skipped and the "universal"
        # verdict would be declared on no evidence at all.
        print("\n  NO VERDICT: no generator produced documents to analyze.")
        return

    universal = True
    for gen_label, gen_data in all_generator_results.items():
        fab = gen_data["aggregate"].get("pct_fab_rate_mean")
        if fab is not None and fab < 0.50:
            universal = False
            print(f"  {gen_label}: {fab:.0%} fabrication — BELOW 50% threshold")
        elif fab is not None:
            print(f"  {gen_label}: {fab:.0%} fabrication — ABOVE 50% threshold")
        else:
            universal = False
            print(f"  {gen_label}: No percentage data — CANNOT ASSESS")

    if universal:
        print("\n  UNIVERSAL FABRICATION: All generators show majority fabrication.")
        print("  The convergence thesis strengthens — constrained generation")
        print("  produces fabricated specificity regardless of model family.")
    else:
        print("\n  GENERATOR-SPECIFIC: Fabrication rate varies significantly.")
        print("  The convergence thesis needs revision — the mechanism is")
        print("  not universal but model-family-dependent.")


def main():
    """Run the EXP-078b replication end to end.

    Phases: (1) generate or load documents, (2) analyze each generator,
    (3) compare generators with each other and with the original run when
    its results file exists, (4) print the kill-signal assessment,
    (5) print the verdict, then write everything to ``RESULTS_PATH``.

    Note: the kill-signal phase evaluates KS1, KS3, KS4 and KS5 only; it
    contains no check for KS2 (STANDARD vs BASIC discrimination).
    """
    print("=" * 70)
    print("EXP-078b: CROSS-GENERATOR TEMPORAL CONSISTENCY REPLICATION")
    print("=" * 70)
    print(f"Generators: xAI ({GENERATOR_MODEL}), Gemini 3 Flash ({GEMINI_GEN_MODEL})")
    print(f"Topics: {len(TOPICS)}")
    print(f"STANDARD: {len(TOPICS)} × {N_SAMPLES_STANDARD} versions × 2 generators "
          f"= {len(TOPICS) * N_SAMPLES_STANDARD * 2}")
    print(f"BASIC: 5 × {N_SAMPLES_BASIC} versions × 2 generators "
          f"= {5 * N_SAMPLES_BASIC * 2}")
    print("No LLM evaluation — pure regex extraction + set comparison")
    print()

    # === Phase 1: Generate documents ===
    docs = generate_all_documents()

    # === Phase 2: Analyze each generator ===
    all_generator_results = _analyze_all_generators(docs, ["xai", "gemini3f"])

    # === Phase 3: Cross-generator comparison ===
    _print_section("CROSS-GENERATOR COMPARISON")

    # Loaded after the header so its "not found" notice appears under it.
    gemini_orig = load_original_gemini_results()

    generators_compared = _build_comparison(gemini_orig, all_generator_results)
    _print_comparison_table(generators_compared)

    # Per-topic comparison (matched topics only)
    if gemini_orig:
        _print_per_topic_rates(gemini_orig, all_generator_results)

    # === Phase 4: Kill signal assessment ===
    _print_section("KILL SIGNAL ASSESSMENT")
    _print_kill_signals(all_generator_results, generators_compared)

    # === Phase 5: Verdict ===
    _print_section("VERDICT")
    _print_verdict(all_generator_results)

    # === Save results ===
    output = {
        "experiment": "EXP-078b_cross_generator_temporal_consistency",
        "method": "Cross-generation numerical claim variance (same as EXP-078 Layer 2)",
        "generators": {
            gen: {
                "model": data["generator_model"],
                "n_standard_topics": data["n_standard_topics"],
                "n_basic_topics": data["n_basic_topics"],
                "aggregate": data["aggregate"],
                "standard_results": data["standard_results"],
                "basic_results": data["basic_results"],
            }
            for gen, data in all_generator_results.items()
        },
        "cross_generator_comparison": generators_compared,
        "n_topics": len(TOPICS),
        "n_standard_samples": N_SAMPLES_STANDARD,
        "n_basic_samples": N_SAMPLES_BASIC,
        "prompts_source": "fpr_estimation.py (identical)",
        "analysis_pipeline": "claim_extraction.py (identical)",
        "llm_evaluation": "NONE — pure regex extraction + set comparison",
    }

    with open(RESULTS_PATH, "w") as f:
        # default=str: stringify anything in the results json cannot encode.
        json.dump(output, f, indent=2, default=str)
    print(f"\nResults saved to {RESULTS_PATH.name}")


if __name__ == "__main__":
    main()