"""
EXP-081c: Commensurable Bridge

Closes the measurement gap between EXP-048-080 (temporal consistency)
and EXP-081 (source matching) by applying BOTH measurements to the
SAME documents under BOTH conditions (source-present, source-absent).

Design:
  3 topics × 2 conditions × 3 versions = 18 documents (T3-CURRENT)
  Source-present: v1+v2 from 081b + 1 new = 3 versions
  Source-absent: 3 new versions (same prompt, no source material)

Measurements (all zero-LLM):
  1. Source matching: % of numbers appearing in source document
  2. Temporal consistency: % of numbers stable across 3 versions
  3. Cross-tabulation: source-matched × temporally-stable

Bridge prediction:
  - Source-present: high source match (~92%), high temporal stability
  - Source-absent: low coincidental source match, low temporal stability
  - Source-matched numbers should be temporally stable (convergent validity)
"""

import json
import os
import re
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from _config import (
    GENERATOR_MODEL,
    get_generator_client,
    call_generator,
    format_results,
)

# Directory that holds the per-topic source documents: the "sources" folder
# located next to this script.
SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources")


def load_source(filename):
    """Return the full text of a source document stored under SOURCE_DIR.

    Args:
        filename: Path of the document relative to SOURCE_DIR.

    Returns:
        The file contents as one string.

    Raises:
        OSError: If the file is missing or cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = os.path.join(SOURCE_DIR, filename)
    # Decode explicitly as UTF-8 so the text (and therefore every number
    # match computed against it) does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# ============================================================
# Topics (same as 081/081b)
# ============================================================

# Per-topic experiment configuration, keyed by a short topic id.
#   source_file - file name passed to load_source() (resolved in SOURCE_DIR)
#   topic       - topic statement interpolated into the T3 prompts
#   judge_role / judge_axis / judge_cares - persona fields interpolated into
#                 the "JUDGE:" and "Cares about:" lines of the T3 prompts
TOPICS = {
    "remote_work": {
        "source_file": "source_1_remote_work.md",
        "topic": (
            "Remote and hybrid work effectiveness: what the productivity data "
            "actually shows vs. what companies claim, where task type determines "
            "optimal location, and why the flexibility-retention link matters "
            "more than the productivity debate"
        ),
        "judge_role": "Engineering manager, 5+ years leading distributed teams",
        "judge_axis": (
            "Actionable diagnosis of what actually changes productivity; "
            "distinguishing signal from preference"
        ),
        "judge_cares": (
            "What actually changes productivity by task type; flexibility as "
            "retention lever vs productivity lever; cost-benefit that includes "
            "hidden overhead"
        ),
    },
    "communication": {
        "source_file": "source_3_communication.md",
        "topic": (
            "Internal communication as organizational infrastructure: why "
            "communication satisfaction is the lowest-rated workplace factor "
            "despite having the largest impact on retention, satisfaction, and "
            "productivity — and what the non-desk worker gap reveals about "
            "systemic failures"
        ),
        "judge_role": (
            "Executive coach, 50+ leader engagements, tracks behavioral outcomes"
        ),
        "judge_axis": (
            "Observable behavioral patterns with measurable downstream effects; "
            "channel-specific trust data"
        ),
        "judge_cares": (
            "Observable patterns with measurable effects; channel trust data; "
            "the gap between communication impact and communication quality; "
            "non-desk vs desk-based disparities"
        ),
    },
    "ai_workflows": {
        "source_file": "source_4_ai_workflows.md",
        "topic": (
            "AI-assisted development workflows: how developers actually use AI "
            "coding tools, where they trust vs. distrust the output, and what "
            "the productivity claims miss about debugging overhead and code "
            "quality trade-offs"
        ),
        "judge_role": (
            "Staff engineer who's shipped AI-integrated features AND maintained "
            "them 6+ months"
        ),
        "judge_axis": (
            "Trust/accuracy trade-offs in production; debugging overhead vs "
            "generation speed; what code quality metrics actually show"
        ),
        # NOTE(review): this text is identical to judge_axis above, unlike the
        # other two topics — confirm the duplication is intentional.
        "judge_cares": (
            "Trust/accuracy trade-offs in production; debugging overhead vs "
            "generation speed; what code quality metrics actually show"
        ),
    },
}

# Number of document versions per (topic, condition) cell.
VERSIONS = 3  # Need 3 for temporal consistency


# ============================================================
# Prompt (T3-CURRENT, with and without source)
# ============================================================

def t3_prompt_with_source(source, topic, judge_role, judge_axis, judge_cares):
    """Build the T3-CURRENT prompt with the source document prepended.

    The prompt consists of, in order: the source block, the writing task,
    the judge persona, the FAIL criteria, the GUARANTEE criteria (including
    the source-traceability rule) and the closing instruction.

    Args:
        source: Full text of the source document.
        topic: Topic statement the document must analyse.
        judge_role: Persona of the evaluating judge.
        judge_axis: What the judge evaluates for.
        judge_cares: What the judge cares about.

    Returns:
        The complete prompt as a single string.
    """
    source_block = f"SOURCE MATERIAL:\n{source}\n\n---\n\n"

    task = (
        f"Write an analytical document (~1000 words) on the following topic:\n\n"
        f"{topic}\n\n"
    )

    judge = (
        f"JUDGE: {judge_role} — evaluating for {judge_axis}\n"
        f"  Cares about: {judge_cares}\n\n"
    )

    fail_criteria = (
        f"FAIL (any = failure):\n"
        f"- Structure survives topic swap. Headings that work for any document\n"
        f"  in this domain = generic, not shaped.\n"
        f"- Claims hedge without conditions. \"It depends\" only with explicit\n"
        f"  conditions and consequences. \"On the other hand\" after every claim\n"
        f"  = uncommitted.\n"
        f"- Paragraph survives removal without changing the argument = padding.\n"
        f"- Analysis survives context-swap. Generic = failure.\n"
        f"- Claims presented as established fact without sourcing. If you cite\n"
        f"  a number, name the source. If you can't \u2192 qualitative language.\n"
        f"- Position without falsifier. If nothing could prove your thesis\n"
        f"  wrong, it's not a thesis.\n\n"
    )

    guarantees = (
        f"Must GUARANTEE:\n"
        f"- Every factual claim traces to the source material OR is explicitly\n"
        f"  labeled EXTENDS (beyond source) with confidence level (H/M/L).\n"
        f"- Mechanism for each causal claim: HOW it produces the outcome.\n\n"
    )

    closing = f"Write the analysis now. Use markdown with ## headings."

    sections = (source_block, task, judge, fail_criteria, guarantees, closing)
    return "".join(sections)


def t3_prompt_without_source(topic, judge_role, judge_axis, judge_cares):
    """Build the T3-CURRENT prompt with no source material in context.

    Compared with the source-present prompt, this one has no SOURCE MATERIAL
    block and no source-traceability rule under "Must GUARANTEE"; the task,
    judge persona, FAIL criteria and closing instruction are the same text.

    Args:
        topic: Topic statement the document must analyse.
        judge_role: Persona of the evaluating judge.
        judge_axis: What the judge evaluates for.
        judge_cares: What the judge cares about.

    Returns:
        The complete prompt as a single string.
    """
    task = (
        f"Write an analytical document (~1000 words) on the following topic:\n\n"
        f"{topic}\n\n"
    )

    judge = (
        f"JUDGE: {judge_role} — evaluating for {judge_axis}\n"
        f"  Cares about: {judge_cares}\n\n"
    )

    fail_criteria = (
        f"FAIL (any = failure):\n"
        f"- Structure survives topic swap. Headings that work for any document\n"
        f"  in this domain = generic, not shaped.\n"
        f"- Claims hedge without conditions. \"It depends\" only with explicit\n"
        f"  conditions and consequences. \"On the other hand\" after every claim\n"
        f"  = uncommitted.\n"
        f"- Paragraph survives removal without changing the argument = padding.\n"
        f"- Analysis survives context-swap. Generic = failure.\n"
        f"- Claims presented as established fact without sourcing. If you cite\n"
        f"  a number, name the source. If you can't \u2192 qualitative language.\n"
        f"- Position without falsifier. If nothing could prove your thesis\n"
        f"  wrong, it's not a thesis.\n\n"
    )

    guarantees = (
        f"Must GUARANTEE:\n"
        f"- Mechanism for each causal claim: HOW it produces the outcome.\n\n"
    )

    closing = f"Write the analysis now. Use markdown with ## headings."

    sections = (task, judge, fail_criteria, guarantees, closing)
    return "".join(sections)


# ============================================================
# Number extraction + matching (from exp081_number_match.py)
# ============================================================

def extract_numbers(text):
    """Extract numeric tokens from *text*, classified by surface form.

    Four patterns are applied in priority order: percentage ("45%", "12.5 %"),
    dollar ("$1,234.56"), decimal ("3.14") and integer (2-6 digits). A match
    that overlaps characters already claimed by a higher-priority pattern is
    skipped, so one token is never reported twice (e.g. "45 %" is only a
    percentage, and the "234" inside "$1,234" is not a separate integer).

    Every occurrence is reported; repeated values are not de-duplicated.

    Args:
        text: Document text to scan.

    Returns:
        A list of dicts, grouped by type in the priority order above and in
        text order within each type. Each dict has:
          "value"   - the numeric string (thousands commas removed for dollars)
          "raw"     - the exact matched text, including "%" or "$"
          "context" - up to 40 characters either side of the match, stripped
          "type"    - "percentage", "dollar", "decimal" or "integer"
    """
    patterns = (
        ("percentage", r'(\d+(?:\.\d+)?)\s*%'),
        # Thousands groups precede the optional fraction so that "$1,234.56"
        # is captured whole rather than being cut off at "$1,234".
        ("dollar", r'\$(\d+(?:,\d{3})*(?:\.\d+)?)'),
        ("decimal", r'(?<!\$)(?<!\d)(\d+\.\d+)(?!%)'),
        ("integer", r'(?<!\$)(?<!\d)(?<!\.)(\d{2,6})(?!\.\d)(?!%)(?!\d)'),
    )

    numbers = []
    claimed = []  # (start, end) spans already attributed to a token

    for ntype, pattern in patterns:
        for m in re.finditer(pattern, text):
            start, end = m.span()
            # Skip fragments of a token a higher-priority pattern already took.
            if any(start < c_end and c_start < end
                   for c_start, c_end in claimed):
                continue
            claimed.append((start, end))

            value = m.group(1)
            if ntype == "dollar":
                value = value.replace(",", "")

            context = text[max(0, start - 40):min(len(text), end + 40)].strip()
            numbers.append({"value": value, "raw": m.group(0),
                            "context": context, "type": ntype})

    return numbers


def number_in_source(number, source_text):
    """Report whether *number* appears in *source_text* in a matching form.

    Matching rules by ``number["type"]``:
      * "percentage": the value followed by optional whitespace and "%".
        A decimal value also matches on its whole part alone (12.5 matches
        "12%"). The value must not be the tail of a longer number, so "5%"
        does not match "45%" or "12.5%".
      * "dollar": "$" followed by the value, tolerating thousands commas in
        the source ("1234" matches "$1,234"). The match must not continue
        into further digits, so "12" does not match "$120" or "$12,000".
      * anything else: the value not adjacent to another digit.

    Args:
        number: Dict with at least "value" (numeric string) and "type".
        source_text: Text of the source document.

    Returns:
        True if a matching occurrence exists, otherwise False.
    """
    val = number["value"]
    ntype = number["type"]

    if ntype == "percentage":
        candidates = [val]
        if "." in val:
            whole = val.split(".")[0]
            if whole:
                candidates.append(whole)
        # The lookbehind rejects a value that is only the tail of a longer
        # number (integer digits or a decimal fraction).
        return any(
            re.search(r'(?<![\d.])' + re.escape(c) + r'\s*%', source_text)
            for c in candidates
        )

    if ntype == "dollar":
        int_part, dot, frac = val.partition(".")
        # Extracted dollar values have their commas stripped, so allow an
        # optional comma between any two integer digits of the source.
        pattern = r'\$' + ",?".join(re.escape(ch) for ch in int_part)
        if dot:
            pattern += re.escape(dot + frac)
        # Reject prefix hits: no further digit or ",digit" group may follow.
        return bool(re.search(pattern + r'(?!,?\d)', source_text))

    return bool(re.search(r'(?<!\d)' + re.escape(val) + r'(?!\d)', source_text))


def classify_year(val):
    """Return True when *val* parses as an integer in the range 2019-2030.

    Values that ``int()`` rejects with ValueError (e.g. "12.5") are not years.
    """
    try:
        year = int(val)
    except ValueError:
        return False
    return 2019 <= year <= 2030


def is_word_count(num, text):
    """Report whether *num* looks like a self-reported word count.

    Each occurrence of ``num["raw"]`` in *text* is inspected together with
    the 60 characters before it and the 20 after it; if that window contains
    "word count" (case-insensitive) the number is treated as a word count.
    This also covers "total word count" and "word count so far", which
    contain the same phrase.

    Occurrences embedded in a longer digit run are ignored, so "50" is not
    classified as a word count merely because "1500" sits next to the phrase.

    Note that the check is per value, not per position: the extracted number
    carries no offset, so any qualifying occurrence of the same raw text
    classifies it.

    Args:
        num: Number dict with a "raw" key holding the matched text.
        text: The document the number was extracted from.

    Returns:
        True if any occurrence sits next to "word count", otherwise False.
    """
    pattern = r'(?<!\d)' + re.escape(num["raw"]) + r'(?!\d)'
    for m in re.finditer(pattern, text):
        window = text[max(0, m.start() - 60):m.end() + 20].lower()
        if "word count" in window:
            return True
    return False


def analyze_numbers(text, source):
    """Classify every number in *text* as present in or absent from *source*.

    Numbers are taken from extract_numbers(). Two kinds are excluded from the
    tallies: calendar years (see classify_year) and self-reported word counts
    (see is_word_count). The rest are checked with number_in_source().

    Args:
        text: Generated document to analyse.
        source: Source document the numbers are matched against.

    Returns:
        Dict with:
          "total_numbers"        - numbers counted (in_source + not_in_source)
          "in_source"            - how many were found in the source
          "not_in_source"        - how many were not
          "unsourced_rate"       - not_in_source / total_numbers, rounded to
                                   3 places (0 when nothing was counted)
          "years_excluded"       - numbers skipped as calendar years
          "word_counts_excluded" - numbers skipped as word counts
          "in_source_values" / "not_in_source_values" - the value strings
    """
    in_source = []
    not_in_source = []
    years = 0
    word_counts = 0

    for num in extract_numbers(text):
        # Only a bare integer can be a calendar year. A dollar amount such as
        # "$2,024" or a percentage is a real quantity and must be matched
        # against the source rather than silently excluded.
        if num["type"] == "integer" and classify_year(num["value"]):
            years += 1
        elif is_word_count(num, text):
            word_counts += 1
        elif number_in_source(num, source):
            in_source.append(num)
        else:
            not_in_source.append(num)

    total = len(in_source) + len(not_in_source)
    rate = len(not_in_source) / total if total > 0 else 0

    return {
        "total_numbers": total,
        "in_source": len(in_source),
        "not_in_source": len(not_in_source),
        "unsourced_rate": round(rate, 3),
        "years_excluded": years,
        "word_counts_excluded": word_counts,
        "in_source_values": [n["value"] for n in in_source],
        "not_in_source_values": [n["value"] for n in not_in_source],
    }


# ============================================================
# Temporal consistency
# ============================================================

def temporal_consistency(docs_numbers):
    """Compute how stable number values are across document versions.

    Each version contributes the set of its value strings (in-source and
    not-in-source combined; the number type is not part of the key, so "45"
    as a percentage and "45" as an integer are the same value). A value is:
      - STABLE if it appears in every version
      - UNSTABLE if it appears in only some of them

    Args:
        docs_numbers: List of analyze_numbers() results, one per version.
            The experiment passes three, but any count is accepted.

    Returns:
        Dict with "total_unique", "stable", "unstable", "stability_rate",
        "instability_rate" (rates rounded to 3 places, 0 when there are no
        numbers), and "stable_values" / "unstable_values" as sorted lists.
        The same keys are returned even when no numbers were found.
    """
    version_sets = [
        set(nm["in_source_values"]) | set(nm["not_in_source_values"])
        for nm in docs_numbers
    ]

    all_nums = set().union(*version_sets)
    stable_set = all_nums.intersection(*version_sets)

    # Sorted so the saved results are reproducible; set iteration order of
    # strings varies between interpreter runs.
    stable_vals = sorted(stable_set)
    unstable_vals = sorted(all_nums - stable_set)

    total = len(all_nums)
    stable = len(stable_vals)
    unstable = len(unstable_vals)

    return {
        "total_unique": total,
        "stable": stable,
        "unstable": unstable,
        "stability_rate": round(stable / total, 3) if total > 0 else 0,
        "instability_rate": round(unstable / total, 3) if total > 0 else 0,
        "stable_values": stable_vals,
        "unstable_values": unstable_vals,
    }


def _value_in_source(val, source):
    """Return True if *val* matches the source under any tried number type."""
    # temporal_consistency() keeps only value strings, so the original type
    # is unknown here; try each form and accept the first that matches.
    return any(
        number_in_source({"value": val, "type": ntype}, source)
        for ntype in ("percentage", "integer", "decimal")
    )


def cross_tabulate(docs_numbers, source, temporal_result):
    """Cross-tabulate source matching × temporal stability.

    Every stable and every unstable value from *temporal_result* is checked
    against *source* and counted in one of four cells.

    Args:
        docs_numbers: Per-version analyze_numbers() results. Not used by the
            computation; kept so existing callers keep working.
        source: Source document text.
        temporal_result: Result of temporal_consistency(). Missing
            "stable_values" / "unstable_values" keys are treated as empty.

    Returns:
        Dict of counts keyed "stable_in_source", "stable_not_in_source",
        "unstable_in_source" and "unstable_not_in_source".
    """
    counts = {
        "stable_in_source": 0,
        "stable_not_in_source": 0,
        "unstable_in_source": 0,
        "unstable_not_in_source": 0,
    }

    for label in ("stable", "unstable"):
        # .get() because a temporal result for "no numbers at all" may not
        # carry the value lists.
        for val in set(temporal_result.get(f"{label}_values", [])):
            cell = "in_source" if _value_in_source(val, source) else "not_in_source"
            counts[f"{label}_{cell}"] += 1

    return counts


# ============================================================
# Main
# ============================================================

def _banner(title):
    """Print a section banner: a blank line, a 60-char rule, title, rule."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"  {title}")
    print(rule)


def _generate_document(client, model, prompt, label):
    """Generate one document and print progress for *label*; return the text."""
    print(f"  {label}...", end=" ", flush=True)
    t0 = time.time()
    text = call_generator(client, model, [{"role": "user", "content": prompt}])
    elapsed = time.time() - t0
    print(f"done ({len(text.split())}w, {elapsed:.1f}s)")
    # Fixed 3 s pause after every call (presumably rate limiting).
    time.sleep(3)
    return text


def main():
    """Run EXP-081c end to end and write exp081_bridge_results.json.

    Steps:
      1. Load the source documents and the existing 081b results.
      2. Source-present: reuse the T3_CURRENT texts from 081b and generate
         one more version per topic.
      3. Source-absent: generate VERSIONS documents per topic.
      4. Run source matching, temporal consistency and the cross-tabulation
         for every (topic, condition) cell, then print the summaries.
      5. Save all results as JSON next to this script.

    Raises:
        RuntimeError: If 081b does not provide exactly VERSIONS - 1
            T3_CURRENT texts for a topic (checked before any generation),
            or a cell does not end up with VERSIONS documents.
    """
    conditions = ("source_present", "source_absent")
    total_docs = len(TOPICS) * len(conditions) * VERSIONS
    here = os.path.dirname(__file__)

    client, model = get_generator_client()
    print(f"Generator: {model}")
    print(f"Design: {len(TOPICS)} topics × {len(conditions)} conditions × "
          f"{VERSIONS} versions = {total_docs} documents")
    print("Measurements: source matching + temporal consistency")
    print()

    # Load sources
    sources = {}
    for topic_key, cfg in TOPICS.items():
        sources[topic_key] = load_source(cfg["source_file"])
        print(f"  Source [{topic_key}]: {len(sources[topic_key])} chars")

    # Load existing 081b results for source-present v1, v2
    results_path = os.path.join(here, "exp081_prompt_arch_results.json")
    with open(results_path, encoding="utf-8") as f:
        existing = json.load(f)
    existing_results = existing["findings"]["results"]

    # Storage: {topic: {condition: [v1_text, v2_text, v3_text]}}
    docs = {t: {"source_present": [], "source_absent": []} for t in TOPICS}

    # Validate the reused documents for every topic BEFORE spending any
    # generator calls; a wrong count would otherwise only surface in the
    # analysis step, after all documents had been generated.
    preloaded_needed = VERSIONS - 1
    for topic_key in TOPICS:
        preloaded = [r["text"] for r in existing_results
                     if r["arch"] == "T3_CURRENT" and r["topic"] == topic_key]
        if len(preloaded) != preloaded_needed:
            raise RuntimeError(
                f"Expected {preloaded_needed} T3_CURRENT documents for "
                f"{topic_key!r} in {results_path}, found {len(preloaded)}")
        docs[topic_key]["source_present"] = preloaded

    # === SOURCE-PRESENT: v1, v2 from 081b, generate v3 ===
    _banner("SOURCE-PRESENT (v1+v2 from 081b + v3 new)")
    for topic_key, cfg in TOPICS.items():
        print(f"  {topic_key}: loaded v1, v2 from 081b "
              f"({len(docs[topic_key]['source_present'])} docs)")
        prompt = t3_prompt_with_source(
            sources[topic_key], cfg["topic"], cfg["judge_role"],
            cfg["judge_axis"], cfg["judge_cares"])
        docs[topic_key]["source_present"].append(
            _generate_document(client, model, prompt,
                               f"{topic_key} v{VERSIONS}"))

    # === SOURCE-ABSENT: generate VERSIONS documents each ===
    _banner("SOURCE-ABSENT (3 new versions each)")
    for topic_key, cfg in TOPICS.items():
        for v in range(VERSIONS):
            prompt = t3_prompt_without_source(
                cfg["topic"], cfg["judge_role"],
                cfg["judge_axis"], cfg["judge_cares"])
            docs[topic_key]["source_absent"].append(
                _generate_document(client, model, prompt,
                                   f"{topic_key} v{v+1}"))

    # ============================================================
    # Analysis
    # ============================================================

    _banner("ANALYSIS")
    all_results = {}

    for topic_key in TOPICS:
        source = sources[topic_key]
        all_results[topic_key] = {}

        for condition in conditions:
            texts = docs[topic_key][condition]
            # Explicit check instead of assert, which is stripped under -O.
            if len(texts) != VERSIONS:
                raise RuntimeError(
                    f"Expected {VERSIONS} versions, got {len(texts)}")

            # Number matching for each version
            nm_results = [analyze_numbers(t, source) for t in texts]

            # Temporal consistency across the versions
            tc = temporal_consistency(nm_results)

            # Cross-tabulation
            xt = cross_tabulate(nm_results, source, tc)

            # Aggregate source matching
            total_nums = sum(r["total_numbers"] for r in nm_results)
            total_sourced = sum(r["in_source"] for r in nm_results)
            total_unsourced = sum(r["not_in_source"] for r in nm_results)
            source_rate = total_sourced / total_nums if total_nums > 0 else 0

            all_results[topic_key][condition] = {
                "per_version": nm_results,
                "temporal": tc,
                "cross_tab": xt,
                "aggregate": {
                    "total_numbers": total_nums,
                    "in_source": total_sourced,
                    "not_in_source": total_unsourced,
                    "source_match_rate": round(source_rate, 3),
                },
            }

            print(f"\n  {topic_key} | {condition}")
            print(f"    Source matching: {total_sourced}/{total_nums} "
                  f"= {source_rate:.1%} in source")
            print(f"    Temporal: {tc['stable']}/{tc['total_unique']} "
                  f"stable = {tc['stability_rate']:.1%}")
            print(f"    Cross-tab: stable+sourced={xt['stable_in_source']}, "
                  f"stable+unsourced={xt['stable_not_in_source']}, "
                  f"unstable+sourced={xt['unstable_in_source']}, "
                  f"unstable+unsourced={xt['unstable_not_in_source']}")

    # ============================================================
    # Summary: The Bridge
    # ============================================================

    _banner("COMMENSURABLE BRIDGE: Source-Present vs Source-Absent")

    # Aggregate across topics
    for condition in conditions:
        cells = [all_results[t][condition] for t in TOPICS]
        total_nums = sum(c["aggregate"]["total_numbers"] for c in cells)
        total_sourced = sum(c["aggregate"]["in_source"] for c in cells)
        total_unique = sum(c["temporal"]["total_unique"] for c in cells)
        total_stable = sum(c["temporal"]["stable"] for c in cells)

        source_rate = total_sourced / total_nums if total_nums > 0 else 0
        stability = total_stable / total_unique if total_unique > 0 else 0

        print(f"\n  {condition}:")
        print(f"    Source match rate: {total_sourced}/{total_nums} = {source_rate:.1%}")
        print(f"    Temporal stability: {total_stable}/{total_unique} = {stability:.1%}")

    # Cross-tab cells summed over topics, computed once for both tables below.
    xt_keys = ("stable_in_source", "stable_not_in_source",
               "unstable_in_source", "unstable_not_in_source")
    xt_totals = {
        condition: {
            key: sum(all_results[t][condition]["cross_tab"][key]
                     for t in TOPICS)
            for key in xt_keys
        }
        for condition in conditions
    }

    # Cross-tabulation summary
    print("\n  --- Cross-Tabulation (convergent validity) ---")
    print(f"  {'':20s} {'Stable':>10s} {'Unstable':>10s}")
    for condition in conditions:
        agg = xt_totals[condition]
        print(f"  {condition}:")
        print(f"    In source:     {agg['stable_in_source']:>10d} "
              f"{agg['unstable_in_source']:>10d}")
        print(f"    Not in source: {agg['stable_not_in_source']:>10d} "
              f"{agg['unstable_not_in_source']:>10d}")

    # Convergent validity test
    print("\n  --- Convergent Validity Test ---")
    print("  Prediction: source-matched numbers should be temporally stable")
    print("  Prediction: unsourced numbers should be temporally unstable")

    for condition in conditions:
        agg = xt_totals[condition]
        total_stable = agg["stable_in_source"] + agg["stable_not_in_source"]
        total_unstable = (agg["unstable_in_source"]
                          + agg["unstable_not_in_source"])
        pct_stable_sourced = (
            agg["stable_in_source"] / total_stable if total_stable > 0 else 0)
        pct_unstable_unsourced = (
            agg["unstable_not_in_source"] / total_unstable
            if total_unstable > 0 else 0)
        print(f"  {condition}:")
        print(f"    % of stable numbers that are sourced: {pct_stable_sourced:.1%}")
        print(f"    % of unstable numbers that are unsourced: {pct_unstable_unsourced:.1%}")

    # Save
    output_path = os.path.join(here, "exp081_bridge_results.json")
    output = format_results(
        findings={
            "experiment": "EXP-081c-commensurable-bridge",
            "description": (
                "Commensurable bridge between temporal consistency and source "
                "matching measurements. Same measurement tool applied to "
                "source-present and source-absent conditions. Cross-tabulation "
                "validates convergent validity of the two measurement systems."
            ),
            "design": {
                "topics": len(TOPICS),
                "conditions": list(conditions),
                "versions_per_cell": VERSIONS,
                "total_docs": total_docs,
                "measurements": ["source_matching", "temporal_consistency",
                                  "cross_tabulation"],
            },
            "results": all_results,
        },
        generator=model,
    )

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\n  Saved: {output_path}")


if __name__ == "__main__":
    main()