"""
EXP-081d: Cross-Generator Replication of Prohibition

Tests whether the prohibition > monitoring finding (5x, EXP-081b)
replicates across model families. If it holds, it's a robust
prompt architecture principle. If not, it's a model quirk.

Design:
  3 topics × 2 architectures (CURRENT, PROHIBITION) × 2 versions = 12 docs
  Generator: gemini-3-flash-preview (vs grok-4-1-fast in 081b)
  Measurement: programmatic number matching (zero LLM)

Replication target:
  081b (xAI):  CURRENT 7.7%, PROHIBITION 1.6% (5x improvement)
  081d (Gemini): ?
"""

import json
import os
import re
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from _config import (
    get_gemini_client,
    gemini_evaluate,
    format_results,
)

# Generator under test. main() passes this as `model` to gemini_evaluate and
# as `generator` to format_results.
GEMINI_MODEL = "gemini-3-flash-preview"

# Directory of per-topic source documents, resolved relative to this script.
SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources")


def load_source(filename):
    """Read one source document from ``SOURCE_DIR`` and return its text.

    Args:
        filename: File name relative to ``SOURCE_DIR``.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file does not exist or cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = os.path.join(SOURCE_DIR, filename)
    # Decode explicitly as UTF-8 so the text fed to the generator and to the
    # number matcher does not vary with the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# ============================================================
# Topics (same as 081/081b)
# ============================================================

# Per-topic experiment configuration, keyed by a short topic id.
#   source_file -> file name inside SOURCE_DIR (read via load_source)
#   topic       -> subject line interpolated into the generation prompt
#   judge_role / judge_axis / judge_cares -> evaluator persona fields
#                  interpolated into the prompt's JUDGE section
TOPICS = {
    "remote_work": {
        "source_file": "source_1_remote_work.md",
        "topic": (
            "Remote and hybrid work effectiveness: what the productivity data "
            "actually shows vs. what companies claim, where task type determines "
            "optimal location, and why the flexibility-retention link matters "
            "more than the productivity debate"
        ),
        "judge_role": "Engineering manager, 5+ years leading distributed teams",
        "judge_axis": (
            "Actionable diagnosis of what actually changes productivity; "
            "distinguishing signal from preference"
        ),
        "judge_cares": (
            "What actually changes productivity by task type; flexibility as "
            "retention lever vs productivity lever; cost-benefit that includes "
            "hidden overhead"
        ),
    },
    "communication": {
        "source_file": "source_3_communication.md",
        "topic": (
            "Internal communication as organizational infrastructure: why "
            "communication satisfaction is the lowest-rated workplace factor "
            "despite having the largest impact on retention, satisfaction, and "
            "productivity — and what the non-desk worker gap reveals about "
            "systemic failures"
        ),
        "judge_role": (
            "Executive coach, 50+ leader engagements, tracks behavioral outcomes"
        ),
        "judge_axis": (
            "Observable behavioral patterns with measurable downstream effects; "
            "channel-specific trust data"
        ),
        "judge_cares": (
            "Observable patterns with measurable effects; channel trust data; "
            "the gap between communication impact and communication quality; "
            "non-desk vs desk-based disparities"
        ),
    },
    "ai_workflows": {
        "source_file": "source_4_ai_workflows.md",
        "topic": (
            "AI-assisted development workflows: how developers actually use AI "
            "coding tools, where they trust vs. distrust the output, and what "
            "the productivity claims miss about debugging overhead and code "
            "quality trade-offs"
        ),
        "judge_role": (
            "Staff engineer who's shipped AI-integrated features AND maintained "
            "them 6+ months"
        ),
        # NOTE(review): judge_axis and judge_cares below are the same string,
        # unlike the other two topics. Possibly a copy-paste carried over from
        # 081/081b -- confirm before changing, since the prompts are meant to
        # match the earlier runs.
        "judge_axis": (
            "Trust/accuracy trade-offs in production; debugging overhead vs "
            "generation speed; what code quality metrics actually show"
        ),
        "judge_cares": (
            "Trust/accuracy trade-offs in production; debugging overhead vs "
            "generation speed; what code quality metrics actually show"
        ),
    },
}

# Number of documents generated per (topic, architecture) pair.
VERSIONS = 2


# ============================================================
# Prompts (same as 081b)
# ============================================================

def t3_current(source, topic, judge_role, judge_axis, judge_cares):
    """Assemble the T3-CURRENT generation prompt.

    This variant asks for sourcing through the FAIL / GUARANTEE rubric:
    numbers should name their source, and claims beyond the source are to be
    labeled EXTENDS with a confidence level.

    Args:
        source: Full text of the source material, placed at the top.
        topic: Subject the document should analyze.
        judge_role: Evaluator persona named on the JUDGE line.
        judge_axis: What the evaluator is evaluating for.
        judge_cares: What the evaluator cares about.

    Returns:
        The prompt as one newline-separated string.
    """
    preamble = [
        "SOURCE MATERIAL:",
        f"{source}",
        "",
        "---",
        "",
        "Write an analytical document (~1000 words) on the following topic:",
        "",
        f"{topic}",
        "",
        f"JUDGE: {judge_role} — evaluating for {judge_axis}",
        f"  Cares about: {judge_cares}",
        "",
    ]
    fail_rules = [
        "FAIL (any = failure):",
        "- Structure survives topic swap. Headings that work for any document",
        "  in this domain = generic, not shaped.",
        "- Claims hedge without conditions. \"It depends\" only with explicit",
        "  conditions and consequences. \"On the other hand\" after every claim",
        "  = uncommitted.",
        "- Paragraph survives removal without changing the argument = padding.",
        "- Analysis survives context-swap. Generic = failure.",
        "- Claims presented as established fact without sourcing. If you cite",
        "  a number, name the source. If you can't \u2192 qualitative language.",
        "- Position without falsifier. If nothing could prove your thesis",
        "  wrong, it's not a thesis.",
        "",
    ]
    guarantees = [
        "Must GUARANTEE:",
        "- Every factual claim traces to the source material OR is explicitly",
        "  labeled EXTENDS (beyond source) with confidence level (H/M/L).",
        "- Mechanism for each causal claim: HOW it produces the outcome.",
        "",
    ]
    closing = ["Write the analysis now. Use markdown with ## headings."]

    # Every element is one output line; an empty string yields a blank line.
    return "\n".join(preamble + fail_rules + guarantees + closing)


def t3_prohibition(source, topic, judge_role, judge_axis, judge_cares):
    """Assemble the T3-PROHIBITION generation prompt.

    This variant adds an explicit NUMERICAL CONSTRAINT section that forbids
    any number not present in the source material, and uses a shortened
    FAIL / GUARANTEE rubric.

    Args:
        source: Full text of the source material, placed at the top.
        topic: Subject the document should analyze.
        judge_role: Evaluator persona named on the JUDGE line.
        judge_axis: What the evaluator is evaluating for.
        judge_cares: What the evaluator cares about.

    Returns:
        The prompt as one newline-separated string.
    """
    preamble = [
        "SOURCE MATERIAL:",
        f"{source}",
        "",
        "---",
        "",
        "Write an analytical document (~1000 words) on the following topic:",
        "",
        f"{topic}",
        "",
        f"JUDGE: {judge_role} — evaluating for {judge_axis}",
        f"  Cares about: {judge_cares}",
        "",
    ]
    numeric_constraint = [
        "NUMERICAL CONSTRAINT (ABSOLUTE — violation = automatic failure):",
        "You may ONLY use specific numbers, percentages, and dollar amounts",
        "that appear in the SOURCE MATERIAL above. For any quantitative point",
        "where the source does not provide a number, you MUST use qualitative",
        "language instead (\"most\", \"a significant minority\", \"substantial\",",
        "\"roughly half\"). Do NOT invent illustrative numbers. Do NOT pull",
        "numbers from your training data. If the source says it, cite it.",
        "If the source doesn't say it, use words, not numbers.",
        "",
        "This constraint applies to:",
        "- Percentages in examples and hypotheticals",
        "- Dollar amounts in cost analyses",
        "- Multipliers (e.g., \"1.5x\", \"10x\")",
        "- Thresholds in falsifiers (use \"significant\" not \"15%\")",
        "- Industry benchmarks not in source (use \"industry research suggests\")",
        "",
    ]
    fail_rules = [
        "FAIL (any = failure):",
        "- Structure survives topic swap = generic.",
        "- Claims hedge without conditions = uncommitted.",
        "- Paragraph survives removal = padding.",
        "- Any number not traceable to the source material above = failure.",
        "- Position without falsifier.",
        "",
    ]
    guarantees = [
        "Must GUARANTEE:",
        "- Every number in the document appears in the source material.",
        "- Mechanism for each causal claim: HOW it produces the outcome.",
        "- Analytical depth achieved through reasoning, not invented specifics.",
        "",
    ]
    closing = ["Write the analysis now. Use markdown with ## headings."]

    # Every element is one output line; an empty string yields a blank line.
    return "\n".join(
        preamble + numeric_constraint + fail_rules + guarantees + closing
    )


# ============================================================
# Number matching (from exp081_number_match.py)
# ============================================================

def extract_numbers(text):
    """Extract numeric tokens from *text*, reporting each token once.

    Four categories are scanned in priority order:

    1. ``percentage`` - a number followed by optional whitespace and ``%``
    2. ``dollar``     - ``$`` followed by an amount, with optional thousands
       separators and optional decimal part
    3. ``decimal``    - a bare number containing a decimal point
    4. ``integer``    - a bare run of 2-6 digits (single digits and runs of
       7+ digits are not extracted)

    A match is dropped when it overlaps text already claimed by a
    higher-priority category. Without this, one token was counted several
    times: ``$1,200`` also produced the integer ``200``, ``12.55%`` also
    produced the decimal ``12.5`` (regex backtracking), and ``45 %`` also
    produced the integer ``45``.

    NOTE: this changes counts relative to the previous matcher, so rates from
    earlier runs should be re-scored with this version before comparing.

    Args:
        text: Document text to scan.

    Returns:
        A list of dicts, grouped by category in the order above and in text
        order within a category. Each dict has:
            ``value``   - digits of the number, thousands separators removed
            ``raw``     - the exact matched text (including ``%`` / ``$``)
            ``context`` - up to 40 characters either side, stripped
            ``type``    - one of the four category names
    """
    patterns = (
        ("percentage", r'(\d+(?:\.\d+)?)\s*%'),
        # Thousands groups come before the optional cents so "$1,200.50"
        # is captured whole; the second alternative covers ungrouped amounts.
        ("dollar", r'\$(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)'),
        ("decimal", r'(?<!\$)(?<!\d)(\d+\.\d+)(?!%)'),
        ("integer", r'(?<!\$)(?<!\d)(?<!\.)(\d{2,6})(?!\.\d)(?!%)(?!\d)'),
    )

    numbers = []
    claimed = []  # (start, end) spans owned by higher-priority categories

    for ntype, pattern in patterns:
        spans = []
        for m in re.finditer(pattern, text):
            start, end = m.span()
            if any(start < c_end and c_start < end
                   for c_start, c_end in claimed):
                continue
            spans.append((start, end))
            ctx_start = max(0, start - 40)
            ctx_end = min(len(text), end + 40)
            numbers.append({
                "value": m.group(1).replace(",", ""),
                "raw": m.group(0),
                "context": text[ctx_start:ctx_end].strip(),
                "type": ntype,
            })
        # Matches of one pattern never overlap each other, so spans only need
        # to be visible to the categories that follow.
        claimed.extend(spans)

    return numbers


def _canonical_amount(amount):
    """Normalize a dollar amount string: drop commas and trailing zero cents."""
    amount = amount.replace(",", "")
    if "." in amount:
        amount = amount.rstrip("0").rstrip(".")
    return amount


def number_in_source(number, source_text):
    """Report whether an extracted number also appears in the source text.

    Matching rules by ``number["type"]``:

    * ``percentage`` - the value followed by optional whitespace and ``%``
      must occur in the source as a whole number (``5%`` does not match
      ``45%`` or ``2.5%``). A decimal value also matches when its integer
      part appears as a percentage (``12.5%`` matches a source ``12%``).
    * ``dollar`` - compared against every ``$`` amount in the source after
      removing thousands separators and trailing zero cents, so ``1200``
      matches ``$1,200`` and ``2.5`` matches ``$2.50``. A whole-dollar value
      also matches a source amount that has cents (``50`` matches ``$50.25``)
      but never a longer amount (``50`` does not match ``$500``).
    * anything else - the value must occur with no digit immediately before
      or after it.

    NOTE: the percentage and dollar rules are stricter/looser than the
    previous substring search, so rates from earlier runs should be re-scored
    with this version before comparing.

    Args:
        number: Dict with at least ``value`` (digit string) and ``type``.
        source_text: Source material to search.

    Returns:
        True if the number is considered present in the source.
    """
    val = number["value"]
    ntype = number["type"]

    if ntype == "percentage":
        candidates = [val]
        if "." in val:
            int_val = val.split(".")[0]
            if int_val:
                candidates.append(int_val)
        # The lookbehind stops "5" from matching the tail of "45%" or "2.5%".
        return any(
            re.search(r'(?<![\d.])' + re.escape(c) + r'\s*%', source_text)
            for c in candidates
        )

    if ntype == "dollar":
        wanted = _canonical_amount(val)
        amount_re = r'\$(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)'
        for m in re.finditer(amount_re, source_text):
            found = _canonical_amount(m.group(1))
            if found == wanted:
                return True
            # Whole dollars quoted from a source amount that carries cents.
            if "." not in wanted and found.split(".")[0] == wanted:
                return True
        return False

    return bool(re.search(r'(?<!\d)' + re.escape(val) + r'(?!\d)', source_text))


def classify_year(val):
    """Return True when *val* parses as an integer from 2019 to 2030 inclusive.

    Values that ``int()`` rejects with ``ValueError`` (for example ``"12.5"``)
    are reported as not-a-year.
    """
    try:
        year = int(val)
    except ValueError:
        return False
    return year in range(2019, 2031)


def is_word_count(num, text):
    """Report whether a number looks like a self-reported word count.

    Every standalone occurrence of ``num["raw"]`` in *text* is checked; the
    number counts as a word count if the phrase "word count" (any case)
    appears in the window from 60 characters before the occurrence to 20
    characters after it.

    Occurrences must not be flanked by another digit, so ``100`` is no longer
    found inside ``1000``. The check is still by token text rather than by
    position: if the same token appears both as a word count and elsewhere,
    all of its occurrences are treated as word counts.

    Args:
        num: Dict with a ``raw`` key holding the matched token text.
        text: Full document text.

    Returns:
        True if any standalone occurrence sits next to "word count".
    """
    token = re.compile(r'(?<!\d)' + re.escape(num["raw"]) + r'(?!\d)')
    for m in token.finditer(text):
        start = max(0, m.start() - 60)
        window = text[start:m.end() + 20].lower()
        # "total word count" and "word count so far" both contain this phrase.
        if "word count" in window:
            return True
    return False


def analyze_numbers(text, source):
    """Score how many numbers in a generated document are absent from its source.

    Numbers are extracted from *text*, then each is handled in this order:

    1. Bare integers that ``classify_year`` accepts are excluded as years.
       Percentages and dollar amounts are never treated as years, so a value
       such as ``$2,025`` is still scored.
    2. Numbers that ``is_word_count`` accepts are excluded as word counts.
    3. Everything else is counted as in-source or not-in-source via
       ``number_in_source``.

    Args:
        text: Generated document text.
        source: Source material the document was written from.

    Returns:
        Dict with ``total_numbers`` (scored numbers only), ``in_source``,
        ``not_in_source``, ``unsourced_rate`` (not_in_source / total, rounded
        to 3 places; 0 when nothing was scored), ``years_excluded``,
        ``word_counts_excluded`` and ``unsourced_details`` (value, type, raw
        and context for each unsourced number).
    """
    in_source = []
    not_in_source = []
    years = 0
    wc = 0

    for num in extract_numbers(text):
        if num["type"] == "integer" and classify_year(num["value"]):
            years += 1
        elif is_word_count(num, text):
            wc += 1
        elif number_in_source(num, source):
            in_source.append(num)
        else:
            not_in_source.append(num)

    total = len(in_source) + len(not_in_source)
    rate = len(not_in_source) / total if total > 0 else 0

    return {
        "total_numbers": total,
        "in_source": len(in_source),
        "not_in_source": len(not_in_source),
        "unsourced_rate": round(rate, 3),
        "years_excluded": years,
        "word_counts_excluded": wc,
        "unsourced_details": [
            {"value": n["value"], "type": n["type"],
             "raw": n["raw"], "context": n["context"]}
            for n in not_in_source
        ],
    }


# ============================================================
# Main
# ============================================================

def _pooled_counts(rows):
    """Pool number counts over result rows; return (total, unsourced, rate)."""
    total = sum(r["numbers"]["total_numbers"] for r in rows)
    unsourced = sum(r["numbers"]["not_in_source"] for r in rows)
    rate = unsourced / total if total > 0 else 0
    return total, unsourced, rate


def main():
    """Run the cross-generator replication and write the results file.

    For every topic in ``TOPICS`` and each of the two prompt architectures,
    ``VERSIONS`` documents are generated with ``GEMINI_MODEL`` and scored with
    ``analyze_numbers``. A generation is skipped (and reported as failed) when
    ``gemini_evaluate`` returns ``None``. Afterwards three summaries are
    printed - pooled rate per architecture, a side-by-side with the 081b xAI
    rates, and an architecture x topic matrix - and all results are saved to
    ``exp081_cross_gen_results.json`` next to this script.
    """
    architectures = [("T3_CURRENT", t3_current),
                     ("T3_PROHIBITION", t3_prohibition)]
    arch_names = [name for name, _ in architectures]
    # Pooled unsourced rates reported for EXP-081b (xAI), display only.
    xai_rates = {"T3_CURRENT": "7.7%", "T3_PROHIBITION": "1.6%"}

    client = get_gemini_client()
    model = GEMINI_MODEL
    print(f"Generator: {model}")
    print(f"Topics: {len(TOPICS)}")
    print("Architectures: T3-CURRENT, T3-PROHIBITION")
    print(f"Versions: {VERSIONS}")
    print(f"Total documents: {len(TOPICS) * len(architectures) * VERSIONS}")
    print("Measurement: programmatic number matching (zero LLM)")
    print()

    # Load sources
    sources = {}
    for topic_key, cfg in TOPICS.items():
        sources[topic_key] = load_source(cfg["source_file"])
        print(f"  Source [{topic_key}]: {len(sources[topic_key])} chars")

    results = []

    for topic_key, cfg in TOPICS.items():
        source = sources[topic_key]

        for arch_name, prompt_fn in architectures:
            print(f"\n{'='*60}")
            print(f"  {arch_name} | {topic_key} | {model}")
            print(f"{'='*60}")

            for v in range(VERSIONS):
                print(f"  v{v+1}...", end=" ", flush=True)
                t0 = time.time()
                prompt = prompt_fn(
                    source, cfg["topic"], cfg["judge_role"],
                    cfg["judge_axis"], cfg["judge_cares"])

                text = gemini_evaluate(
                    prompt, model=model, temperature=1.0,
                    max_output_tokens=8192, client=client)

                if text is None:
                    print("FAILED (API error)")
                    continue

                elapsed = time.time() - t0
                words = len(text.split())
                nm = analyze_numbers(text, source)
                print(f"done ({words}w, {nm['total_numbers']} nums, "
                      f"{nm['not_in_source']} unsourced [{nm['unsourced_rate']:.1%}], "
                      f"{elapsed:.1f}s)")
                results.append({
                    "arch": arch_name, "topic": topic_key,
                    "version": v + 1, "word_count": words,
                    "text": text, "numbers": nm,
                })
                # Pause between successful calls to pace API usage.
                time.sleep(2)

    # ============================================================
    # Summary
    # ============================================================

    print(f"\n{'='*60}")
    print(f"  RESULTS: Cross-Generator Replication ({model})")
    print(f"{'='*60}")

    # By architecture
    print("\n  --- By Architecture ---")
    for arch in arch_names:
        arch_r = [r for r in results if r["arch"] == arch]
        if not arch_r:
            print(f"  {arch}: NO DATA")
            continue
        total_nums, total_unsrc, rate = _pooled_counts(arch_r)
        avg_words = sum(r["word_count"] for r in arch_r) / len(arch_r)
        print(f"  {arch}: {total_nums} numbers, "
              f"{total_unsrc} unsourced [{rate:.1%}], "
              f"avg {avg_words:.0f} words")

    # Comparison to xAI
    print("\n  --- Cross-Generator Comparison ---")
    print(f"  {'':20s} {'xAI (081b)':>14s} {'Gemini (081d)':>14s}")
    for arch in arch_names:
        arch_r = [r for r in results if r["arch"] == arch]
        if arch_r:
            _, _, rate = _pooled_counts(arch_r)
            # Pad inside the format spec: a trailing .rjust() on adjacent
            # f-strings applies to the whole concatenated line, not the cell.
            print(f"  {arch:20s} {xai_rates.get(arch, '?'):>14s} "
                  f"{rate:>14.1%}")

    # Matrix: Architecture × Topic
    print("\n  --- Unsourced Rate Matrix ---")
    print(f"  {'':20s} " + " ".join(f"{k:>14s}" for k in TOPICS))
    for arch in arch_names:
        row = f"  {arch:20s}"
        for topic_key in TOPICS:
            cell = [r for r in results
                    if r["arch"] == arch and r["topic"] == topic_key]
            if cell:
                total, unsrc, rate = _pooled_counts(cell)
                cell_text = f"{unsrc}/{total} ({rate:.0%})"
            else:
                cell_text = "N/A"
            # One separating space plus a 14-wide cell, matching the header.
            row += " " + cell_text.rjust(14)
        print(row)

    # Save
    output_path = os.path.join(os.path.dirname(__file__),
                                "exp081_cross_gen_results.json")
    output = format_results(
        findings={
            "experiment": "EXP-081d-cross-generator",
            "description": (
                "Cross-generator replication of prohibition > monitoring "
                "finding. Tests whether EXP-081b's 5x improvement (7.7% → 1.6%) "
                "holds with gemini-3-flash-preview. Same topics, same prompts, "
                "same measurement. Only variable: generator model."
            ),
            "replication_target": {
                "xai_current": 0.077,
                "xai_prohibition": 0.016,
            },
            "results": results,
        },
        generator=model,
    )

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\n  Saved: {output_path}")


# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()