"""
EXP-081 Programmatic Number Matching: Basin-Breaking Measurement

Zero LLM involvement. Extracts every number from generated documents,
searches for each in the source text. Classifies:
  IN_SOURCE  — exact number appears in source document
  NOT_IN_SOURCE — number absent from source (parametric/fabricated/derived)

No judgment calls, no verifier bias, no EXTENDS ambiguity.
Measures what percentage of numerical claims come from source vs elsewhere.
"""

import json
import os
import re
from collections import defaultdict

# Directory holding the source documents, resolved relative to this file.
SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources")


def load_source(filename):
    """Return the full text of a source document stored in SOURCE_DIR.

    Args:
        filename: File name (not a path) of the document inside SOURCE_DIR.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = os.path.join(SOURCE_DIR, filename)
    # Decode explicitly as UTF-8 (assumed encoding of the markdown sources)
    # instead of relying on the platform's locale encoding, which would
    # mis-read non-ASCII characters on systems whose default is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# Topic key -> file name of its source document inside SOURCE_DIR.
_SOURCE_FILES = {
    "remote_work": "source_1_remote_work.md",
    "communication": "source_3_communication.md",
    "ai_workflows": "source_4_ai_workflows.md",
}

# Topic key -> full source text. Every document is read eagerly when this
# module is imported, in the order listed above.
SOURCES = {
    topic: load_source(source_file)
    for topic, source_file in _SOURCE_FILES.items()
}


def extract_numbers(text):
    """Extract numerical values from text, each with surrounding context.

    Patterns are applied from most to least specific: percentages, dollar
    amounts, bare decimals, bare integers. A stretch of text claimed by an
    earlier pattern is never extracted again by a later one, so every
    number in the text is reported at most once.

    Single-digit bare integers are skipped as too ambiguous. Years are NOT
    filtered here; callers that want to exclude them must do so themselves.

    Args:
        text: The document text to scan.

    Returns:
        List of dicts with keys:
          value   -- the number as a string (thousands commas removed for
                     dollar amounts)
          raw     -- the exact matched text (e.g. "43%", "$1,200")
          context -- up to 40 characters either side of the match, stripped
          type    -- "percentage", "dollar", "decimal" or "integer"
        Entries are grouped by type in the order above and, within a type,
        ordered by position in the text.
    """
    numbers = []
    claimed = []  # (start, end) spans already attributed to a number

    def _add(match, value, ntype):
        start, end = match.span()
        # Skip anything overlapping text an earlier pattern already took,
        # e.g. the "200.50" inside "$1,200.50" or the "43" inside "43 %".
        if any(start < c_end and c_start < end for c_start, c_end in claimed):
            return
        claimed.append((start, end))
        ctx_start = max(0, start - 40)
        ctx_end = min(len(text), end + 40)
        numbers.append({
            "value": value,
            "raw": match.group(0),
            "context": text[ctx_start:ctx_end].strip(),
            "type": ntype,
        })

    # Percentages written with a percent sign: 43%, 43.5%, 43 %
    for m in re.finditer(r'(\d+(?:\.\d+)?)\s*%', text):
        _add(m, m.group(1), "percentage")

    # Dollar amounts: $55, $1,200, $1,200.50. The thousands groups must come
    # before the optional fraction, otherwise "$1,200.50" loses its cents.
    for m in re.finditer(r'\$(\d+(?:,\d{3}(?!\d))*(?:\.\d+)?)', text):
        _add(m, m.group(1).replace(",", ""), "dollar")

    # Decimal numbers not already caught (e.g., 2.6). The trailing (?!\d)
    # stops the regex from backtracking to a truncated value such as
    # "43.5" out of "43.55%".
    for m in re.finditer(r'(?<!\$)(?<!\d)(\d+\.\d+)(?!%)(?!\d)', text):
        _add(m, m.group(1), "decimal")

    # Bare integers of 2-6 digits.
    for m in re.finditer(
            r'(?<!\$)(?<!\d)(?<!\.)(\d{1,6})(?!\.\d)(?!%)(?!\d)', text):
        val = m.group(1)
        # Skip single digits (too ambiguous)
        if len(val) == 1:
            continue
        _add(m, val, "integer")

    return numbers


def number_in_source(number, source_text):
    """Check whether an extracted number appears in the source text.

    Matching is anchored on both sides so that a value never matches a
    fragment of a longer number (e.g. "3%" inside "43%", "$55" inside
    "$550", or "56" inside "56.7").

    For percentages: look for "XX%" (optional whitespace before the sign).
        A value whose fraction is all zeros also matches the bare integer
        form (43.0% matches "43%"); other fractions must match exactly.
    For dollars: look for "$XX". Thousands separators in the source are
        ignored, because extracted dollar values have their commas removed.
    For others: look for the standalone number.

    Args:
        number: Dict with at least "value" (str) and "type" (str) keys.
        source_text: The source document text to search.

    Returns:
        True if the number is found in source_text, else False.
    """
    val = number["value"]
    ntype = number["type"]

    # Not preceded by a digit, nor by "<digit>." (the tail of a decimal).
    left = r'(?<!\d)(?<!\d\.)'
    # Not followed by a digit, nor by ".<digit>" (the head of a decimal).
    # A sentence-ending period is still allowed.
    right = r'(?!\d)(?!\.\d)'

    if ntype == "percentage":
        if re.search(left + re.escape(val) + r'\s*%', source_text):
            return True
        # Only a zero fraction is equivalent to the integer (43.0% → 43%);
        # 43.7% must not be counted as sourced from "43%".
        whole, _, frac = val.partition(".")
        if frac and not frac.strip("0"):
            return bool(
                re.search(left + re.escape(whole) + r'\s*%', source_text))
        return False

    if ntype == "dollar":
        # Drop thousands separators so "$1,200" in the source matches the
        # comma-free extracted value "1200".
        normalized = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', source_text)
        return bool(re.search(r'\$' + re.escape(val) + right, normalized))

    # decimal and integer: standalone occurrence of the exact digits
    return bool(re.search(left + re.escape(val) + right, source_text))


def classify_year(val):
    """Return True when val parses as an integer within 2019-2030 inclusive.

    Values that int() rejects with ValueError (e.g. "2.6") are not years.
    """
    try:
        candidate = int(val)
    except ValueError:
        return False
    return 2019 <= candidate <= 2030


def is_word_count(num, text):
    """Check if this number is a document word count, not a claim.

    Looks for patterns like "(Word count: NNN)" or "(Word count so far: NNN)".
    Every standalone occurrence of the number's raw text is inspected; the
    number counts as a word count if any occurrence has the phrase
    "word count" within the 60 characters before it or the 20 after it.

    Args:
        num: Extracted-number dict; only its "raw" key is read.
        text: The full generated document text.

    Returns:
        True if an occurrence of the number sits next to a word-count
        label, else False.
    """
    # Digit boundaries keep a short number from matching inside a longer
    # one, e.g. "50" inside the "1500" of "(Word count: 1500)".
    pattern = r'(?<!\d)' + re.escape(num["raw"]) + r'(?!\d)'
    for m in re.finditer(pattern, text):
        start = max(0, m.start() - 60)
        context = text[start:m.end() + 20].lower()
        # "total word count" and "word count so far" both contain this.
        if "word count" in context:
            return True
    return False


def is_derived_arithmetic(num, source_text):
    """Tell whether a number equals the sum or difference of two source numbers.

    Examples: 56 = 76-20 and 70 = 41+29, with both operands in the source.
    Only one addition or subtraction of a pair is considered. A number may
    be added to itself, but it is never subtracted from itself. Results are
    compared with a tolerance of 0.01.

    Returns False when num["value"] cannot be parsed as a float.
    """
    try:
        target = float(num["value"])
    except ValueError:
        return False

    # Every distinct integer or decimal literal found in the source.
    operands = {
        float(hit.group(1))
        for hit in re.finditer(r'(\d+(?:\.\d+)?)', source_text)
    }

    def _reaches_target(a, b):
        if abs((a + b) - target) < 0.01:
            return True
        return a != b and abs((a - b) - target) < 0.01

    return any(_reaches_target(a, b) for a in operands for b in operands)


def _tally(rows):
    """Sum per-document counts; return (total, sourced, unsourced, rate)."""
    total = sum(r["total_numbers"] for r in rows)
    sourced = sum(r["in_source"] for r in rows)
    unsourced = sum(r["not_in_source"] for r in rows)
    rate = unsourced / total if total > 0 else 0
    return total, sourced, unsourced, rate


def _print_banner(title):
    """Print a section heading framed by two 60-character rules."""
    print(f"\n{'='*60}")
    print(f"  {title}")
    print(f"{'='*60}")


def main():
    """Run the number-matching analysis over all generated documents.

    Reads "exp081_full_results.json" from this file's directory, extracts
    the numbers of every document under data["findings"]["documents"], and
    classifies each as found or not found in the source text of the
    document's topic. Integer years (see classify_year) and word-count
    annotations are excluded from the totals. Prints per-document lines,
    summaries by tier and by topic, a tier x topic matrix and the list of
    unsourced numbers, then writes the per-document results to
    "exp081_number_match_results.json" in the same directory.

    Raises:
        OSError: If the results file cannot be read or the output written.
        KeyError: If the results JSON lacks an expected key or a document
            names a topic that has no entry in SOURCES.
    """
    base_dir = os.path.dirname(__file__)

    # Load full run results
    results_path = os.path.join(base_dir, "exp081_full_results.json")
    with open(results_path, encoding="utf-8") as f:
        data = json.load(f)

    docs = data["findings"]["documents"]
    print(f"Documents: {len(docs)}")
    print("Method: Programmatic number extraction + source matching")
    print("Zero LLM involvement\n")

    tiers = ("T1_BASIC", "T2_STANDARD", "T3_REFINED", "T4_AGENTIC")
    topics = list(SOURCES)
    all_results = []

    for doc in docs:
        tier = doc["tier"]
        topic = doc["topic"]
        version = doc["version"]
        source = SOURCES[topic]
        text = doc["text"]

        in_source = []
        not_in_source = []
        years = []
        word_counts = []

        for num in extract_numbers(text):
            # Only bare integers can be calendar years; a dollar amount such
            # as "$2,025" (value "2025") is a claim and must be checked.
            if num["type"] == "integer" and classify_year(num["value"]):
                years.append(num)
            elif is_word_count(num, text):
                word_counts.append(num)
            elif number_in_source(num, source):
                in_source.append(num)
            else:
                not_in_source.append(num)

        total = len(in_source) + len(not_in_source)
        unsourced_rate = len(not_in_source) / total if total > 0 else 0

        all_results.append({
            "tier": tier,
            "topic": topic,
            "version": version,
            "total_numbers": total,
            "in_source": len(in_source),
            "not_in_source": len(not_in_source),
            "years_excluded": len(years),
            "word_counts_excluded": len(word_counts),
            "unsourced_rate": round(unsourced_rate, 3),
            "unsourced_details": [
                {"value": n["value"], "type": n["type"],
                 "raw": n["raw"], "context": n["context"]}
                for n in not_in_source
            ],
        })

        print(f"  {tier} | {topic} v{version}: "
              f"{total} numbers ({len(in_source)} sourced, "
              f"{len(not_in_source)} unsourced [{unsourced_rate:.1%}], "
              f"excl {len(years)}yr {len(word_counts)}wc)")

    # === Summary by Tier ===
    _print_banner("BY TIER")
    for tier_name in tiers:
        tier_r = [r for r in all_results if r["tier"] == tier_name]
        if tier_r:
            total, sourced, unsourced, rate = _tally(tier_r)
            print(f"  {tier_name}: {total} numbers — "
                  f"{sourced} sourced, "
                  f"{unsourced} unsourced [{rate:.1%}]")

    # === Summary by Topic ===
    _print_banner("BY TOPIC")
    for topic_key in topics:
        topic_r = [r for r in all_results if r["topic"] == topic_key]
        if topic_r:
            total, sourced, unsourced, rate = _tally(topic_r)
            print(f"  {topic_key}: {total} numbers — "
                  f"{sourced} sourced, "
                  f"{unsourced} unsourced [{rate:.1%}]")

    # === Tier × Topic Matrix ===
    _print_banner("UNSOURCED RATE MATRIX (Tier × Topic)")
    print(f"  {'':20s}" + "".join(f" {t:>14s}" for t in topics))
    for tier_name in tiers:
        row = f"  {tier_name:20s}"
        for topic_key in topics:
            cell = [r for r in all_results
                    if r["tier"] == tier_name and r["topic"] == topic_key]
            if cell:
                total, _, unsourced, rate = _tally(cell)
                label = f"{unsourced}/{total} ({rate:.0%})"
            else:
                # Keep later columns aligned when a combination has no docs.
                label = "-"
            # Same " " + 14-wide layout as the header so columns line up.
            row += f" {label:>14s}"
        print(row)

    # === Unsourced numbers detail ===
    _print_banner("UNSOURCED NUMBERS (all)")
    for r in all_results:
        if r["unsourced_details"]:
            print(f"\n  {r['tier']} | {r['topic']} v{r['version']}:")
            for detail in r["unsourced_details"]:
                print(f"    {detail['raw']:>8s} ({detail['type']:>10s}) "
                      f"  ...{detail['context']}...")

    # === Save ===
    output_path = os.path.join(base_dir, "exp081_number_match_results.json")
    output = {
        "experiment": "EXP-081-number-match",
        "description": (
            "Programmatic number extraction and source matching. "
            "Zero LLM involvement. Extracts every number from generated "
            "documents, checks if it appears in source. "
            "Measures source-grounding rate without verifier bias."
        ),
        "method": "regex extraction + string matching",
        "numbers_excluded": "years (2019-2030), single digits",
        "results": all_results,
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\n  Saved: {output_path}")


# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()