"""
Claim extraction pipeline for the evaluation stack.

Extracts specific, concrete claims from LLM-generated analytical documents.
Foundation module for Layers 1-3 of the evaluation stack (EXP-078).

Claim types:
  NUMERICAL — sentences with specific numbers, percentages, dollar amounts
  CAUSAL    — sentences asserting causal relationships
  HEADING   — document section headings

No LLM judgment required. Pure regex extraction.
"""

import re
from collections import defaultdict


# ============================================================
# Sentence Splitting
# ============================================================

def _flush_paragraph(pending, heading, para_idx, results):
    """Emit the sentences of the buffered paragraph and return the next index.

    Joins the buffered lines with single spaces, splits the result on
    sentence boundaries and appends every sentence longer than 30 characters
    to ``results`` as ``(heading, sentence, para_idx)``. The buffer is cleared.
    An empty buffer is a no-op and leaves the index unchanged.
    """
    if not pending:
        return para_idx

    paragraph = ' '.join(pending)
    pending.clear()

    # A boundary is terminal punctuation, whitespace, then a capital letter.
    for sent in re.split(r'(?<=[.!?])\s+(?=[A-Z])', paragraph):
        sent = sent.strip()
        if len(sent) > 30:  # Skip fragments
            results.append((heading, sent, para_idx))

    return para_idx + 1


def split_sentences(text):
    """Split markdown document into (heading, sentence, paragraph_idx) tuples.

    Respects markdown structure: headings are tracked as context, and body
    text is split on sentence boundaries.

    Consecutive body lines are joined into one paragraph before splitting, so
    a sentence that is hard-wrapped across several lines is kept whole rather
    than being cut at the line break. A paragraph ends at a blank line, a
    heading, or the start of a list item; lines that follow a list item
    without a blank line are treated as its continuation. Table rows (lines
    starting with ``|``), horizontal-rule lines and lines inside or delimiting
    a fenced code block are never joined with their neighbours.

    Args:
        text: markdown source as a single string.

    Returns:
        List of ``(heading, sentence, paragraph_idx)`` tuples in document
        order. ``heading`` is the text of the most recent line starting with
        ``#`` (empty string before the first one). ``paragraph_idx`` counts
        paragraphs, list items and standalone lines from 0. Sentences of 30
        characters or fewer are dropped as fragments.
    """
    results = []
    current_heading = ""
    para_idx = 0
    pending = []       # cleaned lines of the paragraph being accumulated
    in_fence = False   # True while between ``` / ~~~ fence markers

    for line in text.split('\n'):
        stripped = line.strip()
        is_heading = stripped.startswith('#')

        # Blank lines and headings both terminate the current paragraph.
        if not stripped or is_heading:
            para_idx = _flush_paragraph(pending, current_heading,
                                        para_idx, results)
            if is_heading:
                current_heading = re.sub(r'^#+\s*', '', stripped)
            continue

        is_fence = stripped.startswith(('```', '~~~'))
        if is_fence:
            in_fence = not in_fence

        # Strip list markers for cleaner sentences
        cleaned = re.sub(r'^[-*•]\s+', '', stripped)
        cleaned = re.sub(r'^\d+\.\s+', '', cleaned)

        # A stripped marker means this line opens a new list item.
        starts_item = cleaned != stripped
        # Lines that must be processed on their own, never merged.
        standalone = (
            is_fence
            or in_fence
            or stripped.startswith('|')
            or re.match(r'^[-*_=]{3,}$', stripped) is not None
        )

        if starts_item or standalone:
            para_idx = _flush_paragraph(pending, current_heading,
                                        para_idx, results)
        pending.append(cleaned)
        if standalone:
            para_idx = _flush_paragraph(pending, current_heading,
                                        para_idx, results)

    # Emit whatever is still buffered when the document ends without a blank.
    _flush_paragraph(pending, current_heading, para_idx, results)
    return results


def extract_headings(text):
    """Collect every markdown heading in the document, in order.

    A heading is any line whose first non-blank character is ``#``. Each
    result is a dict with ``level`` (the number of leading ``#`` characters)
    and ``text`` (the line with those characters and the whitespace after
    them removed). Lines that have no title after the markers are skipped.
    """
    found = []
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        marker = re.match(r'#+', candidate)
        if marker is None:
            continue

        title = re.sub(r'^#+\s*', '', candidate)
        if not title:
            continue

        # The match spans exactly the leading run of '#', so its end offset
        # is the heading depth.
        found.append({'level': marker.end(), 'text': title})
    return found


# ============================================================
# Numerical Claims
# ============================================================

# Each entry is (regex, claim_type). Capture group 1 of every regex is the
# leading numeric literal, which extract_numerical_claims() parses as a float.
# The patterns are applied with re.IGNORECASE, so unit words and suffixes end
# with a word boundary: without it a suffix such as "B" would also match the
# first letter of an unrelated following word.
NUMERICAL_PATTERNS = [
    # "70%", "~70%", "roughly 40 %"
    (r'(?:~|approximately |about |roughly |nearly |over |under )?(\d+(?:\.\d+)?)\s*%',
     'percentage'),
    # "15-20%" (group 1 is the lower bound)
    (r'(\d+(?:\.\d+)?)\s*[-–]\s*(\d+(?:\.\d+)?)\s*%',
     'pct_range'),
    # "$2M", "$1,234.50", "$5 billion". Long suffixes are listed before the
    # single-letter ones so "billion" is not cut short to "b".
    (r'\$\s*(\d+(?:[.,]\d+)*)'
     r'(?:\s*(million|billion|thousand|mn|bn|M|B|K)\b)?',
     'dollar'),
    # "3x", "2.5×" -- accepted before any non-word character or end of text,
    # so a sentence-final "3x." counts while "1920x1080" does not.
    (r'(\d+(?:\.\d+)?)\s*[x×](?!\w)',
     'multiplier'),
    # "10-fold", "3 fold"
    (r'(\d+(?:\.\d+)?)\s*-?\s*fold',
     'multiplier'),
    # "1,200 services", "15 teams"
    (r'(\d+(?:,\d{3})*)\s+(?:companies|firms|teams|organizations|employees|'
     r'engineers|developers|users|customers|tools|platforms|products|projects|'
     r'systems|failures|incidents|outages|services|applications|repositories|'
     r'modules|microservices|endpoints|APIs?|databases?|clusters?|regions?)\b',
     'entity_count'),
    # "3 days", "3-5 days", "18-month" (group 1 is the first number)
    (r'(\d+(?:\.\d+)?)\s*[-–]?\s*(?:\d+(?:\.\d+)?\s*)?'
     r'(?:days?|weeks?|months?|years?|hours?|minutes?|quarters?|sprints?)\b',
     'duration'),
]


def _numeric_value(match):
    """Parse capture group 1 of ``match`` as a float, ignoring commas.

    Returns None when the group is missing or its text is not a valid float.
    """
    try:
        return float(match.group(1).replace(',', ''))
    except (ValueError, IndexError):
        return None


def extract_numerical_claims(text):
    """Find every sentence that contains at least one specific number.

    Each sentence produced by split_sentences() is scanned with every regex
    in NUMERICAL_PATTERNS (case-insensitively); a sentence with no match is
    omitted from the result.

    Returns list of dicts:
        sentence: full sentence text
        heading: section heading context
        numbers: list of {type, value, raw} found in sentence, ordered by
            pattern and then by position; value is None if the matched
            number could not be parsed
        position: paragraph index
    """
    claims = []

    for heading, sent, pos in split_sentences(text):
        hits = [
            {
                'type': num_type,
                'value': _numeric_value(m),
                'raw': m.group(0).strip(),
            }
            for pattern, num_type in NUMERICAL_PATTERNS
            for m in re.finditer(pattern, sent, re.IGNORECASE)
        ]
        if not hits:
            continue

        claims.append({
            'sentence': sent,
            'heading': heading,
            'numbers': hits,
            'position': pos,
        })

    return claims


# ============================================================
# Causal Claims
# ============================================================

# Connectives that introduce the cause of something already stated.
# "since" is skipped when a digit follows ("since 2019" is temporal).
_CAUSE_CONNECTIVES = [
    r'\bbecause\b',
    r'\bsince\b(?!\s+\d)',
    r'\bdue to\b',
    r'\bowing to\b',
    r'\bas a result of\b',
    r'\bcaused? by\b',
    r'\bdriven by\b',
]

# Verbs that point from a cause to its effect.
_EFFECT_VERBS = [
    r'\bleads? to\b',
    r'\bresults? in\b',
    r'\bcauses?\b',
    r'\bproduces?\b',
    r'\bgenerates?\b',
    r'\btriggers?\b',
]

# Adverbs that mark a conclusion drawn from what precedes.
_INFERENCE_ADVERBS = [
    r'\bconsequently\b',
    r'\btherefore\b',
    r'\bthus\b',
    r'\bhence\b',
]

# Phrases that name a cause or attribute responsibility explicitly.
_ATTRIBUTION_PHRASES = [
    r'\bthe (?:primary|main|key|root|fundamental|core|underlying|central) '
    r'(?:cause|reason|driver|factor|mechanism|force)\b',
    r'\b(?:directly|indirectly) (?:causes?|leads? to|results? in|drives?)\b',
    r'\bis responsible for\b',
    r'\baccounts? for\b',
]

# Verbs that strengthen, weaken or block an outcome. "compounds" only counts
# when followed by "the", and that word is part of the matched text.
_MODULATING_VERBS = [
    r'\benables?\b',
    r'\bprevents?\b',
    r'\binhibits?\b',
    r'\bfacilitates?\b',
    r'\bexacerbates?\b',
    r'\bcompounds?\b(?:\s+the)',
    r'\bamplifies?\b',
    r'\breinforces?\b',
    r'\bundermines?\b',
    r'\berodes?\b',
]

# Order matters: the markers are OR-ed into a single alternation, which tries
# them left to right at each position.
CAUSAL_MARKERS = [
    *_CAUSE_CONNECTIVES,
    *_EFFECT_VERBS,
    *_INFERENCE_ADVERBS,
    *_ATTRIBUTION_PHRASES,
    *_MODULATING_VERBS,
]


def extract_causal_claims(text):
    """Find every sentence that contains at least one causal marker.

    All CAUSAL_MARKERS are OR-ed into one alternation and searched
    case-insensitively in each sentence produced by split_sentences();
    sentences without a match are omitted.

    Returns list of dicts:
        sentence: full sentence text
        heading: section heading context
        markers: list of causal markers found, stripped and lower-cased,
            in order of appearance
        position: paragraph index
    """
    sentences = split_sentences(text)
    marker_alternation = '|'.join(CAUSAL_MARKERS)
    claims = []

    for heading, sent, pos in sentences:
        hits = re.findall(marker_alternation, sent, re.IGNORECASE)
        if not hits:
            continue

        # findall yields plain strings only while the pattern has no
        # capturing groups; anything else is ignored.
        markers = [hit.strip().lower() for hit in hits if isinstance(hit, str)]
        claims.append({
            'sentence': sent,
            'heading': heading,
            'markers': markers,
            'position': pos,
        })

    return claims


# ============================================================
# Combined Extraction
# ============================================================

def extract_all(text):
    """Run every extractor over the document and bundle the results.

    Returns dict:
        headings: list of heading dicts
        numerical: list of numerical claim dicts
        causal: list of causal claim dicts
        stats: summary counts -- n_headings, n_numerical, n_causal,
            n_unique_percentages, and percentage_values (the distinct
            parsed 'percentage' values in ascending order)
    """
    headings = extract_headings(text)
    numerical = extract_numerical_claims(text)
    causal = extract_causal_claims(text)

    # Distinct percentage figures across all numerical claims.
    unique_pcts = {
        num['value']
        for claim in numerical
        for num in claim['numbers']
        if num['type'] == 'percentage' and num['value'] is not None
    }

    stats = {
        'n_headings': len(headings),
        'n_numerical': len(numerical),
        'n_causal': len(causal),
        'n_unique_percentages': len(unique_pcts),
        'percentage_values': sorted(unique_pcts),
    }

    return {
        'headings': headings,
        'numerical': numerical,
        'causal': causal,
        'stats': stats,
    }


# ============================================================
# Cross-version Comparison (for Temporal Consistency)
# ============================================================

def _values_by_version(version_claims_list, num_type=None):
    """Collect the distinct numeric values found in each version.

    Returns one set per version, in input order. With ``num_type`` given the
    sets hold the bare values of that type; with ``None`` they hold
    ``(type, value)`` pairs for every type. Entries whose value is ``None``
    are skipped.
    """
    per_version = []
    for v_data in version_claims_list:
        found = set()
        for claim in v_data['numerical']:
            for n in claim['numbers']:
                if n['value'] is None:
                    continue
                if num_type is None:
                    found.add((n['type'], n['value']))
                elif n['type'] == num_type:
                    found.add(n['value'])
        per_version.append(found)
    return per_version


def _version_counts(value_sets):
    """Map each value to the number of sets (versions) that contain it."""
    counts = {}
    for values in value_sets:
        for value in values:
            counts[value] = counts.get(value, 0) + 1
    return counts


def _first_context_per_version(contexts):
    """Keep only the first context recorded for each version index."""
    seen = set()
    picked = []
    for ctx in contexts:
        if ctx['version'] not in seen:
            seen.add(ctx['version'])
            picked.append(ctx)
    return picked


def _coefficient_of_variation(vals):
    """Return population std-dev divided by mean, or 0.0 if undefined."""
    if not vals or max(vals) == 0:
        return 0.0
    mean = sum(vals) / len(vals)
    if mean == 0:
        return 0.0
    variance = sum((v - mean) ** 2 for v in vals) / len(vals)
    return (variance ** 0.5) / mean


def compare_numerical_across_versions(version_claims_list):
    """Compare numerical claims across multiple generations of same document.

    A value is "stable" when it occurs in every version and "variable" when
    it occurs in at least one version but not all of them.

    Args:
        version_claims_list: list of extract_all() results, one per version

    Returns dict:
        percentage_stability: {str(value): number of versions containing it}
        stable_percentages / variable_percentages: sorted value lists
        n_pct_stable / n_pct_variable / n_pct_total: their sizes
        fabrication_rate_pct: variable / total percentages as a fraction,
            or None when no percentage was found
        dollar_stable / dollar_variable, n_dollar_stable / n_dollar_variable:
            the same split for dollar amounts
        all_numbers_stable / all_numbers_variable / all_numbers_total:
            counts over (type, value) pairs of every numeric type
        all_numbers_fab_rate: variable / total over all pairs, or None
        heading_jaccard: shared / total distinct headings (lower-cased),
            rounded to 3 places; 1.0 when there are no headings
        n_shared_headings, n_total_headings, shared_heading_texts
        numerical_counts / causal_counts: per-version claim counts taken
            from each version's 'stats'
        numerical_count_cv / causal_count_cv: coefficient of variation of
            those counts, rounded to 3 places
        examples: {'stable': [...], 'variable': [...]} with up to 3 stable
            and 5 variable percentages. Stable examples carry one context
            per version; variable examples carry every context plus
            'in_versions', the sorted version indices containing the value.
    """
    n_versions = len(version_claims_list)

    # --- Percentage stability ---
    stability = _version_counts(
        _values_by_version(version_claims_list, 'percentage'))
    stable = sorted(v for v, c in stability.items() if c == n_versions)
    variable = sorted(v for v, c in stability.items() if c < n_versions)
    total = len(stability)

    # Every occurrence of each percentage, kept in version order, so the
    # examples below can show where a value appeared.
    pct_contexts = defaultdict(list)
    for v_idx, v_data in enumerate(version_claims_list):
        for claim in v_data['numerical']:
            for n in claim['numbers']:
                if n['type'] == 'percentage' and n['value'] is not None:
                    pct_contexts[n['value']].append({
                        'version': v_idx,
                        'sentence': claim['sentence'][:250],
                        'heading': claim['heading'],
                    })

    # --- Dollar amount stability ---
    dollar_counts = _version_counts(
        _values_by_version(version_claims_list, 'dollar'))
    dollar_stable = sorted(
        v for v, c in dollar_counts.items() if c == n_versions)
    dollar_variable = sorted(
        v for v, c in dollar_counts.items() if c < n_versions)

    # --- All numbers (any type) ---
    all_counts = _version_counts(_values_by_version(version_claims_list))
    all_nums_total = len(all_counts)
    all_nums_stable = sum(1 for c in all_counts.values() if c == n_versions)
    all_nums_variable = sum(1 for c in all_counts.values() if c < n_versions)

    # --- Heading overlap (Jaccard) ---
    heading_sets = [
        {h['text'].lower().strip() for h in v_data['headings']}
        for v_data in version_claims_list
    ]
    if heading_sets:
        all_headings = set().union(*heading_sets)
        shared_headings = set.intersection(*heading_sets)
        heading_jaccard = (
            len(shared_headings) / len(all_headings) if all_headings else 1.0
        )
    else:
        heading_jaccard = 1.0
        shared_headings = set()
        all_headings = set()

    # --- Claim count stability (coefficient of variation) ---
    num_counts = [v['stats']['n_numerical'] for v in version_claims_list]
    causal_counts = [v['stats']['n_causal'] for v in version_claims_list]

    # --- Examples ---
    examples = {
        # One context per version: a value repeated inside an early version
        # must not crowd out the later versions.
        'stable': [
            {
                'value': f'{val}%',
                'contexts': _first_context_per_version(pct_contexts[val]),
            }
            for val in stable[:3]
        ],
        'variable': [
            {
                'value': f'{val}%',
                'in_versions': sorted(
                    {ctx['version'] for ctx in pct_contexts[val]}),
                'contexts': pct_contexts[val],
            }
            for val in variable[:5]
        ],
    }

    return {
        'percentage_stability': {str(v): c for v, c in sorted(stability.items())},
        'stable_percentages': stable,
        'variable_percentages': variable,
        'n_pct_stable': len(stable),
        'n_pct_variable': len(variable),
        'n_pct_total': total,
        'fabrication_rate_pct': len(variable) / total if total > 0 else None,
        'dollar_stable': dollar_stable,
        'dollar_variable': dollar_variable,
        'n_dollar_stable': len(dollar_stable),
        'n_dollar_variable': len(dollar_variable),
        'all_numbers_stable': all_nums_stable,
        'all_numbers_variable': all_nums_variable,
        'all_numbers_total': all_nums_total,
        'all_numbers_fab_rate': (
            all_nums_variable / all_nums_total if all_nums_total else None
        ),
        'heading_jaccard': round(heading_jaccard, 3),
        'n_shared_headings': len(shared_headings),
        'n_total_headings': len(all_headings),
        'shared_heading_texts': sorted(shared_headings),
        'numerical_counts': num_counts,
        'causal_counts': causal_counts,
        'numerical_count_cv': round(_coefficient_of_variation(num_counts), 3),
        'causal_count_cv': round(_coefficient_of_variation(causal_counts), 3),
        'examples': examples,
    }


# ============================================================
# Self-test
# ============================================================

def _self_test():
    """Run the extractors on a sample document and verify the output.

    Prints the extracted headings, numerical claims, causal claims and
    summary stats, then checks that a few claims known to be in the sample
    were found. Exits with a non-zero status and a message listing the
    failed checks instead of reporting success unconditionally.
    """
    test_doc = """# Why API Design Decisions Become Permanent

## Backward compatibility multiplies constraints

Every deployed client is a standing invariant. Approximately 70% of API
breaking changes originate from semantic drift rather than syntactic changes.
The cost of maintaining backward compatibility grows 3x with each major
version, reaching $2M annually for mid-size platforms.

## Implicit assumptions create fragility

Clients rely on behaviors not present in schemas. Because response ordering
is often assumed but never guaranteed, this leads to subtle failures that
cascade across 15-20 dependent services.

## Testing costs grow nonlinearly

Proving compatibility requires testing across combinations. Therefore, teams
that maintain 5+ API versions spend roughly 40% of engineering time on
compatibility testing alone.
"""

    result = extract_all(test_doc)
    print(f"Headings: {len(result['headings'])}")
    for h in result['headings']:
        print(f"  [{h['level']}] {h['text']}")

    print(f"\nNumerical claims: {len(result['numerical'])}")
    for c in result['numerical']:
        nums = ', '.join(f"{n['type']}={n['raw']}" for n in c['numbers'])
        print(f"  [{c['heading'][:40]}] {nums}")
        print(f"    \"{c['sentence'][:120]}...\"")

    print(f"\nCausal claims: {len(result['causal'])}")
    for c in result['causal']:
        print(f"  markers: {c['markers']}")
        print(f"    \"{c['sentence'][:120]}...\"")

    print(f"\nStats: {result['stats']}")

    # Explicit checks rather than assert statements, so the self-test still
    # verifies something when Python runs with -O.
    dollar_values = [
        n['value']
        for c in result['numerical']
        for n in c['numbers']
        if n['type'] == 'dollar'
    ]
    causal_markers = [m for c in result['causal'] for m in c['markers']]
    checks = [
        ('4 headings found', result['stats']['n_headings'] == 4),
        ('40% extracted', 40.0 in result['stats']['percentage_values']),
        ('$2M extracted', 2.0 in dollar_values),
        ("'leads to' marker found", 'leads to' in causal_markers),
    ]
    failed = [label for label, ok in checks if not ok]
    if failed:
        raise SystemExit(f"Self-test FAILED: {', '.join(failed)}")

    print("\nSelf-test PASSED.")


if __name__ == '__main__':
    _self_test()