"""
T-418 Strengthening: Trust Signals Inversion
Original: N=1 evaluator, 6 documents. This: 10 topics, programmatic trust signal measurement,
blinded LLM evaluation.

Two stages (the console output labels them as three phases):
1. GENERATION: 10 topics x 2 conditions (sourced/unsourced) x 3 versions = 60 outputs (xAI)
2. MEASUREMENT: Programmatic trust signal extraction (zero LLM; printed as "PHASE 2") + blinded trust rating (Gemini; printed as "PHASE 3")
"""

import os
import json
import re
import sys
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _config import get_xai_client, get_gemini_client

# Clients (proxy-routed: master keys never enter this process)
# Both are created once at import time by helpers from the local _config module.
xai_client = get_xai_client()  # used by generate_output() to produce briefings
gemini_client = get_gemini_client()  # used by evaluate_trust() to rate briefings

# Model identifiers passed as `model=` to the respective chat-completions calls.
XAI_MODEL = "grok-4-1-fast"
EVAL_MODEL = "gemini-2.5-flash"
# Number of independent generations per (topic, condition) pair in main().
VERSIONS = 3

# 10 topics with real source data (verifiable statistics)
# Each entry is a dict with three string keys:
#   "name"   - short identifier; stored with every output/eval result and used
#              for the per-topic breakdown in main().
#   "prompt" - the analysis task; substituted into both prompt templates.
#   "source" - bullet list of statistics; substituted only into SOURCED_PROMPT
#              (the "sourced" condition).
TOPICS = [
    {
        "name": "remote_work",
        "prompt": "Analyze the impact of remote work on organizational productivity and employee retention.",
        "source": """Remote work data from recent surveys and studies:
- Stanford study (Bloom et al.): remote workers showed 13% performance increase in a 9-month trial at a Chinese travel agency (16,000 employees, 2013-2014).
- Gallup 2023 State of the Workplace: 6 in 10 exclusively remote workers say they are "extremely likely to change companies" if not offered remote flexibility. Engagement highest for hybrid workers (35%) vs fully remote (27%) vs fully on-site (29%).
- US Bureau of Labor Statistics: as of 2024, approximately 27.6% of employed persons worked remotely at least part of the day, down from 35% in 2020 but up from 24% pre-pandemic.
- Microsoft Work Trend Index 2023: 85% of leaders say shift to hybrid work has made it challenging to have confidence employees are being productive. Weekly meeting time for Teams users increased 153% since February 2020.
- Owl Labs State of Remote Work 2023: remote workers report saving an average of 40 minutes per day on commute; 62% of workers feel more productive when working remotely.""",
    },
    {
        "name": "cybersecurity",
        "prompt": "Analyze the current state of enterprise cybersecurity threats and organizational readiness.",
        "source": """Cybersecurity data from industry reports:
- IBM Cost of a Data Breach Report 2024: average cost of a data breach reached $4.88 million globally, a 10% increase over 2023. Healthcare sector averaged $9.77 million per breach. Average time to identify and contain a breach: 258 days.
- Verizon 2024 Data Breach Investigations Report: 68% of breaches involved a non-malicious human element (social engineering or error). Ransomware or extortion involved in 32% of breaches. Median time from access to data exfiltration: less than 2 days.
- Cybersecurity Ventures: global cybercrime costs projected to reach $10.5 trillion annually by 2025. Ransomware attack frequency: one attack every 11 seconds by 2024 estimate.
- ISACA State of Cybersecurity 2024: 48% of organizations experienced more cyberattacks compared to prior year. 71% report unfilled cybersecurity positions. Average time to fill a cybersecurity role: 3-6 months.""",
    },
    {
        "name": "renewable_energy",
        "prompt": "Analyze the trajectory of renewable energy investment and its impact on energy markets.",
        "source": """Renewable energy data from IEA and industry sources:
- IEA World Energy Investment 2024: global clean energy investment reached $1.8 trillion in 2023, nearly double fossil fuel investment ($1 trillion). Solar PV investment alone exceeded $380 billion.
- BloombergNEF: levelized cost of electricity for solar PV dropped 89% between 2010 and 2023 (from $359/MWh to $40/MWh). Onshore wind dropped 69% ($111/MWh to $34/MWh). Battery storage costs dropped 90% since 2010.
- IEA Renewables 2024: renewable capacity additions reached 507 GW in 2023, 50% higher than 2022. Solar PV accounted for three-quarters of additions. China installed as much solar PV in 2023 as the entire world did in 2022.
- US EIA: in 2023, renewables generated 21.4% of US electricity (up from 19.8% in 2022). Natural gas: 43.1%. Coal: 16.2%.""",
    },
    {
        "name": "ai_adoption",
        "prompt": "Analyze the state of AI adoption in enterprise organizations and its measurable business impact.",
        "source": """AI adoption data from McKinsey, Gartner, and surveys:
- McKinsey Global Survey on AI 2024: 72% of organizations have adopted AI in at least one business function, up from 55% in 2023. Generative AI specifically adopted by 65% of respondents, nearly double from 10 months prior.
- Gartner 2024: only 54% of AI projects make it from pilot to production. Average enterprise AI project takes 8-36 months to deploy. Top deployment barriers: data quality (45%), lack of skilled talent (42%), integration complexity (38%).
- Harvard Business Review analysis: companies using AI in customer service reported 13.8% increase in issue resolution per hour. Companies using AI for code generation reported 26% faster task completion but no significant difference in code quality metrics.
- Deloitte State of AI in the Enterprise 2024: average ROI on AI investments was 3.5x, but 47% of organizations report difficulty measuring AI ROI accurately.""",
    },
    {
        "name": "ecommerce",
        "prompt": "Analyze the evolution of e-commerce and its impact on retail business models.",
        "source": """E-commerce data from US Census Bureau and industry reports:
- US Census Bureau Q2 2024: e-commerce accounted for 16.0% of total US retail sales ($291.6 billion in Q2), up from 15.4% year prior. Total retail sales: $1.823 trillion.
- Statista: global e-commerce revenue reached $6.3 trillion in 2023, projected to grow to $8.1 trillion by 2026. Amazon accounted for 37.6% of US e-commerce sales.
- Shopify 2024 Commerce Report: average e-commerce conversion rate: 1.4%. Average cart abandonment rate: 70.19%. Mobile commerce: 60% of global e-commerce traffic but 45% of revenue (conversion gap).
- National Retail Federation: US retail returns totaled $743 billion in 2023 (14.5% of total sales). Online purchase return rate: 17.6% vs in-store: 10.02%.""",
    },
    {
        "name": "saas_metrics",
        "prompt": "Analyze SaaS business model health indicators and what separates top-performing companies.",
        "source": """SaaS metrics data from industry benchmarks:
- OpenView 2024 SaaS Benchmarks: median net revenue retention for top-quartile SaaS companies: 120%. Median for all SaaS: 100%. Bottom quartile: below 85%. NRR above 130% correlates with 2x faster growth.
- Bessemer Cloud Index: median Rule of 40 score for public SaaS companies: 27% (sum of revenue growth rate + free cash flow margin). Top decile: above 55%.
- ProfitWell/Paddle: median SaaS monthly churn rate: 3.2% (monthly revenue churn). Annual gross churn: 31.2% for SMB-focused, 8.5% for enterprise-focused. CAC payback period median: 15 months.
- Gartner: SaaS spending reached $197 billion in 2023, growing 17.9% YoY. Average enterprise uses 130 SaaS applications (up from 80 in 2020). SaaS sprawl drives 25-30% of licenses unused.""",
    },
    {
        "name": "supply_chain",
        "prompt": "Analyze the state of global supply chain resilience and strategies organizations are adopting.",
        "source": """Supply chain data from industry surveys and reports:
- McKinsey Global Institute: companies can expect supply chain disruptions lasting 1-2 months to occur every 3.7 years on average. Disruptions erased nearly 45% of one year's EBITDA over the course of a decade for the average company.
- Gartner Supply Chain Top 25 (2024): 83% of supply chain leaders are investing in supply chain visibility technology. Only 6% report full end-to-end supply chain visibility. Average time to detect a supply disruption: 3-7 days.
- BCI Supply Chain Resilience Report 2024: 72.3% of organizations experienced at least one supply chain disruption in 2023. Cyber attacks and data breaches ranked as top disruption source (38.6%), followed by adverse weather (28.1%).
- Flexport: average global container shipping costs peaked at $10,377 per 40ft container in September 2021, normalized to approximately $1,500-2,000 by late 2023.""",
    },
    {
        "name": "digital_payments",
        "prompt": "Analyze the transformation of digital payment systems and implications for financial services.",
        "source": """Digital payments data from central banks and industry:
- Federal Reserve Payments Study 2023: US noncash payment value reached $128.51 trillion in 2022. Card payments: 211.5 billion transactions. ACH transfers: 30 billion transactions ($80 trillion value).
- McKinsey Global Payments Report 2024: global payments revenue reached $2.4 trillion in 2023, growing 7% YoY. Digital wallets represented 50% of global e-commerce payment value and 30% of POS payment value.
- BIS: 134 central banks (98% of global GDP) are exploring CBDCs. 36 pilots active. 3 fully launched (Bahamas, Jamaica, Nigeria). Adoption rates for launched CBDCs remain below 1% of payment volume.
- FIS Global Payments Report: buy-now-pay-later (BNPL) accounted for 5% of global e-commerce value in 2023. Default rates on BNPL: 3.7% vs credit cards 2.1%.""",
    },
    {
        "name": "cloud_computing",
        "prompt": "Analyze the evolution of cloud computing infrastructure and enterprise adoption patterns.",
        "source": """Cloud computing data from Synergy, Gartner, and providers:
- Synergy Research Group Q1 2024: cloud infrastructure services market reached $76 billion quarterly run rate. AWS market share: 31%. Azure: 25%. Google Cloud: 11%. Combined: 67% of market.
- Gartner: worldwide public cloud spending projected at $679 billion in 2024 (up 20.4% from 2023). IaaS grew fastest at 26.6%. SaaS remained largest segment at $247 billion.
- Flexera 2024 State of the Cloud: 87% of enterprises have a multi-cloud strategy. Average enterprise uses 2.3 public clouds and 2.7 private clouds. 28% of cloud spend is estimated as waste.
- Uptime Institute 2024: average data center PUE (power usage effectiveness) improved to 1.58, down from 2.0 in 2010. Hyperscale operators average 1.2-1.3 PUE.""",
    },
    {
        "name": "talent_retention",
        "prompt": "Analyze the factors driving employee retention and the cost of turnover in knowledge work.",
        "source": """Employee retention data from HR research:
- SHRM 2024: average cost to replace an employee ranges from 50-200% of annual salary depending on role level. For knowledge workers: average 100-150%. Time to full productivity for a new hire: 8-12 months.
- Gallup 2024 State of the Global Workplace: 23% of employees globally are engaged at work. 59% are "quiet quitting" (not engaged). 18% are actively disengaged. Engaged business units show 23% higher profitability.
- LinkedIn 2024 Workforce Report: average employee tenure has declined from 4.1 years in 2020 to 3.5 years in 2024. Top reasons for leaving: compensation (38%), career growth (32%), flexibility (28%), management quality (23%).
- Work Institute 2024 Retention Report: 77.4% of turnover is preventable. Manager quality is the strongest predictor of retention. Employees who receive regular feedback are 3.6x more likely to be engaged.""",
    },
]

# Prompt for the "unsourced" condition: the model gets only the topic and is
# asked for data points without being given any source material.
# Placeholder: {prompt} (filled from a TOPICS entry in generate_output()).
ANALYTICAL_PROMPT = """Write a 400-600 word analytical briefing on the following topic. Include specific data points, cite sources where relevant, draw conclusions, and make actionable recommendations.

Topic: {prompt}"""

# Prompt for the "sourced" condition: same task, but the topic's source data is
# prepended and the model is told to use only numbers found in it.
# Placeholders: {source} and {prompt} (both filled from a TOPICS entry).
SOURCED_PROMPT = """The following source data is available for your analysis. Use only numbers from the source material below. If the source doesn't contain a relevant number, make the analytical point without inventing numbers.

{source}

Write a 400-600 word analytical briefing on the following topic. Include specific data points from the source material, cite sources where relevant, draw conclusions, and make actionable recommendations.

Topic: {prompt}"""


def generate_output(topic, condition, version):
    """Request a single analytical briefing from the xAI model.

    Args:
        topic: A TOPICS entry; "prompt" is always read, "source" only when
            the condition is "sourced".
        condition: "sourced" selects SOURCED_PROMPT; any other value selects
            ANALYTICAL_PROMPT.
        version: Replicate number. Not used in the request itself; callers
            use it to label the result.

    Returns:
        The message content of the first choice in the completion response.
    """
    with_source = condition == "sourced"
    template = SOURCED_PROMPT if with_source else ANALYTICAL_PROMPT

    # Only the sourced template has a {source} placeholder, so only look the
    # source text up when it is actually needed.
    fields = {"prompt": topic["prompt"]}
    if with_source:
        fields["source"] = topic["source"]

    user_message = {"role": "user", "content": template.format(**fields)}
    completion = xai_client.chat.completions.create(
        model=XAI_MODEL,
        messages=[user_message],
        temperature=1.0,
        max_tokens=1500,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# Citation lead-ins are matched case-insensitively through the scoped (?i:...)
# group only. The source name that follows must start with a genuine capital:
# a pattern-wide re.IGNORECASE would make [A-Z] match lowercase as well, so
# "by the" or "from remote" would be counted as named citations. The leading
# \b stops "per"/"by" from matching inside words such as "paper" or "standby".
# An optional "the" keeps phrases like "according to the IEA" countable.
_NAMED_CITATION_RE = re.compile(
    r'\b(?i:according to|per|from|by|reported by|published by|study by|survey by|data from|report from)'
    r'(?:\s+(?i:the))?\s+[A-Z][a-zA-Z]+'
)
# "(Name ... 2023)". [^()] keeps the match inside one pair of parentheses so
# "(Gallup) ... (2024)" is not glued together into a single citation.
_PAREN_CITATION_RE = re.compile(r'\([A-Z][a-zA-Z]+[^()]*?\d{4}\)')

_UNIT_NUMBER_RE = re.compile(
    r'\d+\.?\d*\s*(?:%|percent|billion|million|trillion|x\b)', re.IGNORECASE
)
_DOLLAR_AMOUNT_RE = re.compile(
    r'\$\d+[\d,]*(?:\.\d+)?(?:\s*(?:billion|million|trillion))?', re.IGNORECASE
)
_DECIMAL_NUMBER_RE = re.compile(r'\d+\.\d{1,}(?:\s*%)?')

_CONFIDENCE_RE = re.compile(
    r'\b(?:clearly|undoubtedly|certainly|definitely|without doubt|unquestionably|inevitably|obviously|significantly|dramatically|fundamentally|critical|essential|paramount|imperative)\b',
    re.IGNORECASE,
)
_HEDGE_RE = re.compile(
    r'\b(?:may|might|possibly|perhaps|approximately|roughly|around|about|estimated|likely|suggests|appears|seems|could|uncertain|unclear)\b',
    re.IGNORECASE,
)
_MULTIWORD_PROPER_NOUN_RE = re.compile(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b')
# A run of terminators counts as a sentence end only when followed (after any
# closing quote/bracket) by whitespace or end of text, so the dot in "4.88"
# is not counted.
_SENTENCE_END_RE = re.compile(r'[.!?]+(?=["\')\]]*(?:\s|$))')


def measure_trust_signals(text):
    """Programmatic trust signal extraction. Zero LLM.

    Counts surface features of ``text`` with regular expressions only.

    Args:
        text: The briefing to analyse. An empty string yields all-zero counts.

    Returns:
        dict with these keys:
            named_citations: lead-in phrase ("according to", "study by", ...)
                followed by a capitalized name.
            parenthetical_citations: "(Name ... YYYY)" within one pair of
                parentheses.
            total_citations: sum of the two citation counts.
            specific_numbers: numbers followed by %, percent,
                million/billion/trillion, or an "x" multiplier.
            dollar_amounts: "$"-prefixed amounts.
            precise_numbers: numbers containing a decimal point.
            total_numbers: specific_numbers + dollar_amounts, with a figure
                matched by both patterns (e.g. "$4.88 million") counted once.
            confidence_markers / hedge_markers: matches from fixed word lists.
            confidence_hedge_ratio: confidence_markers / max(hedge_markers, 1).
            named_entities: runs of two or more capitalized words.
            word_count: whitespace-separated tokens.
            sentence_count: sentence-ending punctuation runs.
    """
    named_citations = len(_NAMED_CITATION_RE.findall(text))
    parenthetical_citations = len(_PAREN_CITATION_RE.findall(text))

    unit_spans = [m.span() for m in _UNIT_NUMBER_RE.finditer(text)]
    dollar_spans = [m.span() for m in _DOLLAR_AMOUNT_RE.finditer(text)]
    specific_numbers = len(unit_spans)
    dollar_amounts = len(dollar_spans)
    # "$4.88 million" matches both patterns; without this correction the
    # combined total would count that single figure twice.
    double_counted = sum(
        1
        for start, end in unit_spans
        if any(start < d_end and d_start < end for d_start, d_end in dollar_spans)
    )
    precise_numbers = len(_DECIMAL_NUMBER_RE.findall(text))

    confidence_phrases = len(_CONFIDENCE_RE.findall(text))
    hedge_phrases = len(_HEDGE_RE.findall(text))

    named_entities = len(_MULTIWORD_PROPER_NOUN_RE.findall(text))

    word_count = len(text.split())
    sentence_count = len(_SENTENCE_END_RE.findall(text))

    return {
        "named_citations": named_citations,
        "parenthetical_citations": parenthetical_citations,
        "total_citations": named_citations + parenthetical_citations,
        "specific_numbers": specific_numbers,
        "dollar_amounts": dollar_amounts,
        "precise_numbers": precise_numbers,
        "total_numbers": specific_numbers + dollar_amounts - double_counted,
        "confidence_markers": confidence_phrases,
        "hedge_markers": hedge_phrases,
        # max(..., 1) avoids division by zero when no hedges are present.
        "confidence_hedge_ratio": confidence_phrases / max(hedge_phrases, 1),
        "named_entities": named_entities,
        "word_count": word_count,
        "sentence_count": sentence_count,
    }


# A single digit 1-5 that is not part of a longer number: rejects "10", "2023"
# and decimals such as "4.5", but accepts "4.", "4/5", "**4**" or "Score: 4".
_TRUST_SCORE_RE = re.compile(r'(?<![\d.,])([1-5])(?!\d|[.,]\d)')


def parse_trust_score(result_text):
    """Extract the 1-5 rating from an evaluator reply.

    Only the first non-empty line is inspected, because the evaluator is
    instructed to put the number there; digits in the justification that
    follows must not be mistaken for the score.

    Args:
        result_text: Raw reply text; may be None or empty.

    Returns:
        The rating as an int in 1..5, or None when the first line contains no
        standalone digit in that range.
    """
    stripped = (result_text or "").strip()
    if not stripped:
        return None
    first_line = stripped.splitlines()[0]
    found = _TRUST_SCORE_RE.search(first_line)
    return int(found.group(1)) if found else None


def evaluate_trust(text, eval_id):
    """Blinded trust evaluation via Gemini.

    The evaluator prompt contains only the rubric and ``text``; no topic or
    condition label is included.

    Args:
        text: The briefing to rate.
        eval_id: Caller-chosen identifier, echoed back in the result.

    Returns:
        dict with "eval_id", "score" (int 1-5, or None when the reply could
        not be parsed) and "raw" (the stripped reply text, "" if the reply
        had no content).
    """
    eval_prompt = f"""Rate the following analytical briefing on TRUSTWORTHINESS. How much would you trust this analysis to inform a business decision?

Rate on a 1-5 scale:
1 = Not trustworthy (vague, unsupported claims, unreliable)
2 = Somewhat untrustworthy (some specific claims but many ungrounded)
3 = Neutral (mix of grounded and ungrounded claims)
4 = Somewhat trustworthy (mostly specific, well-cited, professional)
5 = Very trustworthy (specific data, named sources, clear methodology, confident conclusions)

Respond with ONLY the number (1-5) on the first line, then a one-sentence justification.

---

{text}"""

    response = gemini_client.chat.completions.create(
        model=EVAL_MODEL,
        messages=[{"role": "user", "content": eval_prompt}],
        temperature=0.0,
        max_tokens=100,
    )

    # Guard against a reply with no content so a single empty response is
    # recorded as an unparsed score instead of raising AttributeError.
    content = response.choices[0].message.content
    result_text = (content or "").strip()
    score = parse_trust_score(result_text)
    return {"eval_id": eval_id, "score": score, "raw": result_text}


def main():
    """Run the full experiment and write the results file.

    Steps:
        1. Generate VERSIONS briefings per topic for the "sourced" and
           "unsourced" conditions.
        2. Attach regex-based trust signals to every output and print a
           sourced-vs-unsourced comparison table of the averages.
        3. Have the evaluator model rate every output, then print overall and
           per-topic mean scores.
        4. Save metadata, signals, eval results and raw texts to
           strengthen_t418_results.json next to this script.

    Only integer scores in 1..5 are aggregated; anything else (including
    unparsed replies) is excluded and reported as such.
    """
    conditions = ("sourced", "unsourced")

    print("T-418 Trust Signals Strengthening")
    print(f"Generator: {XAI_MODEL}")
    print(f"Evaluator: {EVAL_MODEL}")
    print(f"Topics: {len(TOPICS)}")
    print(f"Versions per condition: {VERSIONS}")
    print(f"Date: {datetime.now().isoformat()}")
    print()

    # Phase 1: Generation
    print("=== PHASE 1: GENERATION ===\n")
    all_outputs = []

    for topic in TOPICS:
        for condition in conditions:
            for v in range(1, VERSIONS + 1):
                print(f"  {topic['name']} / {condition} / v{v}...", end=" ", flush=True)
                # A completion without content would otherwise crash on
                # .split() below; record it as an empty text instead.
                text = generate_output(topic, condition, v) or ""
                all_outputs.append({
                    "topic": topic["name"],
                    "condition": condition,
                    "version": v,
                    "text": text,
                })
                print(f"done ({len(text.split())} words)")

    print(f"\nGenerated {len(all_outputs)} outputs.\n")

    # Phase 2: Programmatic measurement
    print("=== PHASE 2: TRUST SIGNAL MEASUREMENT ===\n")

    for output in all_outputs:
        output["signals"] = measure_trust_signals(output["text"])

    # Aggregate by condition
    sourced = [o for o in all_outputs if o["condition"] == "sourced"]
    unsourced = [o for o in all_outputs if o["condition"] == "unsourced"]

    def avg(lst, key):
        vals = [o["signals"][key] for o in lst]
        return sum(vals) / len(vals)

    signal_keys = ["total_citations", "total_numbers", "precise_numbers", "confidence_markers",
                   "hedge_markers", "confidence_hedge_ratio", "named_entities", "word_count"]

    print(f"| {'Signal':<25} | {'Sourced':>12} | {'Unsourced':>12} | {'Direction':>10} |")
    print(f"|{'-'*27}|{'-'*14}|{'-'*14}|{'-'*12}|")
    for key in signal_keys:
        s_avg = avg(sourced, key)
        u_avg = avg(unsourced, key)
        direction = "SOURCED+" if s_avg > u_avg else "UNSOURCED+" if u_avg > s_avg else "EQUAL"
        print(f"| {key:<25} | {s_avg:>10.1f}  | {u_avg:>10.1f}  | {direction:>10} |")

    # Phase 3: Blinded evaluation
    print("\n=== PHASE 3: BLINDED TRUST EVALUATION ===\n")

    eval_results = []
    for i, output in enumerate(all_outputs):
        print(f"  Evaluating {i+1}/{len(all_outputs)}...", end=" ", flush=True)
        result = evaluate_trust(output["text"], i)
        result["topic"] = output["topic"]
        result["condition"] = output["condition"]
        result["version"] = output["version"]
        eval_results.append(result)
        print(f"score={result['score']}")
        time.sleep(0.5)  # Rate limiting

    def valid_scores(results):
        # Keep only ratings on the 1-5 rubric; None and out-of-range values
        # must not be averaged in.
        return [
            r["score"] for r in results
            if isinstance(r["score"], int) and 1 <= r["score"] <= 5
        ]

    def mean_or_zero(vals):
        return sum(vals) / len(vals) if vals else 0

    # Aggregate eval scores
    sourced_scores = valid_scores([r for r in eval_results if r["condition"] == "sourced"])
    unsourced_scores = valid_scores([r for r in eval_results if r["condition"] == "unsourced"])

    s_mean = mean_or_zero(sourced_scores)
    u_mean = mean_or_zero(unsourced_scores)

    # A tie, or a condition with no usable scores, is not evidence of an
    # inversion, so it is reported separately rather than as "confirmed".
    if not sourced_scores or not unsourced_scores:
        verdict = "UNDETERMINED (no valid scores for at least one condition)"
    elif u_mean > s_mean:
        verdict = "UNSOURCED rated higher (inversion confirmed)"
    elif s_mean > u_mean:
        verdict = "SOURCED rated higher (no inversion)"
    else:
        verdict = "TIE (no inversion)"

    excluded = len(eval_results) - len(sourced_scores) - len(unsourced_scores)

    print("\nTrust scores (1-5 scale, higher = more trustworthy):")
    print(f"  Sourced:   {s_mean:.2f} (N={len(sourced_scores)})")
    print(f"  Unsourced: {u_mean:.2f} (N={len(unsourced_scores)})")
    print(f"  Direction: {verdict}")
    if excluded:
        print(f"  Excluded (unparsed or out-of-range scores): {excluded}")

    # Per-topic breakdown
    print("\nPer-topic trust scores:")
    for topic in TOPICS:
        topic_results = [r for r in eval_results if r["topic"] == topic["name"]]
        s_scores = valid_scores([r for r in topic_results if r["condition"] == "sourced"])
        u_scores = valid_scores([r for r in topic_results if r["condition"] == "unsourced"])
        s_m = mean_or_zero(s_scores)
        u_m = mean_or_zero(u_scores)
        if not s_scores or not u_scores:
            winner = "?"  # one side has no usable scores; no comparison possible
        else:
            winner = "U" if u_m > s_m else "S" if s_m > u_m else "="
        print(f"  {topic['name']:<20}: S={s_m:.1f}  U={u_m:.1f}  [{winner}]")

    # Save next to this script; abspath matches how the script directory is
    # resolved at the top of the file.
    output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "strengthen_t418_results.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({
            "metadata": {
                "generator": XAI_MODEL,
                "evaluator": EVAL_MODEL,
                "date": datetime.now().isoformat(),
                "topics": len(TOPICS),
                "versions": VERSIONS,
                "total_outputs": len(all_outputs),
            },
            "outputs": [{k: v for k, v in o.items() if k != "text"} for o in all_outputs],
            "eval_results": eval_results,
            "texts": [{"topic": o["topic"], "condition": o["condition"], "version": o["version"], "text": o["text"]} for o in all_outputs],
        }, f, indent=2)
    print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    main()