"""
T-418 Strengthening: Trust Signals Inversion

Original: N=1 evaluator, 6 documents.
This: 10 topics, programmatic trust signal measurement, blinded LLM evaluation.

Two phases:
1. GENERATION: 10 topics x 2 conditions (sourced/unsourced) x 3 versions = 60 outputs (xAI)
2. MEASUREMENT: Programmatic trust signal extraction (zero LLM) + blinded trust rating (Gemini)

At runtime the measurement work is reported as two console phases: "PHASE 2"
(programmatic signals) and "PHASE 3" (blinded trust evaluation).
"""

import os
import json
import re
import sys
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _config import get_xai_client, get_gemini_client

# Clients (proxy-routed: master keys never enter this process)
xai_client = get_xai_client()
gemini_client = get_gemini_client()

XAI_MODEL = "grok-4-1-fast"
EVAL_MODEL = "gemini-2.5-flash"
VERSIONS = 3

# Experimental conditions, in the order outputs are generated and evaluated.
CONDITIONS = ("sourced", "unsourced")

# 10 topics with real source data (verifiable statistics)
TOPICS = [
    {
        "name": "remote_work",
        "prompt": "Analyze the impact of remote work on organizational productivity and employee retention.",
        "source": """Remote work data from recent surveys and studies:
- Stanford study (Bloom et al.): remote workers showed 13% performance increase in a 9-month trial at a Chinese travel agency (16,000 employees, 2013-2014).
- Gallup 2023 State of the Workplace: 6 in 10 exclusively remote workers say they are "extremely likely to change companies" if not offered remote flexibility. Engagement highest for hybrid workers (35%) vs fully remote (27%) vs fully on-site (29%).
- US Bureau of Labor Statistics: as of 2024, approximately 27.6% of employed persons worked remotely at least part of the day, down from 35% in 2020 but up from 24% pre-pandemic.
- Microsoft Work Trend Index 2023: 85% of leaders say shift to hybrid work has made it challenging to have confidence employees are being productive. Weekly meeting time for Teams users increased 153% since February 2020.
- Owl Labs State of Remote Work 2023: remote workers report saving an average of 40 minutes per day on commute; 62% of workers feel more productive when working remotely.""",
    },
    {
        "name": "cybersecurity",
        "prompt": "Analyze the current state of enterprise cybersecurity threats and organizational readiness.",
        "source": """Cybersecurity data from industry reports:
- IBM Cost of a Data Breach Report 2024: average cost of a data breach reached $4.88 million globally, a 10% increase over 2023. Healthcare sector averaged $9.77 million per breach. Average time to identify and contain a breach: 258 days.
- Verizon 2024 Data Breach Investigations Report: 68% of breaches involved a non-malicious human element (social engineering or error). Ransomware or extortion involved in 32% of breaches. Median time from access to data exfiltration: less than 2 days.
- Cybersecurity Ventures: global cybercrime costs projected to reach $10.5 trillion annually by 2025. Ransomware attack frequency: one attack every 11 seconds by 2024 estimate.
- ISACA State of Cybersecurity 2024: 48% of organizations experienced more cyberattacks compared to prior year. 71% report unfilled cybersecurity positions. Average time to fill a cybersecurity role: 3-6 months.""",
    },
    {
        "name": "renewable_energy",
        "prompt": "Analyze the trajectory of renewable energy investment and its impact on energy markets.",
        "source": """Renewable energy data from IEA and industry sources:
- IEA World Energy Investment 2024: global clean energy investment reached $1.8 trillion in 2023, nearly double fossil fuel investment ($1 trillion). Solar PV investment alone exceeded $380 billion.
- BloombergNEF: levelized cost of electricity for solar PV dropped 89% between 2010 and 2023 (from $359/MWh to $40/MWh). Onshore wind dropped 69% ($111/MWh to $34/MWh). Battery storage costs dropped 90% since 2010.
- IEA Renewables 2024: renewable capacity additions reached 507 GW in 2023, 50% higher than 2022. Solar PV accounted for three-quarters of additions. China installed as much solar PV in 2023 as the entire world did in 2022.
- US EIA: in 2023, renewables generated 21.4% of US electricity (up from 19.8% in 2022). Natural gas: 43.1%. Coal: 16.2%.""",
    },
    {
        "name": "ai_adoption",
        "prompt": "Analyze the state of AI adoption in enterprise organizations and its measurable business impact.",
        "source": """AI adoption data from McKinsey, Gartner, and surveys:
- McKinsey Global Survey on AI 2024: 72% of organizations have adopted AI in at least one business function, up from 55% in 2023. Generative AI specifically adopted by 65% of respondents, nearly double from 10 months prior.
- Gartner 2024: only 54% of AI projects make it from pilot to production. Average enterprise AI project takes 8-36 months to deploy. Top deployment barriers: data quality (45%), lack of skilled talent (42%), integration complexity (38%).
- Harvard Business Review analysis: companies using AI in customer service reported 13.8% increase in issue resolution per hour. Companies using AI for code generation reported 26% faster task completion but no significant difference in code quality metrics.
- Deloitte State of AI in the Enterprise 2024: average ROI on AI investments was 3.5x, but 47% of organizations report difficulty measuring AI ROI accurately.""",
    },
    {
        "name": "ecommerce",
        "prompt": "Analyze the evolution of e-commerce and its impact on retail business models.",
        "source": """E-commerce data from US Census Bureau and industry reports:
- US Census Bureau Q2 2024: e-commerce accounted for 16.0% of total US retail sales ($291.6 billion in Q2), up from 15.4% year prior. Total retail sales: $1.823 trillion.
- Statista: global e-commerce revenue reached $6.3 trillion in 2023, projected to grow to $8.1 trillion by 2026. Amazon accounted for 37.6% of US e-commerce sales.
- Shopify 2024 Commerce Report: average e-commerce conversion rate: 1.4%. Average cart abandonment rate: 70.19%. Mobile commerce: 60% of global e-commerce traffic but 45% of revenue (conversion gap).
- National Retail Federation: US retail returns totaled $743 billion in 2023 (14.5% of total sales). Online purchase return rate: 17.6% vs in-store: 10.02%.""",
    },
    {
        "name": "saas_metrics",
        "prompt": "Analyze SaaS business model health indicators and what separates top-performing companies.",
        "source": """SaaS metrics data from industry benchmarks:
- OpenView 2024 SaaS Benchmarks: median net revenue retention for top-quartile SaaS companies: 120%. Median for all SaaS: 100%. Bottom quartile: below 85%. NRR above 130% correlates with 2x faster growth.
- Bessemer Cloud Index: median Rule of 40 score for public SaaS companies: 27% (sum of revenue growth rate + free cash flow margin). Top decile: above 55%.
- ProfitWell/Paddle: median SaaS monthly churn rate: 3.2% (monthly revenue churn). Annual gross churn: 31.2% for SMB-focused, 8.5% for enterprise-focused. CAC payback period median: 15 months.
- Gartner: SaaS spending reached $197 billion in 2023, growing 17.9% YoY. Average enterprise uses 130 SaaS applications (up from 80 in 2020). SaaS sprawl drives 25-30% of licenses unused.""",
    },
    {
        "name": "supply_chain",
        "prompt": "Analyze the state of global supply chain resilience and strategies organizations are adopting.",
        "source": """Supply chain data from industry surveys and reports:
- McKinsey Global Institute: companies can expect supply chain disruptions lasting 1-2 months to occur every 3.7 years on average. Disruptions erased nearly 45% of one year's EBITDA over the course of a decade for the average company.
- Gartner Supply Chain Top 25 (2024): 83% of supply chain leaders are investing in supply chain visibility technology. Only 6% report full end-to-end supply chain visibility. Average time to detect a supply disruption: 3-7 days.
- BCI Supply Chain Resilience Report 2024: 72.3% of organizations experienced at least one supply chain disruption in 2023. Cyber attacks and data breaches ranked as top disruption source (38.6%), followed by adverse weather (28.1%).
- Flexport: average global container shipping costs peaked at $10,377 per 40ft container in September 2021, normalized to approximately $1,500-2,000 by late 2023.""",
    },
    {
        "name": "digital_payments",
        "prompt": "Analyze the transformation of digital payment systems and implications for financial services.",
        "source": """Digital payments data from central banks and industry:
- Federal Reserve Payments Study 2023: US noncash payment value reached $128.51 trillion in 2022. Card payments: 211.5 billion transactions. ACH transfers: 30 billion transactions ($80 trillion value).
- McKinsey Global Payments Report 2024: global payments revenue reached $2.4 trillion in 2023, growing 7% YoY. Digital wallets represented 50% of global e-commerce payment value and 30% of POS payment value.
- BIS: 134 central banks (98% of global GDP) are exploring CBDCs. 36 pilots active. 3 fully launched (Bahamas, Jamaica, Nigeria). Adoption rates for launched CBDCs remain below 1% of payment volume.
- FIS Global Payments Report: buy-now-pay-later (BNPL) accounted for 5% of global e-commerce value in 2023. Default rates on BNPL: 3.7% vs credit cards 2.1%.""",
    },
    {
        "name": "cloud_computing",
        "prompt": "Analyze the evolution of cloud computing infrastructure and enterprise adoption patterns.",
        "source": """Cloud computing data from Synergy, Gartner, and providers:
- Synergy Research Group Q1 2024: cloud infrastructure services market reached $76 billion quarterly run rate. AWS market share: 31%. Azure: 25%. Google Cloud: 11%. Combined: 67% of market.
- Gartner: worldwide public cloud spending projected at $679 billion in 2024 (up 20.4% from 2023). IaaS grew fastest at 26.6%. SaaS remained largest segment at $247 billion.
- Flexera 2024 State of the Cloud: 87% of enterprises have a multi-cloud strategy. Average enterprise uses 2.3 public clouds and 2.7 private clouds. 28% of cloud spend is estimated as waste.
- Uptime Institute 2024: average data center PUE (power usage effectiveness) improved to 1.58, down from 2.0 in 2010. Hyperscale operators average 1.2-1.3 PUE.""",
    },
    {
        "name": "talent_retention",
        "prompt": "Analyze the factors driving employee retention and the cost of turnover in knowledge work.",
        "source": """Employee retention data from HR research:
- SHRM 2024: average cost to replace an employee ranges from 50-200% of annual salary depending on role level. For knowledge workers: average 100-150%. Time to full productivity for a new hire: 8-12 months.
- Gallup 2024 State of the Global Workplace: 23% of employees globally are engaged at work. 59% are "quiet quitting" (not engaged). 18% are actively disengaged. Engaged business units show 23% higher profitability.
- LinkedIn 2024 Workforce Report: average employee tenure has declined from 4.1 years in 2020 to 3.5 years in 2024. Top reasons for leaving: compensation (38%), career growth (32%), flexibility (28%), management quality (23%).
- Work Institute 2024 Retention Report: 77.4% of turnover is preventable. Manager quality is the strongest predictor of retention. Employees who receive regular feedback are 3.6x more likely to be engaged.""",
    },
]

# Prompt for the "unsourced" condition: the model gets only the topic.
ANALYTICAL_PROMPT = """Write a 400-600 word analytical briefing on the following topic. Include specific data points, cite sources where relevant, draw conclusions, and make actionable recommendations.

Topic: {prompt}"""

# Prompt for the "sourced" condition: the model gets the topic's source data
# and is told to use only numbers that appear in it.
SOURCED_PROMPT = """The following source data is available for your analysis. Use only numbers from the source material below. If the source doesn't contain a relevant number, make the analytical point without inventing numbers.

{source}

Write a 400-600 word analytical briefing on the following topic. Include specific data points from the source material, cite sources where relevant, draw conclusions, and make actionable recommendations.

Topic: {prompt}"""

# Prompt for the blinded evaluator. It contains only the briefing text, never
# the topic name, condition or version.
EVAL_PROMPT = """Rate the following analytical briefing on TRUSTWORTHINESS. How much would you trust this analysis to inform a business decision?

Rate on a 1-5 scale:
1 = Not trustworthy (vague, unsupported claims, unreliable)
2 = Somewhat untrustworthy (some specific claims but many ungrounded)
3 = Neutral (mix of grounded and ungrounded claims)
4 = Somewhat trustworthy (mostly specific, well-cited, professional)
5 = Very trustworthy (specific data, named sources, clear methodology, confident conclusions)

Respond with ONLY the number (1-5) on the first line, then a one-sentence justification.

---
{text}"""

# --- Trust-signal patterns (compiled once; reused for every output) ---------

# Lead-in phrase followed by a Capitalized name. Case-insensitivity is scoped
# to the lead-in only: a global IGNORECASE would let [A-Z] match lowercase and
# count phrases such as "from the" or "per hour" as citations.
_NAMED_CITATION_RE = re.compile(
    r"\b(?i:according to|per|from|by|reported by|published by|study by"
    r"|survey by|data from|report from)\s+[A-Z][a-zA-Z]+"
)
_PAREN_CITATION_RE = re.compile(r"\([A-Z][a-zA-Z]+.*?\d{4}\)")
_SPECIFIC_NUMBER_RE = re.compile(
    r"\d+\.?\d*\s*(?:%|percent|billion|million|trillion|x\b)", re.IGNORECASE
)
_DOLLAR_AMOUNT_RE = re.compile(
    r"\$\d+[\d,]*(?:\.\d+)?(?:\s*(?:billion|million|trillion))?", re.IGNORECASE
)
_PRECISE_NUMBER_RE = re.compile(r"\d+\.\d{1,}(?:\s*%)?")  # Numbers with decimals
_CONFIDENCE_RE = re.compile(
    r"\b(?:clearly|undoubtedly|certainly|definitely|without doubt|unquestionably"
    r"|inevitably|obviously|significantly|dramatically|fundamentally|critical"
    r"|essential|paramount|imperative)\b",
    re.IGNORECASE,
)
_HEDGE_RE = re.compile(
    r"\b(?:may|might|possibly|perhaps|approximately|roughly|around|about"
    r"|estimated|likely|suggests|appears|seems|could|uncertain|unclear)\b",
    re.IGNORECASE,
)
_NAMED_ENTITY_RE = re.compile(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b")  # Multi-word proper nouns
# Sentence terminator followed (after optional closing quote/bracket/markdown)
# by whitespace or end of text, so the "." inside "4.88" is not a sentence end.
_SENTENCE_END_RE = re.compile(r'[.!?]+(?=[)\]*"\']*(?:\s|$))')

# A lone rating digit 1-5 at the start of the reply (leading punctuation such
# as markdown asterisks is tolerated). "0", "6"-"9" and "10" are rejected.
_SCORE_RE = re.compile(r"\W*([1-5])(?!\d)")

# Signals shown in the Phase 2 comparison table, in display order.
SIGNAL_KEYS = (
    "total_citations",
    "total_numbers",
    "precise_numbers",
    "confidence_markers",
    "hedge_markers",
    "confidence_hedge_ratio",
    "named_entities",
    "word_count",
)


def generate_output(topic, condition, version):
    """Generate one analytical output.

    Args:
        topic: One entry of TOPICS (uses its "prompt" and "source" keys).
        condition: "sourced" selects SOURCED_PROMPT (source data included);
            any other value selects ANALYTICAL_PROMPT.
        version: Replicate number. Not sent to the model; replicates differ
            only through sampling at temperature 1.0.

    Returns:
        The message content of the first completion choice.
    """
    if condition == "sourced":
        prompt = SOURCED_PROMPT.format(source=topic["source"], prompt=topic["prompt"])
    else:
        prompt = ANALYTICAL_PROMPT.format(prompt=topic["prompt"])

    response = xai_client.chat.completions.create(
        model=XAI_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=1.0,
        max_tokens=1500,
    )
    return response.choices[0].message.content


def measure_trust_signals(text):
    """Programmatic trust signal extraction. Zero LLM.

    Counts regex matches for citations, numbers, confidence/hedge wording and
    multi-word capitalized names, plus word and sentence counts.

    Args:
        text: The briefing text to analyse.

    Returns:
        A dict of counts, with the derived totals "total_citations"
        (named + parenthetical), "total_numbers" (specific + dollar) and
        "confidence_hedge_ratio" (confidence count divided by hedge count,
        with the divisor floored at 1 to avoid division by zero).
    """
    named_citations = len(_NAMED_CITATION_RE.findall(text))
    parenthetical_citations = len(_PAREN_CITATION_RE.findall(text))

    specific_numbers = len(_SPECIFIC_NUMBER_RE.findall(text))
    dollar_amounts = len(_DOLLAR_AMOUNT_RE.findall(text))
    precise_numbers = len(_PRECISE_NUMBER_RE.findall(text))

    confidence_phrases = len(_CONFIDENCE_RE.findall(text))
    hedge_phrases = len(_HEDGE_RE.findall(text))

    named_entities = len(_NAMED_ENTITY_RE.findall(text))
    word_count = len(text.split())
    sentence_count = len(_SENTENCE_END_RE.findall(text))

    return {
        "named_citations": named_citations,
        "parenthetical_citations": parenthetical_citations,
        "total_citations": named_citations + parenthetical_citations,
        "specific_numbers": specific_numbers,
        "dollar_amounts": dollar_amounts,
        "precise_numbers": precise_numbers,
        "total_numbers": specific_numbers + dollar_amounts,
        "confidence_markers": confidence_phrases,
        "hedge_markers": hedge_phrases,
        "confidence_hedge_ratio": confidence_phrases / max(hedge_phrases, 1),
        "named_entities": named_entities,
        "word_count": word_count,
        "sentence_count": sentence_count,
    }


def _parse_score(result_text):
    """Return the 1-5 rating that starts result_text, or None if there is none."""
    score_match = _SCORE_RE.match(result_text)
    return int(score_match.group(1)) if score_match else None


def evaluate_trust(text, eval_id):
    """Blinded trust evaluation via Gemini.

    Args:
        text: The briefing to rate. Only this text is placed in the prompt.
        eval_id: Caller-chosen identifier echoed back in the result; it is
            not sent to the evaluator.

    Returns:
        A dict with "eval_id", "score" (int 1-5, or None when the reply does
        not start with a valid rating) and "raw" (the stripped reply text,
        "" when the reply had no content).
    """
    response = gemini_client.chat.completions.create(
        model=EVAL_MODEL,
        messages=[{"role": "user", "content": EVAL_PROMPT.format(text=text)}],
        temperature=0.0,
        max_tokens=100,
    )
    # content can be None (e.g. an empty reply); treat that as an unparseable
    # answer instead of crashing on .strip().
    result_text = (response.choices[0].message.content or "").strip()
    return {"eval_id": eval_id, "score": _parse_score(result_text), "raw": result_text}


def _mean(values):
    """Return the arithmetic mean of values, or 0 for an empty sequence."""
    return sum(values) / len(values) if values else 0


def _scores_for(eval_results, condition, topic_name=None):
    """Collect parsed scores for one condition, optionally limited to one topic."""
    return [
        r["score"]
        for r in eval_results
        if r["condition"] == condition
        and r["score"] is not None
        and (topic_name is None or r["topic"] == topic_name)
    ]


def _generate_all_outputs():
    """Phase 1: generate every topic x condition x version output."""
    all_outputs = []
    for topic in TOPICS:
        for condition in CONDITIONS:
            for v in range(1, VERSIONS + 1):
                print(f" {topic['name']} / {condition} / v{v}...", end=" ", flush=True)
                text = generate_output(topic, condition, v)
                all_outputs.append({
                    "topic": topic["name"],
                    "condition": condition,
                    "version": v,
                    "text": text,
                })
                print(f"done ({len(text.split())} words)")
    return all_outputs


def _print_signal_table(all_outputs):
    """Phase 2 report: per-condition mean of each signal in SIGNAL_KEYS."""
    sourced = [o for o in all_outputs if o["condition"] == "sourced"]
    unsourced = [o for o in all_outputs if o["condition"] == "unsourced"]

    print(f"| {'Signal':<25} | {'Sourced':>12} | {'Unsourced':>12} | {'Direction':>10} |")
    print(f"|{'-'*27}|{'-'*14}|{'-'*14}|{'-'*12}|")
    for key in SIGNAL_KEYS:
        s_avg = _mean([o["signals"][key] for o in sourced])
        u_avg = _mean([o["signals"][key] for o in unsourced])
        if s_avg > u_avg:
            direction = "SOURCED+"
        elif u_avg > s_avg:
            direction = "UNSOURCED+"
        else:
            direction = "EQUAL"
        # Width 12 matches the header and separator columns.
        print(f"| {key:<25} | {s_avg:>12.1f} | {u_avg:>12.1f} | {direction:>10} |")


def _evaluate_all_outputs(all_outputs):
    """Phase 3: rate every output and tag each result with its provenance."""
    eval_results = []
    for i, output in enumerate(all_outputs):
        print(f" Evaluating {i+1}/{len(all_outputs)}...", end=" ", flush=True)
        result = evaluate_trust(output["text"], i)
        result["topic"] = output["topic"]
        result["condition"] = output["condition"]
        result["version"] = output["version"]
        eval_results.append(result)
        print(f"score={result['score']}")
        time.sleep(0.5)  # Rate limiting
    return eval_results


def _print_trust_summary(eval_results):
    """Print overall and per-topic mean trust scores by condition."""
    sourced_scores = _scores_for(eval_results, "sourced")
    unsourced_scores = _scores_for(eval_results, "unsourced")
    s_mean = _mean(sourced_scores)
    u_mean = _mean(unsourced_scores)

    # Only a strictly higher unsourced mean counts as an inversion; a tie or
    # a condition with no parsed scores must not be reported as confirmation.
    if not sourced_scores or not unsourced_scores:
        direction = "UNDETERMINED (no valid scores for at least one condition)"
    elif u_mean > s_mean:
        direction = "UNSOURCED rated higher (inversion confirmed)"
    elif s_mean > u_mean:
        direction = "SOURCED rated higher (no inversion)"
    else:
        direction = "TIE (no inversion)"

    print("\nTrust scores (1-5 scale, higher = more trustworthy):")
    print(f" Sourced: {s_mean:.2f} (N={len(sourced_scores)})")
    print(f" Unsourced: {u_mean:.2f} (N={len(unsourced_scores)})")
    print(f" Direction: {direction}")

    print("\nPer-topic trust scores:")
    for topic in TOPICS:
        s_m = _mean(_scores_for(eval_results, "sourced", topic["name"]))
        u_m = _mean(_scores_for(eval_results, "unsourced", topic["name"]))
        winner = "U" if u_m > s_m else "S" if s_m > u_m else "="
        print(f" {topic['name']:<20}: S={s_m:.1f} U={u_m:.1f} [{winner}]")


def _save_results(all_outputs, eval_results):
    """Write metadata, signals, evaluation results and texts to JSON; return the path."""
    output_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "strengthen_t418_results.json"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({
            "metadata": {
                "generator": XAI_MODEL,
                "evaluator": EVAL_MODEL,
                "date": datetime.now().isoformat(),
                "topics": len(TOPICS),
                "versions": VERSIONS,
                "total_outputs": len(all_outputs),
            },
            # Signals without the bulky text; texts are stored separately below.
            "outputs": [{k: v for k, v in o.items() if k != "text"} for o in all_outputs],
            "eval_results": eval_results,
            "texts": [
                {
                    "topic": o["topic"],
                    "condition": o["condition"],
                    "version": o["version"],
                    "text": o["text"],
                }
                for o in all_outputs
            ],
        }, f, indent=2)
    return output_path


def main():
    """Run generation, signal measurement and blinded evaluation, then save results."""
    print("T-418 Trust Signals Strengthening")
    print(f"Generator: {XAI_MODEL}")
    print(f"Evaluator: {EVAL_MODEL}")
    print(f"Topics: {len(TOPICS)}")
    print(f"Versions per condition: {VERSIONS}")
    print(f"Date: {datetime.now().isoformat()}")
    print()

    # Phase 1: Generation
    print("=== PHASE 1: GENERATION ===\n")
    all_outputs = _generate_all_outputs()
    print(f"\nGenerated {len(all_outputs)} outputs.\n")

    # Phase 2: Programmatic measurement
    print("=== PHASE 2: TRUST SIGNAL MEASUREMENT ===\n")
    for output in all_outputs:
        output["signals"] = measure_trust_signals(output["text"])
    _print_signal_table(all_outputs)

    # Phase 3: Blinded evaluation
    print("\n=== PHASE 3: BLINDED TRUST EVALUATION ===\n")
    eval_results = _evaluate_all_outputs(all_outputs)
    _print_trust_summary(eval_results)

    # Save
    output_path = _save_results(all_outputs, eval_results)
    print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    main()