""" EXP-081f: Source Quality Degradation Test KILL TEST for the operational recipe (source + prohibition = 1.6% fabrication). All prior EXP-081 runs used ideal sources: 2-4KB structured bullet-pointed summaries with explicit numbers. Real analytical work uses partial, sparse, or unstructured sources. If the grounding effect collapses with degraded sources, the recipe has narrow operational scope. Three source conditions: FULL: Complete source (baseline replication from 081b) PARTIAL: ~50% of sections REMOVED — specifically the sections most relevant to the topic prompt. Model must write about productivity when productivity data is missing. SPARSE: Source condensed to 5-6 key bullet points (~200 words). Covers the topic but with minimal detail. Architecture: T3-PROHIBITION only (the operational recipe). 3 topics × 3 source conditions × 2 versions = 18 documents. Measurement: - Number matching against FULL source (not degraded version) → GROUNDED: appears in provided degraded source AND full source → PARAMETRIC: appears in full source but NOT in provided degraded source → FABRICATED: appears in neither - Word count (does model write less with less source?) - Qualitative: does model omit, pivot, or fabricate for uncovered territory? The delta that matters: FULL ≈ 081b baseline (~1.6%) → replication PARTIAL stays <10% → recipe robust to gaps PARTIAL jumps to 30%+ → recipe scope-limited SPARSE stays <10% → recipe robust to density """ import json import os import re import sys import time sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from _config import ( GENERATOR_MODEL, get_generator_client, call_generator, ) SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources") def load_source(filename): path = os.path.join(SOURCE_DIR, filename) with open(path, "r") as f: return f.read() # ============================================================ # Source Degradation Definitions # ============================================================ # PARTIAL: remove the sections most relevant to the topic prompt. # This creates the hardest test: model is asked about X, but source # doesn't cover X. Under prohibition, what happens? PARTIAL_REMOVE = { "remote_work": [ "## Productivity by Location (Task-Based)", "## Manager Productivity Assessment", "## AI Adoption", ], "communication": [ "## Communication Satisfaction", "## Non-Desk vs Desk-Based Gap", "## Workplace Loneliness", ], "ai_workflows": [ "## Trust & Accuracy (THE KEY DATA)", "## Debugging & Productivity", "## Vibe Coding", ], } # SPARSE: hand-crafted minimal summaries (~200 words each) SPARSE_SOURCES = { "remote_work": """# Remote Work Key Data (Owl Labs 2025, N=2,000 US workers) - Workforce: In-office 63%, Hybrid 28%, Remote 9% - 40% would start job hunting if flexibility removed; 5% would quit outright - 37% won't accept a job with no flexible hours - Hybrid saves $37/day when WFH (commute $15, parking $9, meals $31 vs $18 remote) - Average commute: 31 minutes each way - 90% experience workplace stress; 27% feeling burnt out - 77% lost time to technical difficulties in hybrid meetings - 92% haven't changed jobs in 2025; 27% actively seeking - Top reasons for seeking: Better pay (49%), Work-life balance (48%) """, "communication": """# Internal Communication Key Data (Staffbase/YouGov 2025, N=3,574 across 6 countries) - Communication quality satisfaction: 42% (LOWEST workplace factor) - Communication amount satisfaction: 43% - Coworker relationships: 76% satisfied (highest) - 33% cite poor communication as MAJOR leaving factor (total: 63%) - "Excellent" communication → 76% "very likely" to stay - "Poor" communication → 20% "very likely" to stay - Most trusted source: Immediate supervisor (57%) - Primary channel: Email/memos (51%) - Well-informed about changes: only 23% - Vision "very clear": only 20% - Productivity impact of communication: 63% """, "ai_workflows": """# AI Developer Workflows Key Data (Stack Overflow 2025) - 84% of developers use or plan to use AI tools (up from 76% in 2024) - 51% of professional developers use AI tools daily - 60% positive sentiment (DOWN from 70%+ in 2023-2024) - 46% actively distrust AI tool accuracy; only 3% "highly trust" - 76% don't plan to use AI for deployment/monitoring - ChatGPT: 81.7% preference; GitHub Copilot: 67.9% - 75.3% would ask humans when distrusting AI answers - 87% concerned about accuracy; 81% about security/privacy - 72% are NOT vibe coding; only 14.7% actively participate """, } def create_partial_source(full_source, sections_to_remove): """Remove specified sections from a source document.""" lines = full_source.split("\n") result = [] skip = False for line in lines: # Check if this line starts a section to remove should_skip = False for section in sections_to_remove: if line.strip().startswith(section): should_skip = True skip = True break if should_skip: continue # Check if this line starts a NEW section (not one to remove) if skip and line.strip().startswith("## "): # New section — stop skipping if not any(line.strip().startswith(s) for s in sections_to_remove): skip = False if not skip: result.append(line) return "\n".join(result) # ============================================================ # Topics (same as 081b) # ============================================================ TOPICS = { "remote_work": { "source_file": "source_1_remote_work.md", "topic": ( "Remote and hybrid work effectiveness: what the productivity data " "actually shows vs. what companies claim, where task type determines " "optimal location, and why the flexibility-retention link matters " "more than the productivity debate" ), "judge_role": "Engineering manager, 5+ years leading distributed teams", "judge_axis": ( "Actionable diagnosis of what actually changes productivity; " "distinguishing signal from preference" ), "judge_cares": ( "What actually changes productivity by task type; flexibility as " "retention lever vs productivity lever; cost-benefit that includes " "hidden overhead" ), }, "communication": { "source_file": "source_3_communication.md", "topic": ( "Internal communication as organizational infrastructure: why " "communication satisfaction is the lowest-rated workplace factor " "despite having the largest impact on retention, satisfaction, and " "productivity — and what the non-desk worker gap reveals about " "systemic failures" ), "judge_role": ( "Executive coach, 50+ leader engagements, tracks behavioral outcomes" ), "judge_axis": ( "Observable behavioral patterns with measurable downstream effects; " "channel-specific trust data" ), "judge_cares": ( "Observable patterns with measurable effects; channel trust data; " "the gap between communication impact and communication quality; " "non-desk vs desk-based disparities" ), }, "ai_workflows": { "source_file": "source_4_ai_workflows.md", "topic": ( "AI-assisted development workflows: how developers actually use AI " "coding tools, where they trust vs. distrust the output, and what " "the productivity claims miss about debugging overhead and code " "quality trade-offs" ), "judge_role": ( "Staff engineer who's shipped AI-integrated features AND maintained " "them 6+ months" ), "judge_axis": ( "Trust/accuracy trade-offs in production; debugging overhead vs " "generation speed; what code quality metrics actually show" ), "judge_cares": ( "Trust/accuracy trade-offs in production; debugging overhead vs " "generation speed; what code quality metrics actually show" ), }, } VERSIONS = 2 # ============================================================ # T3-PROHIBITION Prompt (exact copy from 081b) # ============================================================ def t3_prohibition(source, topic, judge_role, judge_axis, judge_cares): return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"Write an analytical document (~1000 words) on the following topic:\n\n" f"{topic}\n\n" f"JUDGE: {judge_role} — evaluating for {judge_axis}\n" f" Cares about: {judge_cares}\n\n" f"NUMERICAL CONSTRAINT (ABSOLUTE — violation = automatic failure):\n" f"You may ONLY use specific numbers, percentages, and dollar amounts\n" f"that appear in the SOURCE MATERIAL above. For any quantitative point\n" f"where the source does not provide a number, you MUST use qualitative\n" f"language instead (\"most\", \"a significant minority\", \"substantial\",\n" f"\"roughly half\"). Do NOT invent illustrative numbers. Do NOT pull\n" f"numbers from your training data. If the source says it, cite it.\n" f"If the source doesn't say it, use words, not numbers.\n\n" f"This constraint applies to:\n" f"- Percentages in examples and hypotheticals\n" f"- Dollar amounts in cost analyses\n" f"- Multipliers (e.g., \"1.5x\", \"10x\")\n" f"- Thresholds in falsifiers (use \"significant\" not \"15%\")\n" f"- Industry benchmarks not in source (use \"industry research suggests\")\n\n" f"FAIL (any = failure):\n" f"- Structure survives topic swap = generic.\n" f"- Claims hedge without conditions = uncommitted.\n" f"- Paragraph survives removal = padding.\n" f"- Any number not traceable to the source material above = failure.\n" f"- Position without falsifier.\n\n" f"Must GUARANTEE:\n" f"- Every number in the document appears in the source material.\n" f"- Mechanism for each causal claim: HOW it produces the outcome.\n" f"- Analytical depth achieved through reasoning, not invented specifics.\n\n" f"Write the analysis now. Use markdown with ## headings." ) # ============================================================ # Number Matching (from 081b — measures against FULL source) # ============================================================ def extract_numbers(text): numbers = [] seen = set() for m in re.finditer(r'(\d+(?:\.\d+)?)\s*%', text): val = m.group(1) ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "percentage", m.start()) if key not in seen: seen.add(key) numbers.append({"value": val, "raw": m.group(0), "context": context, "type": "percentage"}) for m in re.finditer(r'\$(\d+(?:\.\d+)?(?:,\d{3})*)', text): val = m.group(1).replace(",", "") ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "dollar", m.start()) if key not in seen: seen.add(key) numbers.append({"value": val, "raw": m.group(0), "context": context, "type": "dollar"}) for m in re.finditer(r'(? 0: print(f" Grounded rate: {total_grounded/total_nums:.1%}") print(f" Parametric rate: {total_parametric/total_nums:.1%}") print(f" Fabricated rate: {total_fabricated/total_nums:.1%}") print(f" Total unsourced: " f"{(total_parametric+total_fabricated)/total_nums:.1%}") print(f" Avg words: {avg_words:.0f}") # Per-topic breakdown for topic_key in TOPICS: topic_results = [r for r in cond_results if r["topic"] == topic_key] t_nums = sum(r["numbers"]["total_numbers"] for r in topic_results) t_ground = sum(r["numbers"]["grounded"] for r in topic_results) t_param = sum(r["numbers"]["parametric"] for r in topic_results) t_fab = sum(r["numbers"]["fabricated"] for r in topic_results) if t_nums > 0: print(f" {topic_key}: {t_ground}G/{t_param}P/{t_fab}F " f"({t_fab/t_nums:.1%} fab, " f"{(t_param+t_fab)/t_nums:.1%} unsourced)") # Key comparison print(f"\n{'='*60}") print(f" KEY COMPARISON: Fabrication Rate by Condition") print(f"{'='*60}") for condition_name in ["FULL", "PARTIAL", "SPARSE"]: cond_results = [r for r in results if r["condition"] == condition_name] total_nums = sum(r["numbers"]["total_numbers"] for r in cond_results) total_fab = sum(r["numbers"]["fabricated"] for r in cond_results) total_param = sum(r["numbers"]["parametric"] for r in cond_results) fab_rate = total_fab / total_nums if total_nums else 0 param_rate = total_param / total_nums if total_nums else 0 unsourced = (total_fab + total_param) / total_nums if total_nums else 0 print(f" {condition_name:8s}: fabricated {fab_rate:5.1%} | " f"parametric {param_rate:5.1%} | " f"total unsourced {unsourced:5.1%}") print() # Parametric details for PARTIAL (the key question) print(f"\n{'='*60}") print(f" PARTIAL: What happens in uncovered territory?") print(f"{'='*60}") partial_results = [r for r in results if r["condition"] == "PARTIAL"] all_parametric = [] all_fabricated = [] for r in partial_results: for d in r["numbers"]["parametric_details"]: d["topic"] = r["topic"] d["version"] = r["version"] all_parametric.append(d) for d in r["numbers"]["fabricated_details"]: d["topic"] = r["topic"] d["version"] = r["version"] all_fabricated.append(d) print(f"\n Parametric numbers (from model memory, not in provided source):") for d in all_parametric: print(f" [{d['topic']} v{d['version']}] {d['value']} ({d['type']})") print(f" Context: {d['context'][:80]}...") print(f"\n Fabricated numbers (in neither provided nor full source):") for d in all_fabricated: print(f" [{d['topic']} v{d['version']}] {d['value']} ({d['type']})") print(f" Context: {d['context'][:80]}...") # Save results output = { "experiment": "EXP-081f-source-degradation", "description": ( "Kill test for source + prohibition recipe. Three source conditions: " "FULL (baseline), PARTIAL (key sections removed), SPARSE (5-6 bullets). " "T3-PROHIBITION architecture. 3-way number classification: grounded " "(in provided source), parametric (in full source but not provided), " "fabricated (in neither)." ), "generator": model, "design": "3 topics × 3 conditions × 2 versions = 18 docs", "partial_sections_removed": PARTIAL_REMOVE, "results": [ { "condition": r["condition"], "topic": r["topic"], "version": r["version"], "word_count": r["word_count"], "source_chars": r["source_chars"], "numbers": r["numbers"], "text": r["text"], } for r in results ], } out_path = os.path.join( os.path.dirname(__file__), "exp081_source_degradation_results.json" ) with open(out_path, "w") as f: json.dump(output, f, indent=2) print(f"\n Saved: {out_path}") if __name__ == "__main__": main()