""" EXP-081b: Prompt Architecture Stress Test Tests whether T3's unsourced-number problem is fixable by prompt design or is a structural model limitation. Three architectures, same topics, same model: T3-CURRENT: Baseline (inline EXTENDS labeling — 37% compliance) T3-PROHIBITION: No unsourced numbers allowed. Qualitative language for gaps. T3-SEPARATED: Two-pass. Generate qualitatively, then insert source numbers. 3 topics × 3 architectures × 2 versions = 18 documents. Measurement: programmatic number matching (zero LLM judgment). The delta tells us where the failure lives: PROHIBITION >> CURRENT → model CAN avoid unsourced numbers (instruction type) SEPARATED >> PROHIBITION → inline monitoring is the problem (architectural fix) Neither works → model limitation """ import json import os import re import sys import time sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from _config import ( GENERATOR_MODEL, get_generator_client, call_generator, format_results, ) SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources") def load_source(filename): path = os.path.join(SOURCE_DIR, filename) with open(path, "r") as f: return f.read() # ============================================================ # Topic Definitions (same as full run) # ============================================================ TOPICS = { "remote_work": { "source_file": "source_1_remote_work.md", "topic": ( "Remote and hybrid work effectiveness: what the productivity data " "actually shows vs. what companies claim, where task type determines " "optimal location, and why the flexibility-retention link matters " "more than the productivity debate" ), "judge_role": "Engineering manager, 5+ years leading distributed teams", "judge_axis": ( "Actionable diagnosis of what actually changes productivity; " "distinguishing signal from preference" ), "judge_cares": ( "What actually changes productivity by task type; flexibility as " "retention lever vs productivity lever; cost-benefit that includes " "hidden overhead" ), }, "communication": { "source_file": "source_3_communication.md", "topic": ( "Internal communication as organizational infrastructure: why " "communication satisfaction is the lowest-rated workplace factor " "despite having the largest impact on retention, satisfaction, and " "productivity — and what the non-desk worker gap reveals about " "systemic failures" ), "judge_role": ( "Executive coach, 50+ leader engagements, tracks behavioral outcomes" ), "judge_axis": ( "Observable behavioral patterns with measurable downstream effects; " "channel-specific trust data" ), "judge_cares": ( "Observable patterns with measurable effects; channel trust data; " "the gap between communication impact and communication quality; " "non-desk vs desk-based disparities" ), }, "ai_workflows": { "source_file": "source_4_ai_workflows.md", "topic": ( "AI-assisted development workflows: how developers actually use AI " "coding tools, where they trust vs. distrust the output, and what " "the productivity claims miss about debugging overhead and code " "quality trade-offs" ), "judge_role": ( "Staff engineer who's shipped AI-integrated features AND maintained " "them 6+ months" ), "judge_axis": ( "Trust/accuracy trade-offs in production; debugging overhead vs " "generation speed; what code quality metrics actually show" ), "judge_cares": ( "Trust/accuracy trade-offs in production; debugging overhead vs " "generation speed; what code quality metrics actually show" ), }, } VERSIONS = 2 # ============================================================ # Three Prompt Architectures # ============================================================ def t3_current(source, topic, judge_role, judge_axis, judge_cares): """T3-CURRENT: Baseline — inline EXTENDS labeling (existing T3).""" return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"Write an analytical document (~1000 words) on the following topic:\n\n" f"{topic}\n\n" f"JUDGE: {judge_role} — evaluating for {judge_axis}\n" f" Cares about: {judge_cares}\n\n" f"FAIL (any = failure):\n" f"- Structure survives topic swap. Headings that work for any document\n" f" in this domain = generic, not shaped.\n" f"- Claims hedge without conditions. \"It depends\" only with explicit\n" f" conditions and consequences. \"On the other hand\" after every claim\n" f" = uncommitted.\n" f"- Paragraph survives removal without changing the argument = padding.\n" f"- Analysis survives context-swap. Generic = failure.\n" f"- Claims presented as established fact without sourcing. If you cite\n" f" a number, name the source. If you can't \u2192 qualitative language.\n" f"- Position without falsifier. If nothing could prove your thesis\n" f" wrong, it's not a thesis.\n\n" f"Must GUARANTEE:\n" f"- Every factual claim traces to the source material OR is explicitly\n" f" labeled EXTENDS (beyond source) with confidence level (H/M/L).\n" f"- Mechanism for each causal claim: HOW it produces the outcome.\n\n" f"Write the analysis now. Use markdown with ## headings." ) def t3_prohibition(source, topic, judge_role, judge_axis, judge_cares): """T3-PROHIBITION: No unsourced numbers allowed. Qualitative for gaps.""" return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"Write an analytical document (~1000 words) on the following topic:\n\n" f"{topic}\n\n" f"JUDGE: {judge_role} — evaluating for {judge_axis}\n" f" Cares about: {judge_cares}\n\n" f"NUMERICAL CONSTRAINT (ABSOLUTE — violation = automatic failure):\n" f"You may ONLY use specific numbers, percentages, and dollar amounts\n" f"that appear in the SOURCE MATERIAL above. For any quantitative point\n" f"where the source does not provide a number, you MUST use qualitative\n" f"language instead (\"most\", \"a significant minority\", \"substantial\",\n" f"\"roughly half\"). Do NOT invent illustrative numbers. Do NOT pull\n" f"numbers from your training data. If the source says it, cite it.\n" f"If the source doesn't say it, use words, not numbers.\n\n" f"This constraint applies to:\n" f"- Percentages in examples and hypotheticals\n" f"- Dollar amounts in cost analyses\n" f"- Multipliers (e.g., \"1.5x\", \"10x\")\n" f"- Thresholds in falsifiers (use \"significant\" not \"15%\")\n" f"- Industry benchmarks not in source (use \"industry research suggests\")\n\n" f"FAIL (any = failure):\n" f"- Structure survives topic swap = generic.\n" f"- Claims hedge without conditions = uncommitted.\n" f"- Paragraph survives removal = padding.\n" f"- Any number not traceable to the source material above = failure.\n" f"- Position without falsifier.\n\n" f"Must GUARANTEE:\n" f"- Every number in the document appears in the source material.\n" f"- Mechanism for each causal claim: HOW it produces the outcome.\n" f"- Analytical depth achieved through reasoning, not invented specifics.\n\n" f"Write the analysis now. Use markdown with ## headings." ) def t3_separated_pass1(source, topic, judge_role, judge_axis, judge_cares): """T3-SEPARATED pass 1: Generate analysis with qualitative language.""" return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"Write an analytical document (~1000 words) on the following topic:\n\n" f"{topic}\n\n" f"JUDGE: {judge_role} — evaluating for {judge_axis}\n" f" Cares about: {judge_cares}\n\n" f"IMPORTANT: This is pass 1 of 2. In this pass, write the analytical\n" f"structure and argument WITHOUT specific numbers. Use placeholder\n" f"markers where data should go:\n" f"- Write [SOURCE: description] where a sourced number belongs\n" f"- Write [QUAL: description] where no source number exists and\n" f" qualitative language should be used\n\n" f"Example: \"[SOURCE: % of workers who would job hunt if flex removed]\n" f"would start looking elsewhere\" or \"[QUAL: productivity variance\n" f"across task types] differs meaningfully by location.\"\n\n" f"Focus on:\n" f"- Argument structure shaped by THIS analysis\n" f"- Causal mechanisms (HOW each effect produces its outcome)\n" f"- Falsifiers (what would prove the thesis wrong)\n" f"- Committed positions, not hedging\n\n" f"Write the skeleton now. Use markdown with ## headings." ) def t3_separated_pass2(source, pass1_output): """T3-SEPARATED pass 2: Insert numbers from source only.""" return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"ANALYTICAL SKELETON (from pass 1):\n{pass1_output}\n\n---\n\n" f"TASK: Fill in the [SOURCE: ...] and [QUAL: ...] placeholders:\n\n" f"For [SOURCE: ...] markers:\n" f"- Find the exact number in the SOURCE MATERIAL above\n" f"- Insert it with a brief source reference\n" f"- If no matching number exists in source, convert to qualitative\n\n" f"For [QUAL: ...] markers:\n" f"- Replace with appropriate qualitative language\n" f"- Do NOT invent a number. Use words: \"most\", \"nearly half\",\n" f" \"a significant minority\", \"substantial\", etc.\n\n" f"CONSTRAINT: Do NOT add any new numbers that are not in the source\n" f"material. If you're unsure whether a number is in the source,\n" f"use qualitative language instead.\n\n" f"Output the completed document. Maintain all analytical structure,\n" f"mechanisms, and falsifiers from pass 1." ) # ============================================================ # Number Matching (from exp081_number_match.py) # ============================================================ def extract_numbers(text): """Extract all numerical values from text.""" numbers = [] seen = set() for m in re.finditer(r'(\d+(?:\.\d+)?)\s*%', text): val = m.group(1) ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "percentage", m.start()) if key not in seen: seen.add(key) numbers.append({"value": val, "raw": m.group(0), "context": context, "type": "percentage"}) for m in re.finditer(r'\$(\d+(?:\.\d+)?(?:,\d{3})*)', text): val = m.group(1).replace(",", "") ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "dollar", m.start()) if key not in seen: seen.add(key) numbers.append({"value": val, "raw": m.group(0), "context": context, "type": "dollar"}) for m in re.finditer(r'(? 0 else 0 return { "total_numbers": total, "in_source": len(in_source), "not_in_source": len(not_in_source), "unsourced_rate": round(rate, 3), "years_excluded": years, "word_counts_excluded": wc, "unsourced_details": [ {"value": n["value"], "type": n["type"], "raw": n["raw"], "context": n["context"]} for n in not_in_source ], } # ============================================================ # Main # ============================================================ def main(): client, model = get_generator_client() print(f"Generator: {model}") print(f"Topics: {len(TOPICS)}") print(f"Architectures: T3-CURRENT, T3-PROHIBITION, T3-SEPARATED") print(f"Versions: {VERSIONS}") print(f"Total documents: {len(TOPICS) * 3 * VERSIONS}") print(f"Measurement: programmatic number matching (zero LLM)") print() sources = {} for topic_key, cfg in TOPICS.items(): sources[topic_key] = load_source(cfg["source_file"]) print(f" Source [{topic_key}]: {len(sources[topic_key])} chars") results = [] for topic_key, cfg in TOPICS.items(): source = sources[topic_key] # --- T3-CURRENT --- print(f"\n{'='*60}") print(f" T3-CURRENT | {topic_key}") print(f"{'='*60}") for v in range(VERSIONS): print(f" v{v+1}...", end=" ", flush=True) t0 = time.time() prompt = t3_current( source, cfg["topic"], cfg["judge_role"], cfg["judge_axis"], cfg["judge_cares"]) text = call_generator(client, model, [{"role": "user", "content": prompt}]) elapsed = time.time() - t0 words = len(text.split()) nm = analyze_numbers(text, source) print(f"done ({words}w, {nm['total_numbers']} nums, " f"{nm['not_in_source']} unsourced [{nm['unsourced_rate']:.1%}], " f"{elapsed:.1f}s)") results.append({ "arch": "T3_CURRENT", "topic": topic_key, "version": v + 1, "word_count": words, "text": text, "numbers": nm, }) time.sleep(3) # --- T3-PROHIBITION --- print(f"\n{'='*60}") print(f" T3-PROHIBITION | {topic_key}") print(f"{'='*60}") for v in range(VERSIONS): print(f" v{v+1}...", end=" ", flush=True) t0 = time.time() prompt = t3_prohibition( source, cfg["topic"], cfg["judge_role"], cfg["judge_axis"], cfg["judge_cares"]) text = call_generator(client, model, [{"role": "user", "content": prompt}]) elapsed = time.time() - t0 words = len(text.split()) nm = analyze_numbers(text, source) print(f"done ({words}w, {nm['total_numbers']} nums, " f"{nm['not_in_source']} unsourced [{nm['unsourced_rate']:.1%}], " f"{elapsed:.1f}s)") results.append({ "arch": "T3_PROHIBITION", "topic": topic_key, "version": v + 1, "word_count": words, "text": text, "numbers": nm, }) time.sleep(3) # --- T3-SEPARATED --- print(f"\n{'='*60}") print(f" T3-SEPARATED | {topic_key}") print(f"{'='*60}") for v in range(VERSIONS): print(f" v{v+1} pass1...", end=" ", flush=True) t0 = time.time() p1_prompt = t3_separated_pass1( source, cfg["topic"], cfg["judge_role"], cfg["judge_axis"], cfg["judge_cares"]) pass1_text = call_generator(client, model, [{"role": "user", "content": p1_prompt}]) elapsed1 = time.time() - t0 print(f"done ({len(pass1_text.split())}w, {elapsed1:.1f}s)") time.sleep(3) print(f" pass2...", end=" ", flush=True) t0 = time.time() p2_prompt = t3_separated_pass2(source, pass1_text) text = call_generator(client, model, [{"role": "user", "content": p2_prompt}]) elapsed2 = time.time() - t0 words = len(text.split()) nm = analyze_numbers(text, source) print(f"done ({words}w, {nm['total_numbers']} nums, " f"{nm['not_in_source']} unsourced [{nm['unsourced_rate']:.1%}], " f"{elapsed2:.1f}s)") results.append({ "arch": "T3_SEPARATED", "topic": topic_key, "version": v + 1, "word_count": words, "text": text, "pass1_text": pass1_text, "numbers": nm, }) time.sleep(3) # ============================================================ # Summary # ============================================================ print(f"\n{'='*60}") print(f" RESULTS: Prompt Architecture Stress Test") print(f" Generator: {model}") print(f"{'='*60}") # By architecture print(f"\n --- By Architecture ---") for arch in ["T3_CURRENT", "T3_PROHIBITION", "T3_SEPARATED"]: arch_r = [r for r in results if r["arch"] == arch] total_nums = sum(r["numbers"]["total_numbers"] for r in arch_r) total_unsrc = sum(r["numbers"]["not_in_source"] for r in arch_r) rate = total_unsrc / total_nums if total_nums > 0 else 0 avg_words = sum(r["word_count"] for r in arch_r) / len(arch_r) print(f" {arch}: {total_nums} numbers, " f"{total_unsrc} unsourced [{rate:.1%}], " f"avg {avg_words:.0f} words") # Matrix: Architecture × Topic print(f"\n --- Unsourced Rate Matrix (Arch × Topic) ---") print(f" {'':20s} {'remote_work':>14s} {'communication':>14s} {'ai_workflows':>14s}") for arch in ["T3_CURRENT", "T3_PROHIBITION", "T3_SEPARATED"]: row = f" {arch:20s}" for topic_key in ["remote_work", "communication", "ai_workflows"]: cell = [r for r in results if r["arch"] == arch and r["topic"] == topic_key] if cell: total = sum(r["numbers"]["total_numbers"] for r in cell) unsrc = sum(r["numbers"]["not_in_source"] for r in cell) rate = unsrc / total if total > 0 else 0 row += f" {unsrc}/{total} ({rate:.0%})".rjust(14) print(row) # Unsourced details for each architecture for arch in ["T3_CURRENT", "T3_PROHIBITION", "T3_SEPARATED"]: arch_r = [r for r in results if r["arch"] == arch] has_unsrc = any(r["numbers"]["unsourced_details"] for r in arch_r) if has_unsrc: print(f"\n --- Unsourced Numbers: {arch} ---") for r in arch_r: if r["numbers"]["unsourced_details"]: print(f" {r['topic']} v{r['version']}:") for d in r["numbers"]["unsourced_details"]: print(f" {d['raw']:>8s} ({d['type']}) " f"...{d['context'][:70]}...") # Save output_path = os.path.join(os.path.dirname(__file__), "exp081_prompt_arch_results.json") output = format_results( findings={ "experiment": "EXP-081b-prompt-architecture", "description": ( "Prompt architecture stress test. Three T3 variants: " "CURRENT (inline EXTENDS labeling), PROHIBITION (no unsourced " "numbers allowed), SEPARATED (two-pass: generate qualitatively, " "then insert source numbers). 3 topics × 3 architectures × " "2 versions = 18 documents. Measured by programmatic number " "matching (zero LLM judgment)." ), "architectures": { "T3_CURRENT": "Inline EXTENDS labeling (baseline)", "T3_PROHIBITION": "Unsourced numbers prohibited, qualitative required", "T3_SEPARATED": "Two-pass: skeleton with placeholders, then source insertion", }, "results": results, }, generator=model, ) with open(output_path, "w") as f: json.dump(output, f, indent=2) print(f"\n Saved: {output_path}") if __name__ == "__main__": main()