""" EXP-081d: Cross-Generator Replication of Prohibition Tests whether the prohibition > monitoring finding (5x, EXP-081b) replicates across model families. If it holds, it's a robust prompt architecture principle. If not, it's a model quirk. Design: 3 topics × 2 architectures (CURRENT, PROHIBITION) × 2 versions = 12 docs Generator: gemini-3-flash-preview (vs grok-4-1-fast in 081b) Measurement: programmatic number matching (zero LLM) Replication target: 081b (xAI): CURRENT 7.7%, PROHIBITION 1.6% (5x improvement) 081d (Gemini): ? """ import json import os import re import sys import time sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from _config import ( get_gemini_client, gemini_evaluate, format_results, ) GEMINI_MODEL = "gemini-3-flash-preview" SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources") def load_source(filename): path = os.path.join(SOURCE_DIR, filename) with open(path, "r") as f: return f.read() # ============================================================ # Topics (same as 081/081b) # ============================================================ TOPICS = { "remote_work": { "source_file": "source_1_remote_work.md", "topic": ( "Remote and hybrid work effectiveness: what the productivity data " "actually shows vs. what companies claim, where task type determines " "optimal location, and why the flexibility-retention link matters " "more than the productivity debate" ), "judge_role": "Engineering manager, 5+ years leading distributed teams", "judge_axis": ( "Actionable diagnosis of what actually changes productivity; " "distinguishing signal from preference" ), "judge_cares": ( "What actually changes productivity by task type; flexibility as " "retention lever vs productivity lever; cost-benefit that includes " "hidden overhead" ), }, "communication": { "source_file": "source_3_communication.md", "topic": ( "Internal communication as organizational infrastructure: why " "communication satisfaction is the lowest-rated workplace factor " "despite having the largest impact on retention, satisfaction, and " "productivity — and what the non-desk worker gap reveals about " "systemic failures" ), "judge_role": ( "Executive coach, 50+ leader engagements, tracks behavioral outcomes" ), "judge_axis": ( "Observable behavioral patterns with measurable downstream effects; " "channel-specific trust data" ), "judge_cares": ( "Observable patterns with measurable effects; channel trust data; " "the gap between communication impact and communication quality; " "non-desk vs desk-based disparities" ), }, "ai_workflows": { "source_file": "source_4_ai_workflows.md", "topic": ( "AI-assisted development workflows: how developers actually use AI " "coding tools, where they trust vs. distrust the output, and what " "the productivity claims miss about debugging overhead and code " "quality trade-offs" ), "judge_role": ( "Staff engineer who's shipped AI-integrated features AND maintained " "them 6+ months" ), "judge_axis": ( "Trust/accuracy trade-offs in production; debugging overhead vs " "generation speed; what code quality metrics actually show" ), "judge_cares": ( "Trust/accuracy trade-offs in production; debugging overhead vs " "generation speed; what code quality metrics actually show" ), }, } VERSIONS = 2 # ============================================================ # Prompts (same as 081b) # ============================================================ def t3_current(source, topic, judge_role, judge_axis, judge_cares): return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"Write an analytical document (~1000 words) on the following topic:\n\n" f"{topic}\n\n" f"JUDGE: {judge_role} — evaluating for {judge_axis}\n" f" Cares about: {judge_cares}\n\n" f"FAIL (any = failure):\n" f"- Structure survives topic swap. Headings that work for any document\n" f" in this domain = generic, not shaped.\n" f"- Claims hedge without conditions. \"It depends\" only with explicit\n" f" conditions and consequences. \"On the other hand\" after every claim\n" f" = uncommitted.\n" f"- Paragraph survives removal without changing the argument = padding.\n" f"- Analysis survives context-swap. Generic = failure.\n" f"- Claims presented as established fact without sourcing. If you cite\n" f" a number, name the source. If you can't \u2192 qualitative language.\n" f"- Position without falsifier. If nothing could prove your thesis\n" f" wrong, it's not a thesis.\n\n" f"Must GUARANTEE:\n" f"- Every factual claim traces to the source material OR is explicitly\n" f" labeled EXTENDS (beyond source) with confidence level (H/M/L).\n" f"- Mechanism for each causal claim: HOW it produces the outcome.\n\n" f"Write the analysis now. Use markdown with ## headings." ) def t3_prohibition(source, topic, judge_role, judge_axis, judge_cares): return ( f"SOURCE MATERIAL:\n{source}\n\n---\n\n" f"Write an analytical document (~1000 words) on the following topic:\n\n" f"{topic}\n\n" f"JUDGE: {judge_role} — evaluating for {judge_axis}\n" f" Cares about: {judge_cares}\n\n" f"NUMERICAL CONSTRAINT (ABSOLUTE — violation = automatic failure):\n" f"You may ONLY use specific numbers, percentages, and dollar amounts\n" f"that appear in the SOURCE MATERIAL above. For any quantitative point\n" f"where the source does not provide a number, you MUST use qualitative\n" f"language instead (\"most\", \"a significant minority\", \"substantial\",\n" f"\"roughly half\"). Do NOT invent illustrative numbers. Do NOT pull\n" f"numbers from your training data. If the source says it, cite it.\n" f"If the source doesn't say it, use words, not numbers.\n\n" f"This constraint applies to:\n" f"- Percentages in examples and hypotheticals\n" f"- Dollar amounts in cost analyses\n" f"- Multipliers (e.g., \"1.5x\", \"10x\")\n" f"- Thresholds in falsifiers (use \"significant\" not \"15%\")\n" f"- Industry benchmarks not in source (use \"industry research suggests\")\n\n" f"FAIL (any = failure):\n" f"- Structure survives topic swap = generic.\n" f"- Claims hedge without conditions = uncommitted.\n" f"- Paragraph survives removal = padding.\n" f"- Any number not traceable to the source material above = failure.\n" f"- Position without falsifier.\n\n" f"Must GUARANTEE:\n" f"- Every number in the document appears in the source material.\n" f"- Mechanism for each causal claim: HOW it produces the outcome.\n" f"- Analytical depth achieved through reasoning, not invented specifics.\n\n" f"Write the analysis now. Use markdown with ## headings." ) # ============================================================ # Number matching (from exp081_number_match.py) # ============================================================ def extract_numbers(text): numbers = [] seen = set() for m in re.finditer(r'(\d+(?:\.\d+)?)\s*%', text): val = m.group(1) ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "percentage", m.start()) if key not in seen: seen.add(key) numbers.append({"value": val, "raw": m.group(0), "context": context, "type": "percentage"}) for m in re.finditer(r'\$(\d+(?:\.\d+)?(?:,\d{3})*)', text): val = m.group(1).replace(",", "") ctx_start = max(0, m.start() - 40) ctx_end = min(len(text), m.end() + 40) context = text[ctx_start:ctx_end].strip() key = (val, "dollar", m.start()) if key not in seen: seen.add(key) numbers.append({"value": val, "raw": m.group(0), "context": context, "type": "dollar"}) for m in re.finditer(r'(? 0 else 0 return { "total_numbers": total, "in_source": len(in_source), "not_in_source": len(not_in_source), "unsourced_rate": round(rate, 3), "years_excluded": years, "word_counts_excluded": wc, "unsourced_details": [ {"value": n["value"], "type": n["type"], "raw": n["raw"], "context": n["context"]} for n in not_in_source ], } # ============================================================ # Main # ============================================================ def main(): client = get_gemini_client() model = GEMINI_MODEL print(f"Generator: {model}") print(f"Topics: {len(TOPICS)}") print(f"Architectures: T3-CURRENT, T3-PROHIBITION") print(f"Versions: {VERSIONS}") print(f"Total documents: {len(TOPICS) * 2 * VERSIONS}") print(f"Measurement: programmatic number matching (zero LLM)") print() # Load sources sources = {} for topic_key, cfg in TOPICS.items(): sources[topic_key] = load_source(cfg["source_file"]) print(f" Source [{topic_key}]: {len(sources[topic_key])} chars") results = [] for topic_key, cfg in TOPICS.items(): source = sources[topic_key] for arch_name, prompt_fn in [("T3_CURRENT", t3_current), ("T3_PROHIBITION", t3_prohibition)]: print(f"\n{'='*60}") print(f" {arch_name} | {topic_key} | {model}") print(f"{'='*60}") for v in range(VERSIONS): print(f" v{v+1}...", end=" ", flush=True) t0 = time.time() prompt = prompt_fn( source, cfg["topic"], cfg["judge_role"], cfg["judge_axis"], cfg["judge_cares"]) text = gemini_evaluate( prompt, model=model, temperature=1.0, max_output_tokens=8192, client=client) if text is None: print(f"FAILED (API error)") continue elapsed = time.time() - t0 words = len(text.split()) nm = analyze_numbers(text, source) print(f"done ({words}w, {nm['total_numbers']} nums, " f"{nm['not_in_source']} unsourced [{nm['unsourced_rate']:.1%}], " f"{elapsed:.1f}s)") results.append({ "arch": arch_name, "topic": topic_key, "version": v + 1, "word_count": words, "text": text, "numbers": nm, }) time.sleep(2) # ============================================================ # Summary # ============================================================ print(f"\n{'='*60}") print(f" RESULTS: Cross-Generator Replication ({model})") print(f"{'='*60}") # By architecture print(f"\n --- By Architecture ---") for arch in ["T3_CURRENT", "T3_PROHIBITION"]: arch_r = [r for r in results if r["arch"] == arch] if not arch_r: print(f" {arch}: NO DATA") continue total_nums = sum(r["numbers"]["total_numbers"] for r in arch_r) total_unsrc = sum(r["numbers"]["not_in_source"] for r in arch_r) rate = total_unsrc / total_nums if total_nums > 0 else 0 avg_words = sum(r["word_count"] for r in arch_r) / len(arch_r) print(f" {arch}: {total_nums} numbers, " f"{total_unsrc} unsourced [{rate:.1%}], " f"avg {avg_words:.0f} words") # Comparison to xAI print(f"\n --- Cross-Generator Comparison ---") print(f" {'':20s} {'xAI (081b)':>14s} {'Gemini (081d)':>14s}") for arch in ["T3_CURRENT", "T3_PROHIBITION"]: arch_r = [r for r in results if r["arch"] == arch] if arch_r: total = sum(r["numbers"]["total_numbers"] for r in arch_r) unsrc = sum(r["numbers"]["not_in_source"] for r in arch_r) rate = unsrc / total if total > 0 else 0 # 081b xAI rates xai_rates = {"T3_CURRENT": "7.7%", "T3_PROHIBITION": "1.6%"} print(f" {arch:20s} {xai_rates.get(arch, '?'):>14s} " f"{rate:.1%}".rjust(14)) # Matrix: Architecture × Topic print(f"\n --- Unsourced Rate Matrix ---") print(f" {'':20s} {'remote_work':>14s} {'communication':>14s} {'ai_workflows':>14s}") for arch in ["T3_CURRENT", "T3_PROHIBITION"]: row = f" {arch:20s}" for topic_key in ["remote_work", "communication", "ai_workflows"]: cell = [r for r in results if r["arch"] == arch and r["topic"] == topic_key] if cell: total = sum(r["numbers"]["total_numbers"] for r in cell) unsrc = sum(r["numbers"]["not_in_source"] for r in cell) rate = unsrc / total if total > 0 else 0 row += f" {unsrc}/{total} ({rate:.0%})".rjust(14) else: row += " N/A".rjust(14) print(row) # Save output_path = os.path.join(os.path.dirname(__file__), "exp081_cross_gen_results.json") output = format_results( findings={ "experiment": "EXP-081d-cross-generator", "description": ( "Cross-generator replication of prohibition > monitoring " "finding. Tests whether EXP-081b's 5x improvement (7.7% → 1.6%) " "holds with gemini-3-flash-preview. Same topics, same prompts, " "same measurement. Only variable: generator model." ), "replication_target": { "xai_current": 0.077, "xai_prohibition": 0.016, }, "results": results, }, generator=model, ) with open(output_path, "w") as f: json.dump(output, f, indent=2) print(f"\n Saved: {output_path}") if __name__ == "__main__": main()