#!/usr/bin/env python3
"""
EXP-078b: Cross-Generator Temporal Consistency Replication

Replicates the Layer 2 temporal consistency test across two additional
generator families to test whether the 100% fabrication finding is universal.

Original finding (gemini-3.1-flash-lite-preview, 20 topics):
    100% fabrication rate on model-generated percentages
    0.000 heading Jaccard across versions

This replication adds:
    - xAI (grok-4-1-fast) — vault's current default generator
    - Gemini 3 Flash (gemini-3-flash-preview) — stronger Gemini model
      (within-family test)

Design:
    10 topics × 3 STANDARD versions × 2 generators = 60 documents
     5 topics × 2 BASIC versions × 2 generators = 20 documents
    Total: 80 generations

Topics: matched subset from cross_gen_docs.json (overlap with original 20)
Same prompts as fpr_estimation.py. Same analysis pipeline (claim_extraction.py).
No LLM evaluation at any point — pure regex extraction + set comparison.

Kill signals (pre-registered):
    KS1: <3 numerical claims per document → insufficient signal density
    KS2: No discrimination between STANDARD and BASIC → doesn't separate conditions
    KS3: All claims stable (<10% fabrication) → no fabrication to detect
    KS4: All claims variable (>90% fabrication) → no gradient to exploit
    KS5: Generator fabrication rates within 5% of each other → no gradient
         across generators
"""

import json
import os
import re
import statistics
import sys
import time
from collections import defaultdict
from functools import partial
from pathlib import Path

# Make sibling experiment modules importable when run as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent))

from _config import (
    get_xai_client,
    get_gemini_client,
    GENERATOR_MODEL,
)
from claim_extraction import extract_all, compare_numerical_across_versions

DATA_DIR = Path(__file__).parent
DOCS_CACHE = DATA_DIR / "exp078b_cross_gen_temporal_docs.json"
RESULTS_PATH = DATA_DIR / "exp078b_cross_gen_temporal_results.json"
ORIGINAL_RESULTS = DATA_DIR / "exp078_temporal_consistency_results.json"

N_SAMPLES_STANDARD = 3
N_SAMPLES_BASIC = 2
# The BASIC condition is generated for the first N_BASIC_TOPICS entries of TOPICS.
N_BASIC_TOPICS = 5
# Generator labels, in the order they are generated and analyzed.
GENERATOR_LABELS = ("xai", "gemini3f")
# A response must be strictly longer than this (after stripping) to be accepted.
MIN_RESPONSE_CHARS = 100

# 10 topics matching cross_gen_docs.json for direct comparison
TOPICS = [
    {"id": "business_strategy", "domain": "business strategy",
     "task": "Analyze why mid-stage B2B SaaS companies (Series B, $10-30M ARR) "
             "consistently fail to transition from founder-led sales to a scalable "
             "sales organization, and what the structural causes are."},
    {"id": "technical_architecture", "domain": "technical architecture",
     "task": "Analyze why microservice architectures degrade into distributed "
             "monoliths within 18-24 months at companies with fewer than 50 engineers, "
             "and what architectural decisions cause this."},
    {"id": "research_analysis", "domain": "research analysis",
     "task": "Analyze why randomized controlled trials in educational technology "
             "consistently show smaller effect sizes than quasi-experimental studies, "
             "and what this reveals about the mechanisms of ed-tech interventions."},
    {"id": "platform_migration", "domain": "engineering management",
     "task": "Why platform migrations consistently take 3x longer than estimated and "
             "what structural factors make them resistant to better planning"},
    {"id": "pricing_psychology", "domain": "product strategy",
     "task": "How B2B SaaS companies systematically underprice their products and the "
             "cognitive biases that prevent correction"},
    {"id": "ml_deployment", "domain": "machine learning engineering",
     "task": "Why most ML models that work in notebooks fail in production and the "
             "specific engineering gaps responsible"},
    {"id": "remote_culture", "domain": "organizational design",
     "task": "How remote-first companies develop distinct failure modes in decision-making "
             "that co-located teams don't experience"},
    {"id": "api_design", "domain": "software architecture",
     "task": "Why backward-compatible API evolution is structurally harder than breaking "
             "changes and the design principles that make it tractable"},
    {"id": "data_governance", "domain": "data engineering",
     "task": "Why data quality initiatives fail even with executive support and what "
             "makes the problem structurally different from code quality"},
    {"id": "supply_chain_resilience", "domain": "operations management",
     "task": "Why lean supply chains that optimized for efficiency pre-2020 proved "
             "catastrophically fragile and what structural trade-offs between efficiency "
             "and resilience are genuinely unavoidable"},
]


# ================================================================
# PROMPTS — identical to fpr_estimation.py
# ================================================================

def build_standard_prompt(topic):
    """Return the STANDARD-condition prompt for *topic*.

    Reads ``topic["task"]`` and ``topic["domain"]`` and wraps them in the
    SHAPED / COMMITTED / DENSE requirements text.
    """
    return (
        f"Write an analytical document (~1000 words) on the following topic:\n\n"
        f"{topic['task']}\n\n"
        f"Requirements:\n"
        f"- SHAPED: Your document structure must be shaped by THIS specific analysis. "
        f"FAIL: your headings survive topic swap. If someone could paste these headings "
        f"onto a different {topic['domain']} document and they'd still fit, you wrote "
        f"generic structure, not shaped analysis. Each heading should name the specific "
        f"mechanism, finding, or claim it covers.\n"
        f"- COMMITTED: Take a clear position. Don't hedge with 'it depends' or 'there are "
        f"many factors.' Name the specific causes and commit to your diagnosis.\n"
        f"- DENSE: Every section must earn its place. If removing a paragraph doesn't change "
        f"the argument, that paragraph shouldn't exist.\n\n"
        f"Write the analysis now. Use markdown with ## headings."
    )


def build_basic_prompt(topic):
    """Return the BASIC-condition prompt for *topic* (task text only, no requirements)."""
    return (
        f"Write an analysis of the following topic (~1000 words):\n\n"
        f"{topic['task']}\n\n"
        f"Write the analysis now. Use markdown with ## headings."
    )


# ================================================================
# GENERATION
# ================================================================

GEMINI_GEN_MODEL = "gemini-3-flash-preview"

# Matches rate-limit style errors. The word boundary matters: a plain
# substring test for "rate" also matches "generate" / "generation", which
# would misclassify ordinary API errors as rate limits.
_RATE_LIMIT_RE = re.compile(r"\brate\b|rate[\s_-]?limit", re.IGNORECASE)


def _is_rate_limit_error(err_str):
    """Return True if the error text looks like an HTTP 429 / rate-limit error."""
    return "429" in err_str or _RATE_LIMIT_RE.search(err_str) is not None


def _generate_with_retries(request_fn, max_retries, rate_limit_wait):
    """Call *request_fn* until it yields a usable response or retries run out.

    Args:
        request_fn: zero-argument callable that performs one API request and
            returns the response text (``None`` / empty is treated as a failure).
        max_retries: maximum number of attempts.
        rate_limit_wait: base wait in seconds after a rate-limit error; other
            errors use a base of 10 seconds. The wait grows linearly with the
            attempt number.

    Returns:
        The stripped response text, or ``None`` if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            text = (request_fn() or "").strip()
            if len(text) > MIN_RESPONSE_CHARS:
                return text
            # Routed through the handler below so short responses are retried too.
            raise ValueError("Empty or too short response")
        except Exception as e:  # retry boundary: any API/client error is retried
            err_str = str(e)
            is_rate = _is_rate_limit_error(err_str)
            wait = (rate_limit_wait if is_rate else 10) * (attempt + 1)
            print(f" attempt {attempt+1}/{max_retries} failed: "
                  f"{'RATE LIMIT' if is_rate else 'ERROR'} — {err_str[:80]}")
            if attempt < max_retries - 1:
                print(f" waiting {wait}s...")
                time.sleep(wait)
    print(f" FAILED after {max_retries} attempts")
    return None


def generate_with_xai(client, model, prompt, max_retries=5):
    """Generate using xAI (OpenAI-compatible API).

    Returns the stripped completion text, or ``None`` after *max_retries*
    failed attempts.
    """
    messages = [{"role": "user", "content": prompt}]

    def request():
        resp = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=4096,
            temperature=1.0,
        )
        return resp.choices[0].message.content

    return _generate_with_retries(request, max_retries, rate_limit_wait=60)


def generate_with_gemini(client, model, prompt, max_retries=5):
    """Generate using Gemini API.

    Returns the stripped response text, or ``None`` after *max_retries*
    failed attempts.
    """
    from google.genai import types

    def request():
        resp = client.models.generate_content(
            model=model,
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=1.0,
                max_output_tokens=4096,
            ),
        )
        return resp.text

    return _generate_with_retries(request, max_retries, rate_limit_wait=30)


def _save_docs(documents):
    """Write the document cache to DOCS_CACHE.

    Writes to a temporary sibling file and then renames it over the cache, so
    an interrupted run never leaves a truncated JSON file behind.
    """
    tmp_path = DOCS_CACHE.with_name(DOCS_CACHE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(documents, f, indent=2)
    os.replace(tmp_path, DOCS_CACHE)


def generate_all_documents():
    """Generate documents for all generators. Resumes from cache if available.

    Returns:
        List of document dicts (cached plus newly generated). Each has the
        keys topic_id, domain, task, condition, generator, generator_model,
        sample_idx, text, word_count, heading_count.
    """
    documents = []
    if DOCS_CACHE.exists():
        with open(DOCS_CACHE, encoding="utf-8") as f:
            documents = json.load(f)
        print(f"Loaded {len(documents)} cached documents")

    # Track what's already generated
    existing = {
        (d["generator"], d["condition"], d["topic_id"], d.get("sample_idx", 0))
        for d in documents
    }

    # Build clients once
    xai_client = get_xai_client()
    gemini_client = get_gemini_client()

    generators = [
        {
            "label": "xai",
            "model": GENERATOR_MODEL,  # grok-4-1-fast
            "gen_fn": partial(generate_with_xai, xai_client, GENERATOR_MODEL),
            "delay": 3,
        },
        {
            "label": "gemini3f",
            "model": GEMINI_GEN_MODEL,  # gemini-3-flash-preview
            "gen_fn": partial(generate_with_gemini, gemini_client, GEMINI_GEN_MODEL),
            "delay": 1,
        },
    ]

    # (condition, topics, versions per topic, prompt builder)
    plan = [
        ("STANDARD", TOPICS, N_SAMPLES_STANDARD, build_standard_prompt),
        ("BASIC", TOPICS[:N_BASIC_TOPICS], N_SAMPLES_BASIC, build_basic_prompt),
    ]
    total_expected = (
        sum(len(topics) * n_samples for _, topics, n_samples, _ in plan)
        * len(generators)
    )
    # Position within the plan. Cached entries advance it as they are visited,
    # so it must start at zero (not at the cache size) or they'd be counted twice.
    gen_count = 0

    for gen_info in generators:
        label = gen_info["label"]
        model = gen_info["model"]
        gen_fn = gen_info["gen_fn"]
        delay = gen_info["delay"]

        print(f"\n{'='*60}")
        print(f"GENERATOR: {label} ({model})")
        print(f"{'='*60}")

        for condition, topics, n_samples, build_prompt in plan:
            for topic in topics:
                for sample_idx in range(n_samples):
                    gen_count += 1
                    progress = (f" [{gen_count}/{total_expected}] "
                                f"{condition} {topic['id']} #{sample_idx+1}...")
                    key = (label, condition, topic["id"], sample_idx)
                    if key in existing:
                        print(f"{progress} CACHED")
                        continue

                    print(progress, end=" ", flush=True)
                    text = gen_fn(build_prompt(topic))
                    if text:
                        wc = len(text.split())
                        hc = sum(
                            1 for line in text.split('\n')
                            if line.strip().startswith('##')
                        )
                        print(f"OK ({wc}w, {hc}h)")
                        documents.append({
                            "topic_id": topic["id"],
                            "domain": topic["domain"],
                            "task": topic["task"],
                            "condition": condition,
                            "generator": label,
                            "generator_model": model,
                            "sample_idx": sample_idx,
                            "text": text,
                            "word_count": wc,
                            "heading_count": hc,
                        })
                        existing.add(key)
                        # Persist right away so a crash mid-run loses at most
                        # the in-flight document.
                        _save_docs(documents)
                    else:
                        print("FAILED")
                    # Pause between API calls.
                    time.sleep(delay)

        # Save after each generator completes
        _save_docs(documents)
        print(f" Saved {len(documents)} documents (incremental)")

    print(f"\nTotal generated: {len(documents)}/{total_expected} documents")
    return documents


# ================================================================
# ANALYSIS — same pipeline as exp078_temporal_consistency.py
# ================================================================

def analyze_generator(docs, generator_label):
    """Run temporal consistency analysis for one generator's documents.

    Documents are grouped by (topic_id, condition). A group is analyzed only
    when it has at least the designed number of versions for its condition
    (N_SAMPLES_STANDARD / N_SAMPLES_BASIC). Groups are routed by their
    *condition*, not by how many documents they contain, so a STANDARD topic
    that lost a version to a failed generation is skipped with a warning
    instead of being reported as a BASIC result.

    Args:
        docs: document dicts as produced by ``generate_all_documents``.
        generator_label: only documents whose ``generator`` equals this are used.

    Returns:
        ``(standard_results, basic_results)`` — dicts keyed by
        ``"{topic_id}_{condition}"`` holding whatever
        ``compare_numerical_across_versions`` returns for that group.
    """
    # Group by (topic_id, condition)
    groups = defaultdict(list)
    for doc in docs:
        if doc["generator"] == generator_label:
            groups[(doc["topic_id"], doc["condition"])].append(doc)

    required_versions = {
        "STANDARD": N_SAMPLES_STANDARD,
        "BASIC": N_SAMPLES_BASIC,
    }
    results = {"STANDARD": {}, "BASIC": {}}

    for (topic_id, condition), doc_group in sorted(groups.items()):
        required = required_versions.get(condition)
        if required is None:
            print(f" WARNING: {generator_label} {topic_id}: "
                  f"unknown condition {condition!r} — skipped")
            continue
        if len(doc_group) < required:
            print(f" WARNING: {generator_label} {topic_id}_{condition}: "
                  f"{len(doc_group)}/{required} versions — skipped")
            continue

        ordered = sorted(doc_group, key=lambda d: d.get("sample_idx", 0))
        version_data = [extract_all(doc["text"]) for doc in ordered]
        comparison = compare_numerical_across_versions(version_data)
        results[condition][f"{topic_id}_{condition}"] = comparison

    return results["STANDARD"], results["BASIC"]


def compute_aggregate(results):
    """Compute aggregate stats from per-topic results.

    Args:
        results: dict of per-topic comparison dicts. Each must provide
            ``fabrication_rate_pct`` and ``all_numbers_fab_rate`` (either may
            be ``None``), ``heading_jaccard``, ``n_pct_total`` and
            ``all_numbers_total``.

    Returns:
        Dict of summary statistics; a key is present only when there was data
        to compute it from (empty dict for empty *results*).
    """
    per_topic = list(results.values())
    pct_rates = [r["fabrication_rate_pct"] for r in per_topic
                 if r["fabrication_rate_pct"] is not None]
    all_rates = [r["all_numbers_fab_rate"] for r in per_topic
                 if r["all_numbers_fab_rate"] is not None]
    heading_j = [r["heading_jaccard"] for r in per_topic]
    pct_totals = [r["n_pct_total"] for r in per_topic]
    all_totals = [r["all_numbers_total"] for r in per_topic]

    agg = {}
    if pct_rates:
        agg["pct_fab_rate_mean"] = round(statistics.mean(pct_rates), 4)
        agg["pct_fab_rate_median"] = round(statistics.median(pct_rates), 4)
        agg["pct_fab_rate_range"] = [round(min(pct_rates), 4),
                                     round(max(pct_rates), 4)]
        agg["n_topics_with_pct"] = len(pct_rates)
    if all_rates:
        agg["all_fab_rate_mean"] = round(statistics.mean(all_rates), 4)
    if heading_j:
        agg["mean_heading_jaccard"] = round(statistics.mean(heading_j), 3)
    if pct_totals:
        agg["mean_pct_per_topic"] = round(statistics.mean(pct_totals), 1)
    if all_totals:
        agg["mean_all_nums_per_topic"] = round(statistics.mean(all_totals), 1)
    return agg


def print_generator_results(gen_label, std_results, bas_results, agg):
    """Print results for one generator.

    Prints one line per STANDARD topic, then the aggregate (if any), then one
    line per BASIC topic (if any).
    """
    print(f"\n{'='*60}")
    print(f"GENERATOR: {gen_label}")
    print(f"{'='*60}")

    print(f"\nSTANDARD (n={len(std_results)} topics):")
    for key, r in sorted(std_results.items()):
        fab = r["fabrication_rate_pct"]
        fab_str = f"{fab:.0%}" if fab is not None else "N/A (no pcts)"
        all_fab = r["all_numbers_fab_rate"]
        all_str = f"{all_fab:.0%}" if all_fab is not None else "N/A"
        print(f" {key}: pct_fab={fab_str} all_fab={all_str} "
              f"heading_J={r['heading_jaccard']:.3f} "
              f"nums={r['numerical_counts']}")

    if agg:
        print("\n Aggregate:")
        for k, v in agg.items():
            print(f" {k}: {v}")

    if bas_results:
        print(f"\nBASIC (n={len(bas_results)} topics):")
        for key, r in sorted(bas_results.items()):
            fab = r["fabrication_rate_pct"]
            fab_str = f"{fab:.0%}" if fab is not None else "N/A"
            print(f" {key}: pct_fab={fab_str} "
                  f"heading_J={r['heading_jaccard']:.3f}")


def load_original_gemini_results():
    """Load original Gemini results for comparison.

    Returns:
        ``{"aggregate": ..., "per_topic": {topic_id: fabrication_rate_pct}}``
        read from ORIGINAL_RESULTS, or ``None`` if that file does not exist.
    """
    if not ORIGINAL_RESULTS.exists():
        print(" (original results file not found — skipping comparison)")
        return None

    with open(ORIGINAL_RESULTS, encoding="utf-8") as f:
        data = json.load(f)

    # Extract per-topic fabrication rates for matched topics
    matched = {}
    for key, r in data.get("standard_results", {}).items():
        rate = r.get("fabrication_rate_pct")
        if rate is not None:
            matched[key.replace("_STANDARD", "")] = rate

    return {
        "aggregate": data.get("aggregate", {}),
        "per_topic": matched,
    }


# ================================================================
# MAIN — phase helpers
# ================================================================

def _print_section(title):
    """Print a blank line followed by a 70-column section banner."""
    print()
    print("=" * 70)
    print(title)
    print("=" * 70)


def _analyze_all_generators(docs):
    """Phase 2: analyze and print results for each generator that has documents."""
    all_generator_results = {}
    for gen_label in GENERATOR_LABELS:
        gen_docs = [d for d in docs if d["generator"] == gen_label]
        if not gen_docs:
            print(f"\nWARNING: No documents for {gen_label}")
            continue

        std_results, bas_results = analyze_generator(docs, gen_label)
        std_agg = compute_aggregate(std_results)
        print_generator_results(gen_label, std_results, bas_results, std_agg)

        all_generator_results[gen_label] = {
            "generator_model": gen_docs[0]["generator_model"],
            "n_standard_topics": len(std_results),
            "n_basic_topics": len(bas_results),
            "standard_results": std_results,
            "basic_results": bas_results,
            "aggregate": std_agg,
        }
    return all_generator_results


def _build_comparison(all_generator_results, gemini_orig):
    """Build the per-generator summary rows, original Gemini run first if loaded."""
    generators_compared = {}
    if gemini_orig:
        gemini_agg = gemini_orig["aggregate"]
        generators_compared["gemini (original)"] = {
            "model": "gemini-3.1-flash-lite-preview",
            "pct_fab_mean": gemini_agg.get("standard_pct_fab_rate_mean"),
            "all_fab_mean": gemini_agg.get("standard_all_fab_rate_mean"),
            "heading_jaccard": gemini_agg.get("mean_heading_jaccard"),
            "n_topics": 20,  # the original experiment used 20 topics
        }

    for gen_label, gen_data in all_generator_results.items():
        agg = gen_data["aggregate"]
        generators_compared[gen_label] = {
            "model": gen_data["generator_model"],
            "pct_fab_mean": agg.get("pct_fab_rate_mean"),
            "all_fab_mean": agg.get("all_fab_rate_mean"),
            "heading_jaccard": agg.get("mean_heading_jaccard"),
            "n_topics": gen_data["n_standard_topics"],
        }
    return generators_compared


def _print_comparison_table(generators_compared):
    """Print the cross-generator summary table."""
    print(f"\n{'Generator':<25} {'Model':<30} {'Pct Fab':<10} "
          f"{'All Fab':<10} {'Head J':<10} {'N':<5}")
    print("-" * 90)
    for gen, data in generators_compared.items():
        pct = f"{data['pct_fab_mean']:.1%}" if data['pct_fab_mean'] is not None else "N/A"
        all_f = f"{data['all_fab_mean']:.1%}" if data['all_fab_mean'] is not None else "N/A"
        hj = f"{data['heading_jaccard']:.3f}" if data['heading_jaccard'] is not None else "N/A"
        print(f"{gen:<25} {data['model']:<30} {pct:<10} {all_f:<10} {hj:<10} {data['n_topics']}")


def _standard_rate(all_generator_results, gen_label, topic_id):
    """Return the STANDARD pct fabrication rate for one generator/topic, or None."""
    gen_data = all_generator_results.get(gen_label)
    if gen_data is None:
        return None
    result = gen_data["standard_results"].get(f"{topic_id}_STANDARD", {})
    return result.get("fabrication_rate_pct")


def _print_per_topic_comparison(gemini_orig, all_generator_results):
    """Print per-topic STANDARD fabrication rates next to the original Gemini run."""
    def fmt(rate):
        return f"{rate:.0%}" if rate is not None else "—"

    print("\nPer-topic fabrication rates (matched topics, STANDARD):")
    print(f"{'Topic':<25} {'Gemini(orig)':<12} {'xAI':<12} {'Gemini3F':<12}")
    print("-" * 60)
    for topic in TOPICS:
        tid = topic["id"]
        gem_str = fmt(gemini_orig["per_topic"].get(tid))
        xai_str = fmt(_standard_rate(all_generator_results, "xai", tid))
        g3f_str = fmt(_standard_rate(all_generator_results, "gemini3f", tid))
        print(f"{tid:<25} {gem_str:<12} {xai_str:<12} {g3f_str:<12}")


def _assess_kill_signals(all_generator_results, generators_compared):
    """Phase 4: print the KS1/KS3/KS4 assessment per generator, then KS5.

    NOTE: KS2 (STANDARD vs BASIC discrimination) is pre-registered in the
    module docstring but is not assessed automatically here.
    """
    for gen_label, gen_data in all_generator_results.items():
        agg = gen_data["aggregate"]
        print(f"\n{gen_label} ({gen_data['generator_model']}):")

        # KS1: Signal density
        mean_pct = agg.get("mean_pct_per_topic", 0)
        mean_all = agg.get("mean_all_nums_per_topic", 0)
        ks1_status = "CONCERN" if (mean_pct < 3 and mean_all < 5) else "PASS"
        print(f" KS1 — Signal density: {ks1_status} (mean pct/topic={mean_pct}, "
              f"all nums/topic={mean_all})")

        # KS3: Fabrication detectable?
        fab = agg.get("pct_fab_rate_mean")
        if fab is None:
            print(" KS3 — Fabrication: NO DATA (no percentage claims found)")
        elif fab < 0.10:
            print(f" KS3 — Fabrication: KILL SIGNAL (<10% fab rate: {fab:.1%})")
        elif fab < 0.20:
            print(f" KS3 — Fabrication: WEAK (10-20% fab rate: {fab:.1%})")
        else:
            print(f" KS3 — Fabrication: PASS ({fab:.0%} fabrication)")

        # KS4: Gradient exists?
        fab_range = agg.get("pct_fab_rate_range")
        if fab_range:
            if fab_range[0] > 0.90:
                print(f" KS4 — Gradient: KILL (all >90%, range {fab_range[0]:.0%}-{fab_range[1]:.0%})")
            else:
                print(f" KS4 — Gradient: PASS (range {fab_range[0]:.0%}-{fab_range[1]:.0%})")

    # KS5: Cross-generator gradient
    fab_rates = {g: d.get("pct_fab_mean") for g, d in generators_compared.items()
                 if d.get("pct_fab_mean") is not None}
    if len(fab_rates) >= 2:
        vals = list(fab_rates.values())
        spread = max(vals) - min(vals)
        print("\nKS5 — Cross-generator gradient:")
        print(f" Rates: {', '.join(f'{g}={v:.1%}' for g, v in fab_rates.items())}")
        print(f" Spread: {spread:.1%}")
        if spread < 0.05:
            print(" >>> KILL: <5% spread — fabrication rate is generator-invariant")
            print(" >>> Interpretation: fabrication is universal (STRONG for convergence thesis)")
        elif spread < 0.20:
            print(" >>> MODERATE gradient — fabrication varies but is present across generators")
        else:
            print(" >>> LARGE gradient — fabrication is generator-specific (WEAKENS convergence)")


def _print_verdict(all_generator_results):
    """Phase 5: print per-generator threshold lines and the overall verdict."""
    universal = True
    for gen_label, gen_data in all_generator_results.items():
        fab = gen_data["aggregate"].get("pct_fab_rate_mean")
        if fab is None:
            universal = False
            print(f" {gen_label}: No percentage data — CANNOT ASSESS")
        elif fab < 0.50:
            universal = False
            print(f" {gen_label}: {fab:.0%} fabrication — BELOW 50% threshold")
        else:
            print(f" {gen_label}: {fab:.0%} fabrication — ABOVE 50% threshold")

    if not all_generator_results:
        # Without this guard an empty run would vacuously report "universal".
        print("\n NO DATA: no generator produced analyzable documents — no verdict.")
    elif universal:
        print("\n UNIVERSAL FABRICATION: All generators show majority fabrication.")
        print(" The convergence thesis strengthens — constrained generation")
        print(" produces fabricated specificity regardless of model family.")
    else:
        print("\n GENERATOR-SPECIFIC: Fabrication rate varies significantly.")
        print(" The convergence thesis needs revision — the mechanism is")
        print(" not universal but model-family-dependent.")


def _save_results(all_generator_results, generators_compared):
    """Write the full results JSON to RESULTS_PATH."""
    output = {
        "experiment": "EXP-078b_cross_generator_temporal_consistency",
        "method": "Cross-generation numerical claim variance (same as EXP-078 Layer 2)",
        "generators": {
            gen: {
                "model": data["generator_model"],
                "n_standard_topics": data["n_standard_topics"],
                "n_basic_topics": data["n_basic_topics"],
                "aggregate": data["aggregate"],
                "standard_results": data["standard_results"],
                "basic_results": data["basic_results"],
            }
            for gen, data in all_generator_results.items()
        },
        "cross_generator_comparison": generators_compared,
        "n_topics": len(TOPICS),
        "n_standard_samples": N_SAMPLES_STANDARD,
        "n_basic_samples": N_SAMPLES_BASIC,
        "prompts_source": "fpr_estimation.py (identical)",
        "analysis_pipeline": "claim_extraction.py (identical)",
        "llm_evaluation": "NONE — pure regex extraction + set comparison",
    }

    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
        # default=str: fall back to str() for any value json cannot encode.
        json.dump(output, f, indent=2, default=str)
    print(f"\nResults saved to {RESULTS_PATH.name}")


def main():
    """Run the experiment: generate, analyze, compare, assess, and save results."""
    n_generators = len(GENERATOR_LABELS)

    print("=" * 70)
    print("EXP-078b: CROSS-GENERATOR TEMPORAL CONSISTENCY REPLICATION")
    print("=" * 70)
    print(f"Generators: xAI ({GENERATOR_MODEL}), Gemini 3 Flash ({GEMINI_GEN_MODEL})")
    print(f"Topics: {len(TOPICS)}")
    print(f"STANDARD: {len(TOPICS)} × {N_SAMPLES_STANDARD} versions × {n_generators} generators "
          f"= {len(TOPICS) * N_SAMPLES_STANDARD * n_generators}")
    print(f"BASIC: {N_BASIC_TOPICS} × {N_SAMPLES_BASIC} versions × {n_generators} generators "
          f"= {N_BASIC_TOPICS * N_SAMPLES_BASIC * n_generators}")
    print("No LLM evaluation — pure regex extraction + set comparison")
    print()

    # === Phase 1: Generate documents ===
    docs = generate_all_documents()

    # === Phase 2: Analyze each generator ===
    all_generator_results = _analyze_all_generators(docs)

    # === Phase 3: Cross-generator comparison ===
    _print_section("CROSS-GENERATOR COMPARISON")
    gemini_orig = load_original_gemini_results()
    generators_compared = _build_comparison(all_generator_results, gemini_orig)
    _print_comparison_table(generators_compared)
    # Per-topic comparison (matched topics only)
    if gemini_orig:
        _print_per_topic_comparison(gemini_orig, all_generator_results)

    # === Phase 4: Kill signal assessment ===
    _print_section("KILL SIGNAL ASSESSMENT")
    _assess_kill_signals(all_generator_results, generators_compared)

    # === Phase 5: Verdict ===
    _print_section("VERDICT")
    _print_verdict(all_generator_results)

    # === Save results ===
    _save_results(all_generator_results, generators_compared)


if __name__ == "__main__":
    main()