"""
EXP-081c: Commensurable Bridge

Closes the measurement gap between EXP-048-080 (temporal consistency) and
EXP-081 (source matching) by applying BOTH measurements to the SAME
documents under BOTH conditions (source-present, source-absent).

Design:
  3 topics × 2 conditions × 3 versions = 18 documents (T3-CURRENT)
  Source-present: v1+v2 from 081b + 1 new = 3 versions
  Source-absent: 3 new versions (same prompt, no source material)

Measurements (all zero-LLM):
  1. Source matching: % of numbers appearing in source document
  2. Temporal consistency: % of numbers stable across 3 versions
  3. Cross-tabulation: source-matched × temporally-stable

Bridge prediction:
  - Source-present: high source match (~92%), high temporal stability
  - Source-absent: low coincidental source match, low temporal stability
  - Source-matched numbers should be temporally stable (convergent validity)
"""

import json
import os
import re
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from _config import (
    GENERATOR_MODEL,
    get_generator_client,
    call_generator,
    format_results,
)

# NOTE(review): this file was reviewed from a whitespace-collapsed copy.
# Runs of spaces inside string literals (prompt continuation lines, report
# indentation) may have been collapsed there; the literals below reproduce
# that copy exactly. Confirm spacing against the checked-in file.

SOURCE_DIR = os.path.join(os.path.dirname(__file__), "sources")

# Experimental conditions, in the order they are analysed and reported.
CONDITIONS = ("source_present", "source_absent")

# Seconds to wait after each generator call.
# Presumably a rate-limit courtesy pause -- TODO confirm.
_PAUSE_BETWEEN_CALLS_S = 3


def load_source(filename):
    """Return the text of ``filename`` from the ``sources`` directory.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    path = os.path.join(SOURCE_DIR, filename)
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# ============================================================
# Topics (same as 081/081b)
# ============================================================

TOPICS = {
    "remote_work": {
        "source_file": "source_1_remote_work.md",
        "topic": (
            "Remote and hybrid work effectiveness: what the productivity data "
            "actually shows vs. what companies claim, where task type determines "
            "optimal location, and why the flexibility-retention link matters "
            "more than the productivity debate"
        ),
        "judge_role": "Engineering manager, 5+ years leading distributed teams",
        "judge_axis": (
            "Actionable diagnosis of what actually changes productivity; "
            "distinguishing signal from preference"
        ),
        "judge_cares": (
            "What actually changes productivity by task type; flexibility as "
            "retention lever vs productivity lever; cost-benefit that includes "
            "hidden overhead"
        ),
    },
    "communication": {
        "source_file": "source_3_communication.md",
        "topic": (
            "Internal communication as organizational infrastructure: why "
            "communication satisfaction is the lowest-rated workplace factor "
            "despite having the largest impact on retention, satisfaction, and "
            "productivity — and what the non-desk worker gap reveals about "
            "systemic failures"
        ),
        "judge_role": (
            "Executive coach, 50+ leader engagements, tracks behavioral outcomes"
        ),
        "judge_axis": (
            "Observable behavioral patterns with measurable downstream effects; "
            "channel-specific trust data"
        ),
        "judge_cares": (
            "Observable patterns with measurable effects; channel trust data; "
            "the gap between communication impact and communication quality; "
            "non-desk vs desk-based disparities"
        ),
    },
    "ai_workflows": {
        "source_file": "source_4_ai_workflows.md",
        "topic": (
            "AI-assisted development workflows: how developers actually use AI "
            "coding tools, where they trust vs. distrust the output, and what "
            "the productivity claims miss about debugging overhead and code "
            "quality trade-offs"
        ),
        "judge_role": (
            "Staff engineer who's shipped AI-integrated features AND maintained "
            "them 6+ months"
        ),
        "judge_axis": (
            "Trust/accuracy trade-offs in production; debugging overhead vs "
            "generation speed; what code quality metrics actually show"
        ),
        "judge_cares": (
            "Trust/accuracy trade-offs in production; debugging overhead vs "
            "generation speed; what code quality metrics actually show"
        ),
    },
}

VERSIONS = 3  # Need 3 for temporal consistency


# ============================================================
# Prompt (T3-CURRENT, with and without source)
# ============================================================

# FAIL criteria shared verbatim by both prompt variants.
_T3_FAIL_CRITERIA = (
    "FAIL (any = failure):\n"
    "- Structure survives topic swap. Headings that work for any document\n"
    " in this domain = generic, not shaped.\n"
    "- Claims hedge without conditions. \"It depends\" only with explicit\n"
    " conditions and consequences. \"On the other hand\" after every claim\n"
    " = uncommitted.\n"
    "- Paragraph survives removal without changing the argument = padding.\n"
    "- Analysis survives context-swap. Generic = failure.\n"
    "- Claims presented as established fact without sourcing. If you cite\n"
    " a number, name the source. If you can't \u2192 qualitative language.\n"
    "- Position without falsifier. If nothing could prove your thesis\n"
    " wrong, it's not a thesis.\n\n"
)

# Guarantee bullet that only makes sense when source material is supplied.
_T3_SOURCE_GUARANTEE = (
    "- Every factual claim traces to the source material OR is explicitly\n"
    " labeled EXTENDS (beyond source) with confidence level (H/M/L).\n"
)


def _t3_prompt(topic, judge_role, judge_axis, judge_cares, source=None):
    """Build the T3 prompt; ``source=None`` yields the source-absent variant.

    Both public prompt functions delegate here so the two conditions differ
    only in the source header and the source-tracing guarantee bullet.
    """
    if source is None:
        header = ""
        sourcing = ""
    else:
        header = f"SOURCE MATERIAL:\n{source}\n\n---\n\n"
        sourcing = _T3_SOURCE_GUARANTEE
    return (
        f"{header}"
        f"Write an analytical document (~1000 words) on the following topic:\n\n"
        f"{topic}\n\n"
        f"JUDGE: {judge_role} — evaluating for {judge_axis}\n"
        f" Cares about: {judge_cares}\n\n"
        f"{_T3_FAIL_CRITERIA}"
        f"Must GUARANTEE:\n"
        f"{sourcing}"
        f"- Mechanism for each causal claim: HOW it produces the outcome.\n\n"
        f"Write the analysis now. Use markdown with ## headings."
    )


def t3_prompt_with_source(source, topic, judge_role, judge_axis, judge_cares):
    """Return the T3 prompt with ``source`` prepended as SOURCE MATERIAL."""
    return _t3_prompt(topic, judge_role, judge_axis, judge_cares, source=source)


def t3_prompt_without_source(topic, judge_role, judge_axis, judge_cares):
    """Same T3 prompt but NO source material in context."""
    return _t3_prompt(topic, judge_role, judge_axis, judge_cares)


# ============================================================
# Number extraction + matching (from exp081_number_match.py)
# ============================================================
#
# NOTE(review): in the reviewed copy of this file, everything between the
# start of the third ``re.finditer`` pattern in ``extract_numbers`` and the
# ``rate = ... if total > 0 else 0`` line of ``analyze_numbers`` was
# unreadable. The bare-number pass, ``number_in_source`` and the body of
# ``analyze_numbers`` below are RECONSTRUCTIONS that honour the visible
# contract (types "percentage"/"dollar"/"integer"/"decimal", the returned
# keys, the rate formula). Diff them against exp081_number_match.py before
# merging; the percentage and dollar patterns are the original ones.

_PERCENT_RE = re.compile(r'(\d+(?:\.\d+)?)\s*%')
_DOLLAR_RE = re.compile(r'\$(\d+(?:\.\d+)?(?:,\d{3})*)')
# Reconstructed: a number that is not part of a percentage, a dollar amount,
# or a longer digit/decimal/thousands group.
_BARE_RE = re.compile(r'(?<![\d.,$])(\d+(?:\.\d+)?)(?!\d|[.,]\d|\s*%)')
# A comma that separates thousands groups ("1,200" -> "1200").
_THOUSANDS_SEP_RE = re.compile(r'(?<=\d),(?=\d{3}(?!\d))')


def _context(text, match, width=40):
    """Return up to ``width`` characters either side of ``match``, stripped."""
    start = max(0, match.start() - width)
    end = min(len(text), match.end() + width)
    return text[start:end].strip()


def extract_numbers(text):
    """Extract numeric mentions from ``text``.

    Returns a list of dicts with keys ``value`` (digits as a string, commas
    removed for dollar amounts), ``raw`` (matched text), ``context``
    (surrounding snippet) and ``type`` ("percentage", "dollar", "integer"
    or "decimal"). Percentages come first, then dollar amounts, then bare
    numbers, each in document order.
    """
    numbers = []
    for m in _PERCENT_RE.finditer(text):
        numbers.append({"value": m.group(1), "raw": m.group(0),
                        "context": _context(text, m), "type": "percentage"})
    for m in _DOLLAR_RE.finditer(text):
        numbers.append({"value": m.group(1).replace(",", ""), "raw": m.group(0),
                        "context": _context(text, m), "type": "dollar"})
    # NOTE(review): reconstructed pass -- see the section note above.
    for m in _BARE_RE.finditer(text):
        val = m.group(1)
        numbers.append({"value": val, "raw": m.group(0),
                        "context": _context(text, m),
                        "type": "decimal" if "." in val else "integer"})
    return numbers


def number_in_source(num, source):
    """Return True if ``num`` (a dict with ``value`` and ``type``) occurs in ``source``.

    NOTE(review): reconstructed -- see the section note above. Percentages
    must be followed by ``%`` or "percent"; dollar amounts must be preceded
    by ``$``; other types match the value as a standalone number. Thousands
    separators in ``source`` are ignored.
    """
    value = re.escape(num["value"])
    normalized = _THOUSANDS_SEP_RE.sub("", source)
    kind = num.get("type")
    if kind == "percentage":
        pattern = rf'(?<![\d.]){value}\s*(?:%|percent\b)'
    elif kind == "dollar":
        pattern = rf'\${value}(?!\d|\.\d)'
    else:
        pattern = rf'(?<![\d.]){value}(?!\d|\.\d)'
    return re.search(pattern, normalized) is not None


def _is_year(num):
    """Reconstructed heuristic: a four-digit integer in 1900-2100."""
    value = num["value"]
    return (num["type"] == "integer" and len(value) == 4
            and 1900 <= int(value) <= 2100)


def _is_word_count(num):
    """Reconstructed heuristic: an integer directly followed by "word(s)"."""
    if num["type"] != "integer":
        return False
    pattern = rf'(?<![\d.,]){re.escape(num["value"])}\s*-?\s*words?\b'
    return re.search(pattern, num["context"], re.IGNORECASE) is not None


def analyze_numbers(text, source):
    """Classify every number in ``text`` as present in or absent from ``source``.

    Years and word counts are excluded from the totals and reported
    separately. The returned dict carries the counts, the unsourced rate and
    the matched/unmatched value lists consumed by ``temporal_consistency``.

    NOTE(review): the classification loop is reconstructed; only the
    ``rate`` expression and the returned dict were legible in the reviewed
    copy.
    """
    in_source = []
    not_in_source = []
    years = 0
    wc = 0
    for num in extract_numbers(text):
        if _is_year(num):
            years += 1
        elif _is_word_count(num):
            wc += 1
        elif number_in_source(num, source):
            in_source.append(num)
        else:
            not_in_source.append(num)

    total = len(in_source) + len(not_in_source)
    rate = len(not_in_source) / total if total > 0 else 0
    return {
        "total_numbers": total,
        "in_source": len(in_source),
        "not_in_source": len(not_in_source),
        "unsourced_rate": round(rate, 3),
        "years_excluded": years,
        "word_counts_excluded": wc,
        "in_source_values": [n["value"] for n in in_source],
        "not_in_source_values": [n["value"] for n in not_in_source],
    }


# ============================================================
# Temporal consistency
# ============================================================

def temporal_consistency(docs_numbers):
    """Given a list of analyze_numbers results, compute temporal stability.

    For each unique number value across all versions:
      - STABLE: appears in every version
      - UNSTABLE: appears in only some versions

    Returns a dict with counts, rates and the (sorted) stable/unstable value
    lists. The same keys are present when no numbers were found at all, so
    callers such as ``cross_tabulate`` can index the result unconditionally.
    """
    # One set of distinct values per version; sourced and unsourced pooled.
    version_sets = [
        set(nm["in_source_values"]) | set(nm["not_in_source_values"])
        for nm in docs_numbers
    ]
    all_nums = set().union(*version_sets)
    stable_set = set.intersection(*version_sets) if version_sets else set()

    # Sorted so the saved JSON does not depend on set iteration order.
    stable_vals = sorted(stable_set)
    unstable_vals = sorted(all_nums - stable_set)
    total = len(all_nums)
    return {
        "total_unique": total,
        "stable": len(stable_vals),
        "unstable": len(unstable_vals),
        "stability_rate": round(len(stable_vals) / total, 3) if total > 0 else 0,
        "instability_rate": round(len(unstable_vals) / total, 3) if total > 0 else 0,
        "stable_values": stable_vals,
        "unstable_values": unstable_vals,
    }


def _value_in_source(val, source):
    """Return True if ``val`` matches ``source`` under any number type.

    Temporal results carry bare values without their original type, so the
    value is tried as a percentage, an integer and a decimal in turn.
    """
    return any(
        number_in_source({"value": val, "type": kind}, source)
        for kind in ("percentage", "integer", "decimal")
    )


def cross_tabulate(docs_numbers, source, temporal_result):
    """Cross-tabulate source matching × temporal stability.

    For each stable number: is it in source?
    For each unstable number: is it in source?

    ``docs_numbers`` is accepted for call-site symmetry but is not used.
    Returns the four cell counts of the 2×2 table.
    """
    stable_set = set(temporal_result.get("stable_values", ()))
    unstable_set = set(temporal_result.get("unstable_values", ()))

    stable_in_source = sum(1 for v in stable_set if _value_in_source(v, source))
    unstable_in_source = sum(1 for v in unstable_set if _value_in_source(v, source))
    return {
        "stable_in_source": stable_in_source,
        "stable_not_in_source": len(stable_set) - stable_in_source,
        "unstable_in_source": unstable_in_source,
        "unstable_not_in_source": len(unstable_set) - unstable_in_source,
    }


# ============================================================
# Main
# ============================================================

def _generate(client, model, label, prompt):
    """Request one document from the generator, reporting progress as it goes.

    Prints ``label`` before the call and word count / elapsed time after it,
    then pauses before returning the generated text.
    """
    print(f" {label}...", end=" ", flush=True)
    t0 = time.perf_counter()
    text = call_generator(client, model, [{"role": "user", "content": prompt}])
    elapsed = time.perf_counter() - t0
    words = len(text.split())
    print(f"done ({words}w, {elapsed:.1f}s)")
    time.sleep(_PAUSE_BETWEEN_CALLS_S)
    return text


def _aggregate(all_results, condition):
    """Sum per-topic counts for ``condition`` across every topic."""
    cells = [all_results[t][condition] for t in TOPICS]
    return {
        "total_nums": sum(c["aggregate"]["total_numbers"] for c in cells),
        "total_sourced": sum(c["aggregate"]["in_source"] for c in cells),
        "total_unique": sum(c["temporal"]["total_unique"] for c in cells),
        "total_stable": sum(c["temporal"]["stable"] for c in cells),
        "si": sum(c["cross_tab"]["stable_in_source"] for c in cells),
        "sn": sum(c["cross_tab"]["stable_not_in_source"] for c in cells),
        "ui": sum(c["cross_tab"]["unstable_in_source"] for c in cells),
        "un": sum(c["cross_tab"]["unstable_not_in_source"] for c in cells),
    }


def main():
    """Run EXP-081c: generate the missing documents, measure, report, save."""
    rule = "=" * 60
    client, model = get_generator_client()
    print(f"Generator: {model}")
    print("Design: 3 topics × 2 conditions × 3 versions = 18 documents")
    print("Measurements: source matching + temporal consistency")
    print()

    # Load sources
    sources = {}
    for topic_key, cfg in TOPICS.items():
        sources[topic_key] = load_source(cfg["source_file"])
        print(f" Source [{topic_key}]: {len(sources[topic_key])} chars")

    # Load existing 081b results for source-present v1, v2
    results_path = os.path.join(os.path.dirname(__file__),
                                "exp081_prompt_arch_results.json")
    with open(results_path, encoding="utf-8") as f:
        existing = json.load(f)
    existing_results = existing["findings"]["results"]

    # Storage: {topic: {condition: [v1_text, v2_text, v3_text]}}
    docs = {t: {"source_present": [], "source_absent": []} for t in TOPICS}

    # Validate the reused documents for every topic BEFORE spending any
    # generator calls; a wrong count would otherwise only surface at analysis.
    for topic_key in TOPICS:
        loaded = [
            r["text"] for r in existing_results
            if r["arch"] == "T3_CURRENT" and r["topic"] == topic_key
        ]
        if len(loaded) != VERSIONS - 1:
            raise ValueError(
                f"Expected {VERSIONS - 1} T3_CURRENT documents for "
                f"{topic_key!r} in {results_path}, got {len(loaded)}"
            )
        docs[topic_key]["source_present"] = loaded

    # === SOURCE-PRESENT: Load v1, v2 from 081b, generate v3 ===
    print(f"\n{rule}")
    print(" SOURCE-PRESENT (v1+v2 from 081b + v3 new)")
    print(f"{rule}")

    for topic_key, cfg in TOPICS.items():
        print(f" {topic_key}: loaded v1, v2 from 081b "
              f"({len(docs[topic_key]['source_present'])} docs)")
        prompt = t3_prompt_with_source(
            sources[topic_key], cfg["topic"], cfg["judge_role"],
            cfg["judge_axis"], cfg["judge_cares"])
        docs[topic_key]["source_present"].append(
            _generate(client, model, f"{topic_key} v{VERSIONS}", prompt))

    # === SOURCE-ABSENT: Generate 3 versions each ===
    print(f"\n{rule}")
    print(" SOURCE-ABSENT (3 new versions each)")
    print(f"{rule}")

    for topic_key, cfg in TOPICS.items():
        prompt = t3_prompt_without_source(
            cfg["topic"], cfg["judge_role"],
            cfg["judge_axis"], cfg["judge_cares"])
        for v in range(VERSIONS):
            docs[topic_key]["source_absent"].append(
                _generate(client, model, f"{topic_key} v{v+1}", prompt))

    # ============================================================
    # Analysis
    # ============================================================
    print(f"\n{rule}")
    print(" ANALYSIS")
    print(f"{rule}")

    all_results = {}
    for topic_key in TOPICS:
        source = sources[topic_key]
        all_results[topic_key] = {}

        for condition in CONDITIONS:
            texts = docs[topic_key][condition]
            if len(texts) != VERSIONS:
                raise ValueError(
                    f"Expected {VERSIONS} versions, got {len(texts)}")

            # Number matching for each version
            nm_results = [analyze_numbers(t, source) for t in texts]
            # Temporal consistency across the versions
            tc = temporal_consistency(nm_results)
            # Cross-tabulation
            xt = cross_tabulate(nm_results, source, tc)

            # Aggregate source matching
            total_nums = sum(r["total_numbers"] for r in nm_results)
            total_sourced = sum(r["in_source"] for r in nm_results)
            total_unsourced = sum(r["not_in_source"] for r in nm_results)
            source_rate = total_sourced / total_nums if total_nums > 0 else 0

            all_results[topic_key][condition] = {
                "per_version": nm_results,
                "temporal": tc,
                "cross_tab": xt,
                "aggregate": {
                    "total_numbers": total_nums,
                    "in_source": total_sourced,
                    "not_in_source": total_unsourced,
                    "source_match_rate": round(source_rate, 3),
                },
            }

            print(f"\n {topic_key} | {condition}")
            print(f" Source matching: {total_sourced}/{total_nums} "
                  f"= {source_rate:.1%} in source")
            print(f" Temporal: {tc['stable']}/{tc['total_unique']} "
                  f"stable = {tc['stability_rate']:.1%}")
            print(f" Cross-tab: stable+sourced={xt['stable_in_source']}, "
                  f"stable+unsourced={xt['stable_not_in_source']}, "
                  f"unstable+sourced={xt['unstable_in_source']}, "
                  f"unstable+unsourced={xt['unstable_not_in_source']}")

    # ============================================================
    # Summary: The Bridge
    # ============================================================
    print(f"\n{rule}")
    print(" COMMENSURABLE BRIDGE: Source-Present vs Source-Absent")
    print(f"{rule}")

    # Aggregate across topics once; the three report sections reuse it.
    totals = {c: _aggregate(all_results, c) for c in CONDITIONS}

    for condition in CONDITIONS:
        agg = totals[condition]
        total_nums = agg["total_nums"]
        total_sourced = agg["total_sourced"]
        total_unique = agg["total_unique"]
        total_stable = agg["total_stable"]
        source_rate = total_sourced / total_nums if total_nums > 0 else 0
        stability = total_stable / total_unique if total_unique > 0 else 0

        print(f"\n {condition}:")
        print(f" Source match rate: {total_sourced}/{total_nums} = {source_rate:.1%}")
        print(f" Temporal stability: {total_stable}/{total_unique} = {stability:.1%}")

    # Cross-tabulation summary
    print("\n --- Cross-Tabulation (convergent validity) ---")
    print(f" {'':20s} {'Stable':>10s} {'Unstable':>10s}")
    for condition in CONDITIONS:
        agg = totals[condition]
        print(f" {condition}:")
        print(f" In source: {agg['si']:>10d} {agg['ui']:>10d}")
        print(f" Not in source: {agg['sn']:>10d} {agg['un']:>10d}")

    # Convergent validity test
    print("\n --- Convergent Validity Test ---")
    print(" Prediction: source-matched numbers should be temporally stable")
    print(" Prediction: unsourced numbers should be temporally unstable")
    for condition in CONDITIONS:
        agg = totals[condition]
        total_stable = agg["si"] + agg["sn"]
        total_unstable = agg["ui"] + agg["un"]
        pct_stable_sourced = agg["si"] / total_stable if total_stable > 0 else 0
        pct_unstable_unsourced = agg["un"] / total_unstable if total_unstable > 0 else 0

        print(f" {condition}:")
        print(f" % of stable numbers that are sourced: {pct_stable_sourced:.1%}")
        print(f" % of unstable numbers that are unsourced: {pct_unstable_unsourced:.1%}")

    # Save
    output_path = os.path.join(os.path.dirname(__file__),
                               "exp081_bridge_results.json")
    output = format_results(
        findings={
            "experiment": "EXP-081c-commensurable-bridge",
            "description": (
                "Commensurable bridge between temporal consistency and source "
                "matching measurements. Same measurement tool applied to "
                "source-present and source-absent conditions. Cross-tabulation "
                "validates convergent validity of the two measurement systems."
            ),
            "design": {
                "topics": len(TOPICS),
                "conditions": list(CONDITIONS),
                "versions_per_cell": VERSIONS,
                "total_docs": len(TOPICS) * len(CONDITIONS) * VERSIONS,
                "measurements": ["source_matching", "temporal_consistency",
                                 "cross_tabulation"],
            },
            "results": all_results,
        },
        generator=model,
    )
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"\n Saved: {output_path}")


if __name__ == "__main__":
    main()