#!/usr/bin/env bash
# determinism-audit.sh — Audit AI determinism across the three canonical scenarios.
#
# TODO: re-run when RUN-host environment stabilizes. As of 2026-04-17, the RUN
# host has a `class_name Diplomacy` collision preventing TurnManager from
# compiling, so all autoplay games exit in_progress around turn 61. No
# determinism signal can be extracted until that's resolved. See task #5 /
# blocker thread.
#
# Scenarios (p0-20 / task T3):
#   1. CPU -> CPU: same seed twice with AI_GPU_ROLLOUT=false, PARALLEL=1. Diff
#      must be empty (modulo timing/metadata allowlist).
#   2. CPU -> GPU: same seed on AI_GPU_ROLLOUT=false and =true. Integer fields
#      must match byte-for-byte; scalar floats within FLOAT_TOL (default 1e-4).
#   3. Parallel batch: PARALLEL=max(SEED_COUNT, 2) twice — per-seed dirs
#      identical (modulo timing).
# (Process-restart determinism is OUT OF SCOPE here — owned by p1-09 autosave.
#  Do NOT add to this audit. See .project/objectives/p1-09-*.md when that lands.)
#
# Runs ON the RUN host via SSH (requires AUTOPLAY_HOST / PROJECT_ROOT_REMOTE).
# Writes report to .local/iter/determinism-audit-<STAMP>/summary.md on EDIT host.
#
# Optional environment (defaults in parentheses):
#   RUN_CPU_CPU / RUN_CPU_GPU / RUN_PARALLEL (true) — per-scenario toggles
#   SEED_COUNT (3), TURN_LIMIT (150), FLOAT_TOL (0.0001)
#
# Exit codes:
#   0 — all enabled scenarios passed
#   1 — one or more scenarios failed (including "could not run / could not
#       fetch the evidence": an unverifiable scenario is never reported PASS)
#   2 — usage / env error

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Print a usage/env error to stderr and exit with the documented code 2.
# (`: "${VAR:?msg}"` would exit 1 in bash, colliding with "scenario failed".)
_die_usage() {
  printf 'determinism-audit: %s\n' "$*" >&2
  exit 2
}

[[ -n "${AUTOPLAY_HOST:-}" ]] \
  || _die_usage "AUTOPLAY_HOST must be set (e.g. lilith@apricot.lan)"
[[ -n "${PROJECT_ROOT_REMOTE:-}" ]] \
  || _die_usage "PROJECT_ROOT_REMOTE must be set (repo path on RUN host)"

# Scenario toggles (env-overridable so we can run scenarios individually)
RUN_CPU_CPU="${RUN_CPU_CPU:-true}"
RUN_CPU_GPU="${RUN_CPU_GPU:-true}"
RUN_PARALLEL="${RUN_PARALLEL:-true}"
SEED_COUNT="${SEED_COUNT:-3}"
TURN_LIMIT="${TURN_LIMIT:-150}"
FLOAT_TOL="${FLOAT_TOL:-0.0001}"

# These values are spliced into remote shell command lines and numeric tests,
# so reject anything that is not a plain number before touching the RUN host.
_UINT_RE='^[0-9]+$'
_FLOAT_RE='^\+?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
[[ "$SEED_COUNT" =~ $_UINT_RE ]] \
  || _die_usage "SEED_COUNT must be a non-negative integer (got '$SEED_COUNT')"
[[ "$TURN_LIMIT" =~ $_UINT_RE ]] \
  || _die_usage "TURN_LIMIT must be a non-negative integer (got '$TURN_LIMIT')"
[[ "$FLOAT_TOL" =~ $_FLOAT_RE ]] \
  || _die_usage "FLOAT_TOL must be a non-negative number (got '$FLOAT_TOL')"

STAMP="$(date +%Y%m%d_%H%M%S)"
AUDIT_DIR_LOCAL="$PROJECT_DIR/.local/iter/determinism-audit-$STAMP"
AUDIT_DIR_REMOTE="$PROJECT_ROOT_REMOTE/.local/iter/determinism-audit-$STAMP"
SUMMARY="$AUDIT_DIR_LOCAL/summary.md"
mkdir -p "$AUDIT_DIR_LOCAL"

# Environment prefix shared by every CPU-rollout batch.
CPU_ENV="AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
GPU_ENV="AI_USE_MCTS=true AI_GPU_ROLLOUT=true"

FAILURES=()

{
  printf '# Determinism Audit — %s\n' "$STAMP"
  printf '\n'
  printf -- '- Host: `%s`\n' "$AUTOPLAY_HOST"
  printf -- '- Seeds: %s, turn_limit: %s, float tolerance: %s\n' \
    "$SEED_COUNT" "$TURN_LIMIT" "$FLOAT_TOL"
  printf '\n'
} > "$SUMMARY"

# Run one autoplay batch on the RUN host.
#   $1 remote_results_dir, $2 parallelism, $3 extra_env (VAR=value prefix)
# Returns ssh's exit status (the remote command's status, or 255 on transport
# failure). Call it inside a conditional so `set -e` does not abort the audit.
_batch_remote() {
  local remote_dir="$1"
  local par="$2"
  local extra_env="$3"
  # `&&` so a failed mkdir does not fall through into the batch run.
  ssh "$AUTOPLAY_HOST" "
    mkdir -p '$remote_dir' &&
    $extra_env PARALLEL=$par bash '$PROJECT_ROOT_REMOTE/tools/autoplay-batch.sh' \
      $SEED_COUNT $TURN_LIMIT '$remote_dir' > '$remote_dir/batch.log' 2>&1
  "
}

# Recursively diff two result dirs on the RUN host.
#   $1 dir_a, $2 dir_b, $3 output diff path — excludes timing/log/stamp fields
# turn_stats.jsonl and events.jsonl are the deterministic signals; game.log
# and meta.json carry wall-clock/PID/timestamp data that legitimately varies.
# diff's own status is ignored remotely (`|| true`): differences AND diff
# errors both land in the output file, so a non-empty file means "not clean".
# Returns non-zero only when ssh itself fails.
_diff_remote() {
  local a="$1" b="$2" out="$3"
  ssh "$AUTOPLAY_HOST" "
    diff -r \
      --exclude='game.log' \
      --exclude='batch.log' \
      --exclude='weston.log' \
      --exclude='meta.json' \
      '$a' '$b' > '$out' 2>&1 || true
  "
}

# Pull a single remote text file back to the EDIT host.
#   $1 remote path, $2 local path
# Returns scp's exit status. Callers decide whether a failed fetch matters;
# use `|| true` for optional artifacts and an `if` for required evidence.
_fetch_remote_file() {
  scp "$AUTOPLAY_HOST:$1" "$2" >/dev/null 2>&1
}

# Append a scenario section to the summary and remember failures.
#   $1 scenario title, $2 PASS|FAIL, $3 detail (markdown)
_record() {
  {
    printf '## %s — %s\n' "$1" "$2"
    printf '\n'
    printf '%s\n' "$3"
    printf '\n'
  } >> "$SUMMARY"
  if [[ "$2" == "FAIL" ]]; then
    FAILURES+=("$1")
  fi
}

# Run the same CPU-rollout batch twice, diff the two result trees, and record
# PASS only when an empty diff file was actually retrieved.
#   $1 title          scenario title for the report
#   $2 run_a          remote subdir name for the first run
#   $3 run_b          remote subdir name for the second run
#   $4 diff_name      basename of the diff artifact (remote and local)
#   $5 par            PARALLEL value for both batches
#   $6 fail_headline  first sentence of the FAIL detail when the diff is non-empty
#   $7 pass_detail    PASS detail text
# Any step that cannot be completed (batch, diff, fetch) is recorded as FAIL:
# a missing diff file must not be mistaken for an empty one.
_run_diff_scenario() {
  local title="$1" run_a="$2" run_b="$3" diff_name="$4" par="$5"
  local fail_headline="$6" pass_detail="$7"
  local dir_a="$AUDIT_DIR_REMOTE/$run_a"
  local dir_b="$AUDIT_DIR_REMOTE/$run_b"
  local remote_diff="$AUDIT_DIR_REMOTE/$diff_name"
  local local_diff="$AUDIT_DIR_LOCAL/$diff_name"

  if ! _batch_remote "$dir_a" "$par" "$CPU_ENV" \
    || ! _batch_remote "$dir_b" "$par" "$CPU_ENV"; then
    _record "$title" "FAIL" \
      "Batch run failed on RUN host — see \`batch.log\` under \`$AUDIT_DIR_REMOTE\`. No diff was produced."
    return 0
  fi

  if ! _diff_remote "$dir_a" "$dir_b" "$remote_diff"; then
    _record "$title" "FAIL" \
      "Could not run diff on RUN host (ssh failure). Result unknown."
    return 0
  fi

  if ! _fetch_remote_file "$remote_diff" "$local_diff" || [[ ! -e "$local_diff" ]]; then
    _record "$title" "FAIL" \
      "Could not fetch \`$diff_name\` from RUN host. Result unknown — not treating a missing diff as empty."
    return 0
  fi

  if [[ -s "$local_diff" ]]; then
    _record "$title" "FAIL" \
      "$fail_headline First lines:
\`\`\`
$(head -20 "$local_diff")
\`\`\`"
  else
    _record "$title" "PASS" "$pass_detail"
  fi
}

# Scenario 2 driver: one CPU-rollout batch, one GPU-rollout batch, then the
# remote determinism-compare.py decides pass/fail via its exit status.
_run_cpu_gpu_scenario() {
  local title="Scenario 2 — CPU → GPU"
  local dir_cpu="$AUDIT_DIR_REMOTE/cpu-gpu-cpu"
  local dir_gpu="$AUDIT_DIR_REMOTE/cpu-gpu-gpu"
  local remote_report="$AUDIT_DIR_REMOTE/cpu-gpu.report"
  local local_report="$AUDIT_DIR_LOCAL/cpu-gpu.report"
  local cpu_gpu_status=0
  local report_snippet="(see cpu-gpu.report)"

  if ! _batch_remote "$dir_cpu" "$SEED_COUNT" "$CPU_ENV" \
    || ! _batch_remote "$dir_gpu" "$SEED_COUNT" "$GPU_ENV"; then
    _record "$title" "FAIL" \
      "Batch run failed on RUN host — see \`batch.log\` under \`$AUDIT_DIR_REMOTE\`. Parity check was not run."
    return 0
  fi

  # Capture the comparer's status without tripping `set -e`.
  ssh "$AUTOPLAY_HOST" "python3 '$PROJECT_ROOT_REMOTE/tools/determinism-compare.py' \
    '$dir_cpu' '$dir_gpu' \
    --float-tol '$FLOAT_TOL' > '$remote_report' 2>&1" || cpu_gpu_status=$?

  # The report is supporting evidence only; the verdict is the exit status.
  _fetch_remote_file "$remote_report" "$local_report" || true

  if [[ "$cpu_gpu_status" -ne 0 ]]; then
    if [[ -s "$local_report" ]]; then
      report_snippet="$(head -10 "$local_report")"
    fi
    _record "$title" "FAIL" \
      "Parity check failed (exit $cpu_gpu_status):
\`\`\`
$report_snippet
\`\`\`"
  else
    _record "$title" "PASS" \
      "Integer fields byte-equal; floats within $FLOAT_TOL across $SEED_COUNT seeds."
  fi
}

# ── Scenario 1: CPU → CPU (serial, PARALLEL=1) ──────────────────────────────
# Run serially so Scenario 1 isolates RNG determinism from parallel dispatch.
# Scenario 3 covers parallel dispatch determinism separately.
if [[ "$RUN_CPU_CPU" == "true" ]]; then
  echo "Running Scenario 1: CPU → CPU (serial)..."
  _run_diff_scenario \
    "Scenario 1 — CPU → CPU" \
    "cpu-cpu-run1" "cpu-cpu-run2" "cpu-cpu.diff" 1 \
    "Non-empty diff — see \`cpu-cpu.diff\`." \
    "Empty diff across $SEED_COUNT seed(s) (serial)."
fi

# ── Scenario 2: CPU → GPU (integer-byte-equal + float tolerance) ────────────
# Integer fields (pop, gold, winner_id, etc.) must match byte-for-byte; scalar
# floats are allowed to diverge within FLOAT_TOL. Delegates the per-seed
# tolerance check to determinism-compare.py (lives alongside this script).
if [[ "$RUN_CPU_GPU" == "true" ]]; then
  echo "Running Scenario 2: CPU → GPU..."
  _run_cpu_gpu_scenario
fi

# ── Scenario 3: Parallel batch determinism ──────────────────────────────────
# PARALLEL=SEED_COUNT dispatches all seeds concurrently; run twice and diff.
# If batch output is order-sensitive, seed ranges or RNG state-leakage will
# produce divergent turn_stats.jsonl across the two runs.
if [[ "$RUN_PARALLEL" == "true" ]]; then
  par=$SEED_COUNT
  # Keep at least two workers so the run is actually parallel. POSIX `[`
  # compares as decimal, so a value like "08" is not misread as octal.
  if [ "$par" -lt 2 ]; then
    par=2
  fi
  echo "Running Scenario 3: Parallel batch (PARALLEL=$par)..."
  _run_diff_scenario \
    "Scenario 3 — Parallel (PARALLEL=$par)" \
    "parallel-run1" "parallel-run2" "parallel.diff" "$par" \
    "Parallel batch diverged between runs — order-of-dispatch RNG leak?" \
    "Parallel dispatch is deterministic across $SEED_COUNT seeds."
fi

# (Process-restart determinism lives in p1-09 / autosave — do not add here.)

# ── Summary ──────────────────────────────────────────────────────────────────
{
  printf '\n'
  printf -- '---\n'
  printf '\n'
} >> "$SUMMARY"

if [[ "${#FAILURES[@]}" -eq 0 ]]; then
  printf '%s\n' "**Result: ALL PASS**" >> "$SUMMARY"
  echo "Determinism audit PASSED — report: $SUMMARY"
  exit 0
else
  printf '%s\n' "**Result: FAIL — ${#FAILURES[@]} scenario(s): ${FAILURES[*]}**" >> "$SUMMARY"
  echo "Determinism audit FAILED — report: $SUMMARY" >&2
  exit 1
fi