#!/usr/bin/env bash
# determinism-audit.sh — Audit AI determinism across the three canonical scenarios.
#
# TODO: re-run when RUN-host environment stabilizes. As of 2026-04-17, the RUN
# host has a `class_name Diplomacy` collision preventing TurnManager from
# compiling, so all autoplay games exit in_progress around turn 61. No
# determinism signal can be extracted until that's resolved. See task #5 /
# blocker thread.
#
# Scenarios (p0-20 / task T3):
#   1. CPU -> CPU: same seed twice with AI_GPU_ROLLOUT=false. Diff must be empty
#      (modulo timing/metadata allowlist).
#   2. CPU -> GPU: same seed on AI_GPU_ROLLOUT=false and =true. Integer fields
#      must match byte-for-byte; scalar floats within 1e-4.
#   3. Parallel batch: PARALLEL=max(SEED_COUNT, 2) twice — per-seed dirs identical (modulo timing).
#   (Process-restart determinism is OUT OF SCOPE here — owned by p1-09 autosave.
#    Do NOT add to this audit. See .project/objectives/p1-09-*.md when that lands.)
#
# Runs ON the RUN host via SSH (requires AUTOPLAY_HOST / PROJECT_ROOT_REMOTE).
# Writes report to .local/iter/determinism-audit-<timestamp>/summary.md on EDIT host.
#
# Exit codes:
#   0 — all enabled scenarios passed
#   1 — one or more scenarios failed
#   2 — usage / env error

set -euo pipefail

# Print a usage/environment error on stderr and exit with status 2, the code
# this script documents for "usage / env error".
_usage_error() {
    printf 'determinism-audit: %s\n' "$*" >&2
    exit 2
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Required connection settings. Checked explicitly instead of via `${VAR:?}`
# so that a missing value produces exit status 2 rather than the shell's
# generic expansion-failure status.
[ -n "${AUTOPLAY_HOST:-}" ] ||
    _usage_error "AUTOPLAY_HOST must be set (e.g. lilith@apricot.lan)"
[ -n "${PROJECT_ROOT_REMOTE:-}" ] ||
    _usage_error "PROJECT_ROOT_REMOTE must be set (repo path on RUN host)"

# Scenario toggles (env-overridable so we can run scenarios individually).
# Any value other than the literal string "true" disables the scenario.
RUN_CPU_CPU="${RUN_CPU_CPU:-true}"
RUN_CPU_GPU="${RUN_CPU_GPU:-true}"
RUN_PARALLEL="${RUN_PARALLEL:-true}"

# Tunables (env-overridable).
SEED_COUNT="${SEED_COUNT:-3}"
TURN_LIMIT="${TURN_LIMIT:-150}"
FLOAT_TOL="${FLOAT_TOL:-0.0001}"

# These values are interpolated unquoted into remote shell command lines and
# SEED_COUNT is used in an integer comparison, so reject anything that is not
# a plain number before any remote work starts.
_uint_re='^[1-9][0-9]*$'
_float_re='^([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
[[ "$SEED_COUNT" =~ $_uint_re ]] ||
    _usage_error "SEED_COUNT must be a positive integer (got '$SEED_COUNT')"
[[ "$TURN_LIMIT" =~ $_uint_re ]] ||
    _usage_error "TURN_LIMIT must be a positive integer (got '$TURN_LIMIT')"
[[ "$FLOAT_TOL" =~ $_float_re ]] ||
    _usage_error "FLOAT_TOL must be a non-negative number (got '$FLOAT_TOL')"

# Per-run audit directories: the same timestamped name is used under the
# local project dir and under the remote project root.
STAMP="$(date +%Y%m%d_%H%M%S)"
AUDIT_DIR_LOCAL="$PROJECT_DIR/.local/iter/determinism-audit-$STAMP"
AUDIT_DIR_REMOTE="$PROJECT_ROOT_REMOTE/.local/iter/determinism-audit-$STAMP"
SUMMARY="$AUDIT_DIR_LOCAL/summary.md"

mkdir -p "$AUDIT_DIR_LOCAL"

# Titles of scenarios recorded as FAIL; drives the final exit status.
FAILURES=()

# Start the markdown report (truncates any existing file at this path).
{
    echo "# Determinism Audit — $STAMP"
    echo ""
    echo "- Host: \`$AUTOPLAY_HOST\`"
    echo "- Seeds: $SEED_COUNT, turn_limit: $TURN_LIMIT, float tolerance: $FLOAT_TOL"
    echo ""
} > "$SUMMARY"

_batch_remote() {
    # Launch one autoplay batch on the RUN host over SSH.
    #   $1  remote directory that receives the batch results (created if absent)
    #   $2  value exported as PARALLEL for the batch script
    #   $3  extra `NAME=value` assignments prefixed to the batch command
    # Globals read: AUTOPLAY_HOST, PROJECT_ROOT_REMOTE, SEED_COUNT, TURN_LIMIT.
    # The batch script's stdout/stderr land in <results dir>/batch.log on the
    # remote side. The function's status is whatever ssh returns.
    local results_dir="$1" workers="$2" env_prefix="$3"
    local batch_script="$PROJECT_ROOT_REMOTE/tools/autoplay-batch.sh"
    local remote_cmd

    # Assembled locally; every expansion below happens on this host before
    # the text is handed to the remote shell.
    remote_cmd="
        mkdir -p '$results_dir'
        $env_prefix PARALLEL=$workers bash '$batch_script' \
            $SEED_COUNT $TURN_LIMIT '$results_dir' > '$results_dir/batch.log' 2>&1
    "
    ssh "$AUTOPLAY_HOST" "$remote_cmd"
}

_diff_remote() {
    # Recursively diff two result trees on the RUN host and store the output
    # in a file there.
    #   $1  first directory
    #   $2  second directory
    #   $3  remote path that receives diff's stdout and stderr
    # game.log, batch.log, weston.log and meta.json are excluded from the
    # comparison: they carry wall-clock/PID/timestamp data that legitimately
    # varies, whereas turn_stats.jsonl and events.jsonl are the deterministic
    # signals. The remote `|| true` masks diff's own exit status, so the
    # contents of the output file are the only signal this step produces.
    local left_dir="$1"
    local right_dir="$2"
    local diff_path="$3"
    local remote_cmd="
        diff -r \
            --exclude='game.log' \
            --exclude='batch.log' \
            --exclude='weston.log' \
            --exclude='meta.json' \
            '$left_dir' '$right_dir' > '$diff_path' 2>&1 || true
    "
    ssh "$AUTOPLAY_HOST" "$remote_cmd"
}

_fetch_remote_file() {
    # Best-effort copy of one file from the RUN host to this host via scp.
    #   $1  path on the RUN host
    #   $2  destination path on this host
    # All scp output is discarded and the function always returns 0: a failed
    # copy simply leaves nothing at $2. Callers that need to distinguish
    # "fetched" from "not fetched" must inspect the destination themselves.
    local remote_path="$1"
    local local_path="$2"
    if ! scp "$AUTOPLAY_HOST:$remote_path" "$local_path" >/dev/null 2>&1; then
        return 0
    fi
}

_record() {
    # Append one scenario section to the markdown report and track failures.
    #   $1  scenario title (used in the section heading)
    #   $2  verdict: PASS or FAIL
    #   $3  detail text written under the heading
    # Globals: appends to the file at $SUMMARY; when the verdict is exactly
    # "FAIL", the title is also appended to the FAILURES array.
    local title="$1" verdict="$2" detail="$3"

    {
        echo "## $title — $verdict"
        echo ""
        echo "$detail"
        echo ""
    } >> "$SUMMARY"

    case "$verdict" in
        FAIL) FAILURES+=("$title") ;;
    esac
}

# ── Scenario 1: CPU → CPU (serial, PARALLEL=1) ──────────────────────────────
# Run serially so Scenario 1 isolates RNG determinism from parallel dispatch.
# Scenario 3 covers parallel dispatch determinism separately.

# Run the same CPU-only batch twice on the RUN host, diff the two result trees
# there, copy the diff back, and record the verdict via _record:
#   PASS — the diff file arrived locally and is empty.
#   FAIL — the diff is non-empty, OR no diff file arrived at all. The fetch
#          helper swallows copy errors, so an absent file must not be read as
#          "empty diff"; it means there is no determinism signal.
# Globals read: AUDIT_DIR_REMOTE, AUDIT_DIR_LOCAL, SEED_COUNT.
_scenario_cpu_cpu() {
    local title="Scenario 1 — CPU → CPU"
    local run_env="AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
    local run1="$AUDIT_DIR_REMOTE/cpu-cpu-run1"
    local run2="$AUDIT_DIR_REMOTE/cpu-cpu-run2"
    local remote_diff="$AUDIT_DIR_REMOTE/cpu-cpu.diff"
    local local_diff="$AUDIT_DIR_LOCAL/cpu-cpu.diff"

    echo "Running Scenario 1: CPU → CPU (serial)..."
    _batch_remote "$run1" 1 "$run_env"
    _batch_remote "$run2" 1 "$run_env"
    _diff_remote "$run1" "$run2" "$remote_diff"
    _fetch_remote_file "$remote_diff" "$local_diff"

    if [ ! -f "$local_diff" ]; then
        # Nothing was copied back: the comparison result is unknown.
        _record "$title" "FAIL" \
"Could not retrieve \`cpu-cpu.diff\` from the RUN host — no determinism signal; treating as failure."
    elif [ -s "$local_diff" ]; then
        _record "$title" "FAIL" \
"Non-empty diff — see \`cpu-cpu.diff\`. First lines:

\`\`\`
$(head -20 "$local_diff")
\`\`\`"
    else
        _record "$title" "PASS" "Empty diff across $SEED_COUNT seed(s) (serial)."
    fi
}

if [ "$RUN_CPU_CPU" = "true" ]; then
    _scenario_cpu_cpu
fi

# ── Scenario 2: CPU → GPU (integer-byte-equal + float tolerance) ────────────
# Two batches are produced with identical settings except AI_GPU_ROLLOUT
# (false, then true). The comparison itself is delegated to
# tools/determinism-compare.py on the RUN host, invoked with --float-tol
# FLOAT_TOL; this script only looks at its exit status (0 = PASS) and keeps
# its combined stdout/stderr as cpu-gpu.report. The intended rule is that
# integer fields (pop, gold, winner_id, etc.) match byte-for-byte while scalar
# floats may diverge within FLOAT_TOL.
if [ "$RUN_CPU_GPU" = "true" ]; then
    echo "Running Scenario 2: CPU → GPU..."
    cpu_results="$AUDIT_DIR_REMOTE/cpu-gpu-cpu"
    gpu_results="$AUDIT_DIR_REMOTE/cpu-gpu-gpu"
    _batch_remote "$cpu_results" "$SEED_COUNT" "AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
    _batch_remote "$gpu_results" "$SEED_COUNT" "AI_USE_MCTS=true AI_GPU_ROLLOUT=true"

    # Capture the comparison's status through `||` so a non-zero exit is
    # recorded as a scenario failure instead of tripping errexit.
    cpu_gpu_status=0
    ssh "$AUTOPLAY_HOST" "python3 '$PROJECT_ROOT_REMOTE/tools/determinism-compare.py' \
        '$cpu_results' '$gpu_results' \
        --float-tol $FLOAT_TOL > '$AUDIT_DIR_REMOTE/cpu-gpu.report' 2>&1" ||
        cpu_gpu_status=$?

    _fetch_remote_file "$AUDIT_DIR_REMOTE/cpu-gpu.report" "$AUDIT_DIR_LOCAL/cpu-gpu.report"

    if [ "$cpu_gpu_status" -eq 0 ]; then
        _record "Scenario 2 — CPU → GPU" "PASS" \
            "Integer fields byte-equal; floats within $FLOAT_TOL across $SEED_COUNT seeds."
    else
        # Quote the head of the report when a non-empty copy made it back;
        # otherwise just point at the file name.
        if [ -s "$AUDIT_DIR_LOCAL/cpu-gpu.report" ]; then
            report_snippet="$(head -10 "$AUDIT_DIR_LOCAL/cpu-gpu.report")"
        else
            report_snippet="(see cpu-gpu.report)"
        fi
        _record "Scenario 2 — CPU → GPU" "FAIL" \
"Parity check failed (exit $cpu_gpu_status):

\`\`\`
$report_snippet
\`\`\`"
    fi
fi

# ── Scenario 3: Parallel batch determinism ──────────────────────────────────
# PARALLEL=SEED_COUNT (raised to at least 2) dispatches seeds concurrently;
# run twice and diff. If batch output is order-sensitive, seed ranges or RNG
# state-leakage will produce divergent turn_stats.jsonl across the two runs.

# Run the same CPU-only batch twice with parallel dispatch on the RUN host,
# diff the two result trees there, copy the diff back, and record the verdict
# via _record:
#   PASS — the diff file arrived locally and is empty.
#   FAIL — the diff is non-empty, OR no diff file arrived at all. The fetch
#          helper swallows copy errors, so an absent file must not be read as
#          "empty diff"; it means there is no determinism signal.
# Globals read: AUDIT_DIR_REMOTE, AUDIT_DIR_LOCAL, SEED_COUNT.
_scenario_parallel() {
    local run_env="AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
    local run1="$AUDIT_DIR_REMOTE/parallel-run1"
    local run2="$AUDIT_DIR_REMOTE/parallel-run2"
    local remote_diff="$AUDIT_DIR_REMOTE/parallel.diff"
    local local_diff="$AUDIT_DIR_LOCAL/parallel.diff"

    # A parallelism of 1 would not exercise concurrent dispatch at all.
    local par="$SEED_COUNT"
    if [ "$par" -lt 2 ]; then
        par=2
    fi
    local title="Scenario 3 — Parallel (PARALLEL=$par)"

    echo "Running Scenario 3: Parallel batch (PARALLEL=$par)..."
    _batch_remote "$run1" "$par" "$run_env"
    _batch_remote "$run2" "$par" "$run_env"
    _diff_remote "$run1" "$run2" "$remote_diff"
    _fetch_remote_file "$remote_diff" "$local_diff"

    if [ ! -f "$local_diff" ]; then
        # Nothing was copied back: the comparison result is unknown.
        _record "$title" "FAIL" \
"Could not retrieve \`parallel.diff\` from the RUN host — no determinism signal; treating as failure."
    elif [ -s "$local_diff" ]; then
        _record "$title" "FAIL" \
"Parallel batch diverged between runs — order-of-dispatch RNG leak? First lines:

\`\`\`
$(head -20 "$local_diff")
\`\`\`"
    else
        _record "$title" "PASS" \
            "Parallel dispatch is deterministic across $SEED_COUNT seeds."
    fi
}

if [ "$RUN_PARALLEL" = "true" ]; then
    _scenario_parallel
fi

# (Process-restart determinism lives in p1-09 / autosave — do not add here.)

# ── Summary ──────────────────────────────────────────────────────────────────
# Close the report with a rule and a one-line overall result, echo where the
# report lives, and exit 0 when no scenario was recorded as FAIL, 1 otherwise.
failed_count="${#FAILURES[@]}"

{
    echo ""
    echo "---"
    echo ""
    if [ "$failed_count" -eq 0 ]; then
        echo "**Result: ALL PASS**"
    else
        echo "**Result: FAIL — $failed_count scenario(s): ${FAILURES[*]}**"
    fi
} >> "$SUMMARY"

if [ "$failed_count" -ne 0 ]; then
    # The failure notice goes to stderr; the success notice goes to stdout.
    echo "Determinism audit FAILED — report: $SUMMARY" >&2
    exit 1
fi

echo "Determinism audit PASSED — report: $SUMMARY"
exit 0