diff --git a/scripts/run/verify.sh b/scripts/run/verify.sh index 884ace79..9312118f 100644 --- a/scripts/run/verify.sh +++ b/scripts/run/verify.sh @@ -85,7 +85,7 @@ cmd_verify() { echo -e "${BLUE}─────────────────────────────────────────────────${NC}" } - local TOTAL=15 + local TOTAL=16 # Step 0 — Game data schema validation _verify_step 0 $TOTAL "game data JSON schemas" \ @@ -154,6 +154,12 @@ cmd_verify() { _verify_step 14 $TOTAL "godot headless boot (no script errors)" \ _godot_headless_boot + # Step 15 — Autoplay hang-regression smoke test (p0-10 gate). + # Skips silently when neither AUTOPLAY_HOST nor local flatpak is available + # so this gate runs opportunistically on dev boxes without a RUN host. + _verify_step 15 $TOTAL "autoplay hang smoke (seed 1, T100, 180s budget)" \ + _verify_autoplay_smoke + _verify_summary return $overall_exit } @@ -224,6 +230,17 @@ _verify_file_size_cap() { return 0 } +_verify_autoplay_smoke() { + # Skips when no RUN host and no local flatpak — dev boxes without a batch + # target still get the rest of the pipeline. + if [ -z "${AUTOPLAY_HOST:-}" ] && ! command -v flatpak >/dev/null 2>&1; then + echo "SKIP: no AUTOPLAY_HOST and no local flatpak" + return 0 + fi + bash "$REPO_ROOT/tools/ci-autoplay-smoke.sh" +} + + _godot_headless_boot() { # Boot Godot headless and check for SCRIPT ERRORs. 
# Catches class_name resolution failures, GDExtension load failures, diff --git a/src/simulator/crates/mc-ai/src/game_state.rs b/src/simulator/crates/mc-ai/src/game_state.rs index a979912e..f38855af 100644 --- a/src/simulator/crates/mc-ai/src/game_state.rs +++ b/src/simulator/crates/mc-ai/src/game_state.rs @@ -213,3 +213,165 @@ impl StrategicWeights { } } } + +#[cfg(test)] +mod tests { + use super::*; + + // ── AxisId ─────────────────────────────────────────────────────────── + + #[test] + fn axis_id_discriminants_are_stable() { + // These discriminants are the GPU upload contract — changing them + // invalidates in-flight AbstractRolloutState axes arrays. Lock them. + assert_eq!(AxisId::Expansion as u8, 0); + assert_eq!(AxisId::Production as u8, 1); + assert_eq!(AxisId::Wealth as u8, 2); + assert_eq!(AxisId::Culture as u8, 3); + assert_eq!(AxisId::COUNT, 8, "COUNT must match the flat array size"); + } + + #[test] + fn axis_id_as_str_matches_json_keys() { + // The flat-map round-trip relies on these names matching what lives + // in public/games/age-of-dwarves/data/ai_personalities.json keys. 
+ assert_eq!(AxisId::Expansion.as_str(), "expansion"); + assert_eq!(AxisId::Production.as_str(), "production"); + assert_eq!(AxisId::Wealth.as_str(), "wealth"); + assert_eq!(AxisId::Culture.as_str(), "culture"); + } + + // ── axes_to_flat / flat_to_axes round-trip ─────────────────────────── + + #[test] + fn axes_to_flat_encodes_named_axes_into_fixed_slots() { + let mut axes = HashMap::new(); + axes.insert("expansion".to_string(), 7); + axes.insert("production".to_string(), 3); + axes.insert("wealth".to_string(), 9); + axes.insert("culture".to_string(), 1); + let flat = axes_to_flat(&axes); + assert_eq!(flat[0], 7, "expansion → slot 0"); + assert_eq!(flat[1], 3, "production → slot 1"); + assert_eq!(flat[2], 9, "wealth → slot 2"); + assert_eq!(flat[3], 1, "culture → slot 3"); + assert_eq!(&flat[4..], &[0, 0, 0, 0], "slots 4-7 must be zero (reserved)"); + } + + #[test] + fn axes_to_flat_treats_missing_keys_as_zero() { + let axes: HashMap = HashMap::new(); + let flat = axes_to_flat(&axes); + assert_eq!(flat, [0u8; 8], "empty input → all zeros"); + } + + #[test] + fn axes_to_flat_ignores_unknown_keys() { + let mut axes = HashMap::new(); + axes.insert("expansion".to_string(), 5); + axes.insert("nonsense_axis".to_string(), 99); // should be ignored + axes.insert("magic".to_string(), 42); // reserved slot, not named + let flat = axes_to_flat(&axes); + assert_eq!(flat[0], 5); + assert!( + !flat.contains(&42) && !flat.contains(&99), + "unknown keys must not leak into slots: {flat:?}" + ); + } + + #[test] + fn flat_to_axes_decodes_only_named_slots() { + let flat = [7u8, 3, 9, 1, 99, 99, 99, 99]; // slots 4-7 poisoned + let axes = flat_to_axes(&flat); + assert_eq!(axes.len(), 4, "only 4 named slots must round-trip"); + assert_eq!(axes.get("expansion"), Some(&7)); + assert_eq!(axes.get("production"), Some(&3)); + assert_eq!(axes.get("wealth"), Some(&9)); + assert_eq!(axes.get("culture"), Some(&1)); + // Reserved slots 4-7 must not appear under any string key. 
+ assert!(!axes.values().any(|&v| v == 99)); + } + + #[test] + fn axes_round_trip_preserves_named_values() { + // The only claim we make is round-trip fidelity for the named axes. + // This is the GPU upload's canonical invariant. + let mut axes = HashMap::new(); + axes.insert("expansion".to_string(), 4); + axes.insert("production".to_string(), 8); + axes.insert("wealth".to_string(), 2); + axes.insert("culture".to_string(), 6); + let flat = axes_to_flat(&axes); + let back = flat_to_axes(&flat); + assert_eq!(back.get("expansion"), Some(&4)); + assert_eq!(back.get("production"), Some(&8)); + assert_eq!(back.get("wealth"), Some(&2)); + assert_eq!(back.get("culture"), Some(&6)); + } + + // ── StrategicWeights ───────────────────────────────────────────────── + + #[test] + fn strategic_weights_neutral_is_balanced() { + let w = StrategicWeights::neutral(); + for &(label, v) in &[ + ("aggression", w.aggression), + ("expansion", w.expansion), + ("research", w.research), + ("defense", w.defense), + ("economy", w.economy), + ] { + assert!( + (0.0..=1.0).contains(&v), + "{label} neutral weight {v} out of [0,1]" + ); + assert!( + (v - 0.5).abs() < 1e-6, + "{label} neutral must be 0.5, got {v}" + ); + } + } + + #[test] + fn strategic_weights_from_race_axes_normalizes_to_0_1() { + // Extreme inputs: -10 → 0.0, +10 → 1.0, 0 → 0.5. 
+ let mut axes = HashMap::new(); + axes.insert("expansion".to_string(), 10); + axes.insert("wealth".to_string(), -10); + axes.insert("culture".to_string(), 0); + let w = StrategicWeights::from_race_axes(&axes); + + assert!((w.expansion - 1.0).abs() < 1e-6, "expansion=+10 → 1.0, got {}", w.expansion); + assert!((w.aggression - 1.0).abs() < 1e-6, "aggression tracks expansion, got {}", w.aggression); + assert!((w.economy - 0.0).abs() < 1e-6, "wealth=-10 → economy 0.0, got {}", w.economy); + // defense = max(1 - expansion, 0.2) = max(0, 0.2) = 0.2 floor + assert!((w.defense - 0.2).abs() < 1e-6, "defense floor 0.2 when expansion=1.0, got {}", w.defense); + // research = (culture + wealth) / 2 = (0.5 + 0) / 2 = 0.25 + assert!((w.research - 0.25).abs() < 1e-6, "research is (culture+wealth)/2, got {}", w.research); + } + + #[test] + fn strategic_weights_from_race_axes_handles_missing_keys() { + // Missing keys default to 0 (which normalizes to 0.5), so neutral-ish. + let axes: HashMap = HashMap::new(); + let w = StrategicWeights::from_race_axes(&axes); + for v in [w.aggression, w.expansion, w.research, w.economy] { + assert!((v - 0.5).abs() < 1e-6, "missing-key default must be 0.5, got {v}"); + } + // defense floor clamps at 0.2 — but at expansion=0.5, 1-0.5=0.5 wins. + assert!((w.defense - 0.5).abs() < 1e-6, "defense {}; expected 0.5 when expansion=0.5", w.defense); + } + + #[test] + fn strategic_weights_from_race_axes_clamps_out_of_range() { + // Inputs beyond [-10, +10] should be clamped, not panic or produce NaN. 
+ let mut axes = HashMap::new(); + axes.insert("expansion".to_string(), 99); + axes.insert("wealth".to_string(), -99); + let w = StrategicWeights::from_race_axes(&axes); + for v in [w.aggression, w.expansion, w.research, w.defense, w.economy] { + assert!(v.is_finite(), "weight must be finite, got {v}"); + assert!((0.0..=1.0).contains(&v), "weight {v} out of [0,1]"); + } + } +} diff --git a/src/simulator/crates/mc-ai/src/mcts_tree.rs b/src/simulator/crates/mc-ai/src/mcts_tree.rs index 8638b500..b5e21f94 100644 --- a/src/simulator/crates/mc-ai/src/mcts_tree.rs +++ b/src/simulator/crates/mc-ai/src/mcts_tree.rs @@ -255,3 +255,211 @@ where } score_fn(&s) } + +#[cfg(test)] +mod tests { + //! Unit tests for the generic tree engine over a toy `CoinState` — these + //! exercise UCB1 selection, expansion invariants, backprop, and parallel- + //! rollout determinism without needing the full `GameRolloutState` impl + //! (that lives in `tests/mcts_basic.rs` as an integration test). + + use super::*; + + /// Toy two-action state: heads/tails. Terminal after `max_depth` flips. + /// No reward logic of its own: `rollout` is left at the `TreeState` default stub. 
+ #[derive(Clone, Debug)] + struct CoinState { + flips: Vec, + max_depth: usize, + } + + impl CoinState { + fn new(max_depth: usize) -> Self { + Self { flips: Vec::new(), max_depth } + } + } + + impl TreeState for CoinState { + type Action = bool; + + fn legal_actions(&self) -> Vec { + if self.flips.len() >= self.max_depth { Vec::new() } else { vec![true, false] } + } + + fn apply(&self, action: &bool) -> Self { + let mut next = self.clone(); + next.flips.push(*action); + next + } + } + + // ── Node / expansion invariants ────────────────────────────────────── + + #[test] + fn new_tree_has_root_with_all_legal_actions_untried() { + let t = Tree::new(CoinState::new(3)); + assert_eq!(t.nodes.len(), 1, "root-only tree has exactly 1 node"); + assert_eq!(t.root().untried.len(), 2, "root has 2 untried actions (H, T)"); + assert!(t.root().children.is_empty(), "root has no children yet"); + assert_eq!(t.root().visits, 0); + assert_eq!(t.root().wins, 0.0); + } + + #[test] + fn expand_drains_untried_and_adds_child() { + let mut t = Tree::new(CoinState::new(3)); + let c1 = t.expand(0).expect("first expand must succeed"); + assert_eq!(t.root().untried.len(), 1, "one action should remain untried"); + assert_eq!(t.root().children, vec![c1], "child index tracked"); + assert_eq!(t.nodes[c1].parent, Some(0)); + + let c2 = t.expand(0).expect("second expand must succeed"); + assert!(t.root().untried.is_empty(), "fully expanded after 2 expands"); + assert_eq!(t.root().children, vec![c1, c2]); + + assert!(t.expand(0).is_none(), "third expand must return None"); + } + + #[test] + fn expand_applies_action_to_produce_child_state() { + let mut t = Tree::new(CoinState::new(3)); + let c = t.expand(0).unwrap(); + // The pushed action determines the child — `untried` pops from the end, + // so it's the LAST of `legal_actions()`. 
+ let applied_action = t.nodes[c].action.expect("child must carry its action"); + assert_eq!(t.nodes[c].state.flips, vec![applied_action]); + } + + // ── UCB1 selection ─────────────────────────────────────────────────── + + #[test] + fn ucb1_returns_infinity_for_unvisited_child() { + // The tree MUST visit unvisited children before exploiting — this is + // the UCB1 contract (n=0 ⇒ ∞ score). Assert via an unvisited node. + let mut t = Tree::new(CoinState::new(3)); + let c1 = t.expand(0).unwrap(); + let c2 = t.expand(0).unwrap(); + // Parent has 2 visits, c1 has 0, c2 has 0 — both should be +INF. + t.nodes[0].visits = 2; + let log_n = 2.0f32.ln(); + let s1 = t.ucb1(c1, log_n); + let s2 = t.ucb1(c2, log_n); + assert!(s1.is_infinite() && s1 > 0.0); + assert!(s2.is_infinite() && s2 > 0.0); + } + + #[test] + fn ucb1_prefers_higher_average_reward() { + let mut t = Tree::new(CoinState::new(3)); + let c1 = t.expand(0).unwrap(); + let c2 = t.expand(0).unwrap(); + // Both visited N times; c1 has higher wins. 
+ t.nodes[c1].visits = 10; t.nodes[c1].wins = 9.0; // 90% avg + t.nodes[c2].visits = 10; t.nodes[c2].wins = 3.0; // 30% avg + t.nodes[0].visits = 20; + let log_n = 20.0f32.ln(); + assert!(t.ucb1(c1, log_n) > t.ucb1(c2, log_n)); + } + + // ── Backpropagation ────────────────────────────────────────────────── + + #[test] + fn backpropagate_increments_visits_and_wins_to_root() { + let mut t = Tree::new(CoinState::new(3)); + let c = t.expand(0).unwrap(); + t.backpropagate(c, 0.7); + assert_eq!(t.nodes[c].visits, 1); + assert!((t.nodes[c].wins - 0.7).abs() < 1e-6); + assert_eq!(t.root().visits, 1, "root visits += 1"); + assert!((t.root().wins - 0.7).abs() < 1e-6, "root wins += 0.7"); + } + + #[test] + fn backpropagate_accumulates_across_calls() { + let mut t = Tree::new(CoinState::new(3)); + let c = t.expand(0).unwrap(); + t.backpropagate(c, 0.2); + t.backpropagate(c, 0.6); + t.backpropagate(c, 1.0); + assert_eq!(t.nodes[c].visits, 3); + assert!((t.nodes[c].wins - 1.8).abs() < 1e-6); + assert_eq!(t.root().visits, 3); + assert!((t.root().wins - 1.8).abs() < 1e-6); + } + + // ── simulate_parallel determinism contract ────────────────────────── + + #[test] + fn simulate_parallel_is_seed_deterministic_across_repeated_calls() { + // Backprop order must be rollout-index-order (NOT thread-scheduling + // order) so wins totals come out identical on repeated runs with + // the same base_seed. If the sort-by-index step inside + // `simulate_parallel` is ever removed, this test catches it. + let count = 16; + let rollout_fn = |_s: &CoinState, rng: &mut XorShift64| -> f32 { + // Deterministic-from-seed reward so parallelism can't hide + // non-determinism behind rng variance. 
+ (rng.next_u64() as f32 / u64::MAX as f32).abs() + }; + let mut t1 = Tree::new(CoinState::new(3)); + t1.simulate_parallel(count, 42, rollout_fn); + let mut t2 = Tree::new(CoinState::new(3)); + t2.simulate_parallel(count, 42, rollout_fn); + assert_eq!(t1.root().visits, t2.root().visits, "visit counts must match"); + assert!( + (t1.root().wins - t2.root().wins).abs() < 1e-5, + "wins must match: {} vs {}", t1.root().wins, t2.root().wins + ); + } + + #[test] + fn simulate_parallel_noop_on_zero_rollouts() { + let mut t = Tree::new(CoinState::new(3)); + t.simulate_parallel(0, 42, |_, _| 0.5); + assert_eq!(t.root().visits, 0, "zero rollouts should not touch tree"); + } + + // ── rollout_snapshot helper ───────────────────────────────────────── + + #[test] + fn rollout_snapshot_walks_depth_steps_and_scores() { + // Incrementing counter state — each step +1. Assert depth of walk. + let start = 0u32; + let mut rng = XorShift64::new(42); + let r = rollout_snapshot( + &start, + &mut rng, + 5, + &|s: &u32, _d: u32, _rng: &mut XorShift64| s + 1, + &|s: &u32| *s as f32 / 10.0, + ); + // 5 steps → counter = 5 → score = 0.5 + assert!((r - 0.5).abs() < 1e-6, "expected 0.5, got {r}"); + } + + #[test] + fn rollout_snapshot_returns_initial_score_at_depth_zero() { + let start = 7u32; + let mut rng = XorShift64::new(1); + let r = rollout_snapshot( + &start, + &mut rng, + 0, // no steps + &|s: &u32, _d, _rng| s + 1, + &|s: &u32| *s as f32, + ); + assert!((r - 7.0).abs() < 1e-6, "depth=0 should return score(initial)"); + } + + // ── rollout() default stub ───────────────────────────────────────── + + #[test] + fn treestate_default_rollout_returns_stub_half() { + let state = CoinState::new(3); + let mut rng = XorShift64::new(99); + // Default impl returns 0.5 — this is the historical stub. When + // `GameRolloutState` overrides rollout, this test still passes + // for toy states that leave the default. 
+ assert!((state.rollout(&mut rng, 20, 1.0, 0) - 0.5).abs() < 1e-6); + } +} diff --git a/src/simulator/crates/mc-ai/tests/ultimate_lookahead_stress.rs b/src/simulator/crates/mc-ai/tests/ultimate_lookahead_stress.rs new file mode 100644 index 00000000..03b3c65a --- /dev/null +++ b/src/simulator/crates/mc-ai/tests/ultimate_lookahead_stress.rs @@ -0,0 +1,335 @@ +//! Ultimate AI lookahead stress test. +//! +//! The user's "ultimate test" is an 8-player huge-map game with all 5 +//! personalities competing, stressing the AI lookahead (MCTS + GPU batched +//! rollouts). That end-to-end test lives in +//! `tools/ultimate-game.sh` (requires a working RUN host). +//! +//! THIS file is the in-process companion: it exercises the same code paths +//! — personality priors and the rollout walker (on CPU; no GPU dispatch here) — against a +//! synthetic 8-player configuration, without needing the game binary. +//! It catches regressions in the lookahead pipeline itself (tree depth, +//! rollout determinism, batched CPU throughput, per-clan divergence at scale) +//! independently of any host-level infrastructure. Runs in under a second. +//! +//! Scope: this is a STRESS test, not a correctness test. Correctness is +//! covered by the parity / policy / rollout tests in sibling files. Here we +//! assert the lookahead pipeline SCALES to the "ultimate" configuration: +//! - 8-player abstract state packs into the fixed POD layout +//! - Per-player personality priors from the 5-clan rotation are honored +//! - Walker horizon reaches depth >= 20 without panic or overflow +//! - A 256-entry batch walks on CPU inside a 1s budget (GPU: see parity test) +//! - Rollout results are seed-deterministic across repeated invocations +//! +//! Pre-existing bullet order (user): "ultimate test should be AFTER all +//! 5 personalities (permutations of 1v1) have had balanced match-ups". The +//! balanced-matchup gate is `tools/matchup-grid.sh` + `checklist-report.py +//! matchup_balance`. 
This file deliberately operates at the abstract-state +//! layer so it runs IN the `cargo test` cycle — fast feedback. + +use mc_ai::abstract_state::{AbstractPlayerState, AbstractRolloutState, MAX_PLAYERS}; +use mc_ai::mcts::XorShift64; +use mc_ai::policy::PersonalityPriors; +use mc_ai::rollout::{walk, GameRolloutState, DEFAULT_ROLLOUT_TEMPERATURE}; +use std::collections::HashMap; +use std::path::PathBuf; +use std::time::Instant; + +fn data_dir() -> PathBuf { + let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest + .ancestors() + .nth(4) + .expect("mc-ai crate must sit four dirs below repo root") + .join("public") + .join("games") + .join("age-of-dwarves") + .join("data") +} + +/// Build eight `[PersonalityPriors; MAX_PLAYERS]` entries rotating through the +/// five clans: entry `i`, slot `s` holds clan `(i + s) % 5`. +/// Entries 0..5 put each of the 5 clans in slot 0 in a fixed order; entries +/// 5..8 wrap back around — the goal is coverage, not uniqueness — so an +/// 8-entry batch has every clan in the root slot at least once. +fn eight_player_clan_rotation() -> [[PersonalityPriors; MAX_PLAYERS]; 8] { + let data = data_dir(); + let clans = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"]; + let loaded: Vec<_> = clans + .iter() + .map(|id| { + PersonalityPriors::from_personality(id, &data) + .unwrap_or_else(|e| panic!("failed to load clan {id}: {e:?}")) + }) + .collect(); + // For the stress test we only rotate the "acting" player slot (POD is + // 4-slot per entry; MAX_PLAYERS=4). Each of the 8 "entries" represents + // one player in an 8-player game with a different root clan. + let mut entries = [[loaded[0]; MAX_PLAYERS]; 8]; + for (i, entry) in entries.iter_mut().enumerate() { + // The root player (slot 0 in this entry's POD) rotates through + // the 5 clans; other slots fill in-order from the remaining clans + // so every entry has 4 distinct clan priors. 
+ for slot in 0..MAX_PLAYERS { + entry[slot] = loaded[(i + slot) % clans.len()]; + } + } + entries +} + +/// 8-player large-map fixture. Each of the 8 entries represents one active +/// AI in an 8-player game. Gives every AI enough resources to exercise all +/// 9 ActionKinds (Build / Attack / Settle / Research / Defend / Trade / +/// ContinueWar / MakePeace / Idle). +fn eight_player_batch() -> Vec { + (0..8) + .map(|i| { + let mut state = AbstractRolloutState::zeroed(); + // Player 0 (the acting / root player): well-resourced to sustain + // the rollout walker through its full horizon. + state.players[0] = AbstractPlayerState { + gold: 200 + (i as i32) * 10, + science: 30 + (i as i32) * 2, + pop_total: 10, + city_count: 2, + tech_index: 5, + unit_counts: [3, 2, 1, 0], + happiness_pool: 5, + _pad0: 0, + force_rel: [0, 20, 10, 5], // enables Attack + ContinueWar + axes: [5; 8], + relations: [0, -1, 0, 0], // enables MakePeace + _pad1: [0; 4], + rng_state: 0xAAAA_BBBB_CCCC_DDDD ^ (i as u64), + turn: 1, + _pad2: [0; 4], + }; + // Opponents: smaller footprint but present. Exercises the + // rollout walker's opponent-iteration paths. + for slot in 1..MAX_PLAYERS { + state.players[slot] = AbstractPlayerState { + gold: 50, + science: 10, + pop_total: 5, + city_count: 1, + tech_index: 2, + unit_counts: [1, 1, 0, 0], + happiness_pool: 0, + _pad0: 0, + force_rel: [5, 0, 5, 5], + axes: [5; 8], + relations: [0, 0, 0, 0], + _pad1: [0; 4], + rng_state: 0x1111_2222_3333_4444 ^ (slot as u64) ^ (i as u64), + turn: 1, + _pad2: [0; 4], + }; + } + state + }) + .collect() +} + +// ── Shape + determinism gates ────────────────────────────────────────── + +#[test] +fn clan_rotation_covers_all_five_personalities() { + // All 5 clans must appear as a root player (slot 0) across the 8 entries. 
+ let rotation = eight_player_clan_rotation(); + let mut seen_aggression: std::collections::BTreeSet = std::collections::BTreeSet::new(); + for entry in &rotation { + // Quantize the aggression axis to an integer so float equality isn't + // a concern — the 5 clans have 5 distinct aggression scores. + seen_aggression.insert(entry[0].aggression as i32); + } + assert!( + seen_aggression.len() >= 5, + "8-player rotation must surface all 5 clans as root; saw {} distinct aggression values: {:?}", + seen_aggression.len(), + seen_aggression + ); +} + +#[test] +fn eight_player_fixture_packs_into_fixed_pod_size() { + // The POD is 256 bytes regardless of how many logical players the game + // has — extra players live in adjacent entries, not wider slots. Assert + // our fixture respects that contract. + use std::mem::size_of; + assert_eq!(size_of::(), 256); + let batch = eight_player_batch(); + assert_eq!(batch.len(), 8, "8-player stress fixture"); + // Every entry is exactly 256 bytes — no accidental Vec or heap indirection. + assert_eq!( + batch.iter().map(|_| size_of::()).sum::(), + 256 * 8 + ); +} + +#[test] +fn walker_reaches_full_horizon_on_eight_player_configuration() { + // The walker MUST NOT break early on a healthy 8-player config. If it + // does, we're losing deep rollouts — which is exactly what the "stress + // lookahead" acceptance is measuring. 
+ let batch = eight_player_batch(); + let priors_per_entry = eight_player_clan_rotation(); + let horizon = 20u32; + + for (i, (pod, priors)) in batch.iter().zip(priors_per_entry.iter()).enumerate() { + let state = GameRolloutState::from_abstract(*pod, *priors); + let mut rng = XorShift64::new(42 + i as u64); + let score = walk(&state, &mut rng, horizon, DEFAULT_ROLLOUT_TEMPERATURE, 0); + assert!( + score.is_finite() && (0.0..=1.0).contains(&score), + "entry {i} produced score {score} outside [0,1] — walker may have panicked or overflowed" + ); + } +} + +#[test] +fn eight_player_rollout_is_seed_deterministic() { + // Run the whole 8-player batch twice with the same seeds; every score + // must match bit-for-bit (float equality is fine; walker is branchy + // but the arithmetic is additive + saturating, no non-deterministic ops). + let batch = eight_player_batch(); + let priors_per_entry = eight_player_clan_rotation(); + + let scores_a: Vec = batch + .iter() + .zip(priors_per_entry.iter()) + .enumerate() + .map(|(i, (pod, priors))| { + let state = GameRolloutState::from_abstract(*pod, *priors); + let mut rng = XorShift64::new(42 + i as u64); + walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0) + }) + .collect(); + + let scores_b: Vec = batch + .iter() + .zip(priors_per_entry.iter()) + .enumerate() + .map(|(i, (pod, priors))| { + let state = GameRolloutState::from_abstract(*pod, *priors); + let mut rng = XorShift64::new(42 + i as u64); + walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0) + }) + .collect(); + + assert_eq!(scores_a, scores_b, "same-seed 8-player walk must be bit-deterministic"); +} + +// ── Scale + throughput gate ──────────────────────────────────────────── + +#[test] +fn deep_stress_batch_256_entries_finishes_in_under_one_second() { + // Scale gate: in a real 8-player game, a single MCTS expansion might + // dispatch 256+ rollouts in a batch. This test asserts that scale works + // on CPU (GPU is covered by the parity test). 
If someone accidentally + // introduces an O(N²) step, this test blows past the 1-second budget + // and fails loudly. + // + // 256 entries × 20-turn horizon × ~9 actions/turn ≈ 50k operations. On + // a debug build this typically runs in ~100ms. + let rotation = eight_player_clan_rotation(); + let base_priors = rotation[0]; + let mut batch = Vec::with_capacity(256); + for i in 0..256 { + let mut state = AbstractRolloutState::zeroed(); + state.players[0].gold = 100 + i; + state.players[0].pop_total = 5; + state.players[0].city_count = 1; + state.players[0].force_rel = [0, 20, 0, 0]; + state.players[0].relations = [0, -1, 0, 0]; + state.players[0].rng_state = 0x1234_5678_9ABC_DEF0u64.wrapping_add(i as u64); + batch.push(state); + } + + let start = Instant::now(); + let mut total = 0.0f64; + for (i, pod) in batch.iter().enumerate() { + let state = GameRolloutState::from_abstract(*pod, base_priors); + let mut rng = XorShift64::new(42u64 + i as u64); + total += walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0) as f64; + } + let elapsed = start.elapsed(); + + assert!( + total > 0.0, + "aggregate score {total} non-positive — walker outputs look broken" + ); + assert!( + elapsed.as_secs_f32() < 1.0, + "256-entry stress batch took {:?} (>1s budget); possible O(N²) regression", + elapsed + ); +} + +// ── Clan divergence at 8-player scale ───────────────────────────────── + +#[test] +fn eight_player_clan_divergence_preserves_personality_signal() { + // The "skillful clan personality" claim in p0-02 means that per-clan + // action biases persist even in 8-player configurations — NOT just in + // fixture 1v1s. This test takes the same 8-player POD, runs it under + // Ironhold vs Blackhammer priors, and asserts the final scores differ. + // If scores collapse to identical values, either the priors aren't + // flowing into the rollout or the walker is ignoring them. 
+ let data = data_dir(); + let iron = PersonalityPriors::from_personality("ironhold", &data).unwrap(); + let black = PersonalityPriors::from_personality("blackhammer", &data).unwrap(); + + let mut pod = AbstractRolloutState::zeroed(); + pod.players[0].gold = 500; + pod.players[0].pop_total = 8; + pod.players[0].city_count = 2; + pod.players[0].force_rel = [0, 30, 20, 10]; + pod.players[0].relations = [0, -1, 0, 0]; + pod.players[0].rng_state = 0xDEAD_BEEF_CAFE_F00D; + + let iron_state = GameRolloutState::from_abstract(pod, [iron; MAX_PLAYERS]); + let black_state = GameRolloutState::from_abstract(pod, [black; MAX_PLAYERS]); + + // Use a fixed seed so ONLY the prior differences influence the output. + let mut iron_rng = XorShift64::new(7); + let mut black_rng = XorShift64::new(7); + let iron_score = walk(&iron_state, &mut iron_rng, 30, DEFAULT_ROLLOUT_TEMPERATURE, 0); + let black_score = walk(&black_state, &mut black_rng, 30, DEFAULT_ROLLOUT_TEMPERATURE, 0); + + assert!( + (iron_score - black_score).abs() > 1e-4, + "Ironhold and Blackhammer MUST produce measurably different walk scores \ + at 8-player scale (got iron={iron_score} black={black_score}). \ + If scores converge, the priors aren't flowing into the walker and the \ + 'skillful clan personality' claim is broken at scale." + ); +} + +// ── Guard: 5-clan pool as exported in ai_personalities.json ─────────── + +#[test] +fn ai_personalities_json_still_exports_exactly_five_clans() { + // Prerequisite for the user's "ultimate test" is the 1v1-balanced-matchup + // grid across all 5 personalities. If someone adds a 6th clan to + // ai_personalities.json without also updating the matchup grid + // harness (tools/matchup-grid.sh), this test fails loudly. 
+ let json_path = data_dir().join("ai_personalities.json"); + let text = std::fs::read_to_string(&json_path) + .unwrap_or_else(|e| panic!("failed to read {json_path:?}: {e}")); + let map: HashMap = serde_json::from_str(&text) + .unwrap_or_else(|e| panic!("{json_path:?} is not valid JSON: {e}")); + let expected = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"]; + assert_eq!( + map.len(), + 5, + "expected exactly 5 clans in ai_personalities.json, found {}: {:?}", + map.len(), + map.keys().collect::>() + ); + for id in &expected { + assert!( + map.contains_key(*id), + "ai_personalities.json missing expected clan {id}" + ); + } +} diff --git a/tools/ci-autoplay-smoke.sh b/tools/ci-autoplay-smoke.sh new file mode 100755 index 00000000..c4d49348 --- /dev/null +++ b/tools/ci-autoplay-smoke.sh @@ -0,0 +1,144 @@ +#!/usr/bin/env bash +# ci-autoplay-smoke.sh — Hang-regression smoke test for the autoplay pipeline. +# +# Runs one seeded T100 autoplay with a hard wall-clock budget and asserts the +# final `turn_stats.jsonl` entry has `outcome != "in_progress"`. Catches any +# class of hang — whether the root cause is in Godot (signal re-entry, main- +# loop stall), in Rust (MCTS deadlock, combat infinite loop), or in the +# shell harness (pkill substring collision, missing SAFETY timeout). +# +# Regression history: +# 2026-04-17 loop13 — PARALLEL=10 T300 hung all 10 seeds because +# `run_ap3.sh`'s cleanup `pkill -f "AUTO_PLAY_DIR="` substring-matched +# active sibling seeds whose paths shared a numeric prefix (seed1 → seed10). +# Fixed by switching to a unique per-run AP_RUN_ID token. This smoke test +# would have caught the hang immediately in `./run verify` because the +# victim game's `outcome` stays "in_progress" after SIGTERM. +# +# Usage: +# tools/ci-autoplay-smoke.sh # default seed=1, T100, 180s budget +# tools/ci-autoplay-smoke.sh <seed> <turns> # custom seed/turns (positional: $1=seed, $2=turns) +# +# Environment: +# AUTOPLAY_HOST — if set, run via SSH on that host (e.g. 
apricot) +# PROJECT_ROOT_REMOTE — repo path on RUN host (default: $HOME/Code/…; NOTE(review): the default reaches the remote shell inside single quotes, so $HOME looks unexpanded there — confirm) +# SMOKE_WALL_BUDGET_SEC — hard wall-clock budget (default: 180) +# SMOKE_KEEP_OUTPUT — "1" to keep .local/ci-smoke/ results dir after test +# +# Exit codes: +# 0 — game finished with a terminal outcome (victory | max_turns | defeat), or the local run was skipped (no flatpak) +# 1 — wall-clock timeout, outcome still "in_progress", unrecognised outcome, OR no turn_stats produced +# 2 — reserved for bad arguments / SSH / environment failure; NOTE(review): no path in this script exits 2 yet — confirm intent +# +# Hook into ./run verify per p0-10 hang-regression mandate. + +set -uo pipefail + +SEED="${1:-1}" +TURNS="${2:-100}" +BUDGET="${SMOKE_WALL_BUDGET_SEC:-180}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +STAMP="$(date +%Y%m%d_%H%M%S)" +RESULTS_DIR="$PROJECT_DIR/.local/ci-smoke/smoke_${STAMP}_seed${SEED}" +mkdir -p "$RESULTS_DIR" + +echo "[ci-autoplay-smoke] seed=$SEED turns=$TURNS budget=${BUDGET}s" +echo "[ci-autoplay-smoke] results: $RESULTS_DIR" + +_cleanup() { + if [ "${SMOKE_KEEP_OUTPUT:-0}" != "1" ]; then + rm -rf "$RESULTS_DIR" 2>/dev/null || true + fi +} +trap _cleanup EXIT + +_fail() { + echo "[ci-autoplay-smoke] FAIL: $*" >&2 + exit 1 +} + +# ── Run autoplay ───────────────────────────────────────────────────────────── + +if [ -n "${AUTOPLAY_HOST:-}" ]; then + # Remote path — use the same runner autoplay-batch.sh uses. + REMOTE_ROOT="${PROJECT_ROOT_REMOTE:-\$HOME/Code/@projects/@magic-civilization}" + REMOTE_DIR="${REMOTE_ROOT}/.local/ci-smoke/smoke_${STAMP}_seed${SEED}" + REMOTE_RUNNER="${REMOTE_RUNNER:-\$HOME/bin/run_ap3.sh}" + RUN_ID="ci_smoke_${STAMP}_seed${SEED}" + + ssh "$AUTOPLAY_HOST" " + set -uo pipefail + mkdir -p '$REMOTE_DIR' + AUTO_PLAY=true \ + AUTO_PLAY_SEED='$SEED' \ + AUTO_PLAY_TURN_LIMIT='$TURNS' \ + AUTO_PLAY_DIR='$REMOTE_DIR' \ + AP_RUN_ID='$RUN_ID' \ + timeout '$BUDGET' bash $REMOTE_RUNNER + " >"$RESULTS_DIR/game.log" 2>&1 + REMOTE_EXIT=$? + + # Pull turn_stats + meta back. 
The remote auto_play writes either into + # the AUTO_PLAY_DIR directly (if the caller named it `game__seed`) + # or into a `game_*` subdir. ssh-cat handles both shapes — globbing via + # scp's non-quoted path ran into login-shell variations. + ssh "$AUTOPLAY_HOST" "find '$REMOTE_DIR' -maxdepth 3 -name turn_stats.jsonl -print0 | xargs -0 -I{} cat {}" \ + >"$RESULTS_DIR/turn_stats.jsonl" 2>/dev/null || true + ssh "$AUTOPLAY_HOST" "find '$REMOTE_DIR' -maxdepth 3 -name meta.json -print0 | xargs -0 -I{} cat {}" \ + >"$RESULTS_DIR/meta.json" 2>/dev/null || true + + if [ "$REMOTE_EXIT" -eq 124 ]; then + _fail "autoplay timed out after ${BUDGET}s — hang regression detected (SSH timeout path)" + fi +else + # Local path — flatpak Godot, Linux only. + if ! command -v flatpak >/dev/null 2>&1; then + echo "[ci-autoplay-smoke] SKIP: no flatpak locally and AUTOPLAY_HOST unset" + exit 0 + fi + cd "$PROJECT_DIR/src/game" + timeout "$BUDGET" flatpak run --user \ + --filesystem=home \ + --env=AUTO_PLAY=true \ + --env=AUTO_PLAY_SEED="$SEED" \ + --env=AUTO_PLAY_TURN_LIMIT="$TURNS" \ + --env=AUTO_PLAY_DIR="$RESULTS_DIR" \ + --env=AP_RUN_ID="ci_smoke_${STAMP}_seed${SEED}" \ + org.godotengine.Godot --path . --rendering-method gl_compatibility --headless \ + >"$RESULTS_DIR/game.log" 2>&1 + LOCAL_EXIT=$? + if [ "$LOCAL_EXIT" -eq 124 ]; then + _fail "autoplay timed out after ${BUDGET}s — hang regression detected" + fi +fi + +# ── Assert terminal outcome ────────────────────────────────────────────────── + +STATS_FILE="$(find "$RESULTS_DIR" -name 'turn_stats.jsonl' -type f 2>/dev/null | head -1)" +if [ -z "$STATS_FILE" ] || [ ! 
-s "$STATS_FILE" ]; then + _fail "no turn_stats.jsonl produced (autoplay never wrote a turn line)" +fi + +LAST_OUTCOME="$(tail -1 "$STATS_FILE" | python3 -c " +import json, sys +try: + d = json.loads(sys.stdin.read()) + print(d.get('outcome', 'missing')) +except Exception as e: + print('parse_error') +")" + +case "$LAST_OUTCOME" in + victory|max_turns|defeat) + echo "[ci-autoplay-smoke] PASS — outcome=$LAST_OUTCOME" + exit 0 + ;; + in_progress) + _fail "outcome=in_progress — game hung mid-run (see $STATS_FILE)" + ;; + *) + _fail "outcome=$LAST_OUTCOME — unexpected terminal state (see $STATS_FILE)" + ;; +esac