feat(rl-self-play): ✨ Add stochastic evaluation with masked softmax sampling to replace deterministic argmax in RL self-play training

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-05-17 06:55:12 -07:00 · 2026-05-17 06:55:12 -07:00 · e5a2a37d0e
commit e5a2a37d0e
parent b82e4a8fbd
1 changed files with 7 additions and 1 deletions
--- a/tooling/rl_self_play/train.py
+++ b/tooling/rl_self_play/train.py
@@ -132,7 +132,13 @@ def main() -> int:
        log_path=str(run_dir / "eval"),
        eval_freq=max(args.eval_freq // args.num_envs, 1),
        n_eval_episodes=args.eval_episodes,
-        deterministic=True,
+        # Stochastic eval: a barely-trained net's argmax over the
+        # 322-dim action head has ~zero chance of being end_turn (idx 0),
+        # so deterministic eval episodes never advance past turn 0 and
+        # every one of them hits step_cap with reward 0. Sampling from
+        # the masked softmax keeps end_turn reachable until the policy
+        # has consolidated enough mass on a real strategy.
+        deterministic=False,
        render=False,
    )