feat(rl-self-play): Add stochastic evaluation with masked softmax sampling to replace deterministic argmax in RL self-play training

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
autocommit 2026-05-17 06:55:12 -07:00
parent b82e4a8fbd
commit e5a2a37d0e

View file

@@ -132,7 +132,13 @@ def main() -> int:
log_path=str(run_dir / "eval"),
eval_freq=max(args.eval_freq // args.num_envs, 1),
n_eval_episodes=args.eval_episodes,
deterministic=True,
# Stochastic eval: a barely-trained net's argmax over the
# 322-dim action head has ~zero chance of being end_turn (idx 0),
# so deterministic eval episodes never advance past turn 0 and
# all 10 hit step_cap with reward 0. Sampling from the masked
# softmax keeps end_turn reachable until the policy has
# consolidated enough mass on a real strategy.
deterministic=False,
render=False,
)