feat(rl-self-play): ✨ Add stochastic evaluation with masked softmax sampling to replace deterministic argmax in RL self-play training
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
b82e4a8fbd
commit
e5a2a37d0e
1 changed file with 7 additions and 1 deletion
|
|
@@ -132,7 +132,13 @@ def main() -> int:
|
|||
log_path=str(run_dir / "eval"),
|
||||
eval_freq=max(args.eval_freq // args.num_envs, 1),
|
||||
n_eval_episodes=args.eval_episodes,
|
||||
deterministic=True,
|
||||
# Stochastic eval: a barely-trained net's argmax over the
|
||||
# 322-dim action head has ~zero chance of being end_turn (idx 0),
|
||||
# so deterministic eval episodes never advance past turn 0 and
|
||||
# all 10 hit step_cap with reward 0. Sampling from the masked
|
||||
# softmax keeps end_turn reachable until the policy has
|
||||
# consolidated enough mass on a real strategy.
|
||||
deterministic=False,
|
||||
render=False,
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue