fix(@projects/@magic-civilization): 🐛 improve pid detection in rl scripts
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
a6f909a151
commit
4a862b76fb
2 changed files with 15 additions and 6 deletions
|
|
@ -48,7 +48,7 @@ case "$cmd" in
|
|||
remote "
|
||||
set +e
|
||||
echo '---PYTHON PID---'
|
||||
py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
|
||||
py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
|
||||
if [ -z \"\$py\" ]; then
|
||||
echo 'no training process'
|
||||
echo '---EVAL DIR (${EVAL_DIR_REMOTE})---'
|
||||
|
|
@ -67,7 +67,7 @@ case "$cmd" in
|
|||
logs)
|
||||
n="${1:-60}"
|
||||
remote "
|
||||
py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
|
||||
py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
|
||||
if [ -z \"\$py\" ]; then echo 'no training process'; exit 1; fi
|
||||
tail -${n} \"\$(readlink /proc/\$py/fd/1)\"
|
||||
"
|
||||
|
|
|
|||
|
|
@ -43,10 +43,17 @@ WIN_REWARD = 1.0
|
|||
LOSS_REWARD = -1.0
|
||||
DRAW_REWARD = 0.0
|
||||
|
||||
# Per-Gym-step ceiling on micro-actions before forcing end_turn. Without
|
||||
# this, a policy stuck in a loop (e.g. fortify→unfortify→fortify) would
|
||||
# hang the env forever. 64 is generous for a duel game's per-turn budget.
|
||||
MAX_MICRO_ACTIONS_PER_TURN = 64
|
||||
# Hard ceiling on env.step() calls per episode. A policy that learned
|
||||
# "ending the turn lowers my reward" would otherwise produce episodes
|
||||
# of unbounded length (observed: 1.3M harness round-trips in a single
|
||||
# eval episode). A total-episode budget catches that without biasing
|
||||
# intra-turn behavior — players in late game with hundreds of units
|
||||
# legitimately have hundreds of micro-actions per turn, so a per-turn
|
||||
# cap would interfere with normal play. 50k bounds eval wall-clock to
|
||||
# ~17 min at 50 fps while sitting an order of magnitude above any
|
||||
# plausibly legitimate game length (200 units * 200 turns * 5 acts/unit
|
||||
# = 200k upper bound, but real PPO eval games end far earlier).
|
||||
DEFAULT_MAX_STEPS_PER_EPISODE = 50_000
|
||||
|
||||
|
||||
class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
||||
|
|
@ -59,10 +66,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
self,
|
||||
harness_config: HarnessConfig | None = None,
|
||||
max_turns: int = 200,
|
||||
max_micro_actions_per_turn: int = DEFAULT_MAX_MICRO_ACTIONS_PER_TURN,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self._config = harness_config or HarnessConfig()
|
||||
self._max_turns = max_turns
|
||||
self._max_micro_actions_per_turn = max_micro_actions_per_turn
|
||||
self.observation_space = spaces.Box(
|
||||
low=-1e6, high=1e6, shape=(OBS_DIM,), dtype=np.float32
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue