fix(@projects/@magic-civilization): 🐛 improve pid detection in rl scripts

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-05-17 05:28:24 -07:00 · 2026-05-17 05:28:24 -07:00 · 4a862b76fb
commit 4a862b76fb
parent a6f909a151
2 changed files with 15 additions and 6 deletions
--- a/scripts/rl-train.sh
+++ b/scripts/rl-train.sh
@@ -48,7 +48,7 @@ case "$cmd" in
    remote "
      set +e
      echo '---PYTHON PID---'
-      py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
+      py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
      if [ -z \"\$py\" ]; then
        echo 'no training process'
        echo '---EVAL DIR (${EVAL_DIR_REMOTE})---'
@@ -67,7 +67,7 @@ case "$cmd" in
  logs)
    n="${1:-60}"
    remote "
-      py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
+      py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
      if [ -z \"\$py\" ]; then echo 'no training process'; exit 1; fi
      tail -${n} \"\$(readlink /proc/\$py/fd/1)\"
    "
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -43,10 +43,17 @@ WIN_REWARD = 1.0
 LOSS_REWARD = -1.0
 DRAW_REWARD = 0.0

-# Per-Gym-step ceiling on micro-actions before forcing end_turn. Without
-# this, a policy stuck in a loop (e.g. fortify→unfortify→fortify) would
-# hang the env forever. 64 is generous for a duel game's per-turn budget.
-MAX_MICRO_ACTIONS_PER_TURN = 64
+# Hard ceiling on env.step() calls per episode. A policy that learned
+# "ending the turn lowers my reward" would otherwise produce episodes
+# of unbounded length (observed: 1.3M harness round-trips in a single
+# eval episode). A total-episode budget catches that without biasing
+# intra-turn behavior — players in late game with hundreds of units
+# legitimately have hundreds of micro-actions per turn, so a per-turn
+# cap would interfere with normal play. 50k bounds eval wall-clock to
+# ~17 min at 50 fps. Note that it sits below the theoretical worst
+# case (200 units * 200 turns * 5 acts/unit = 200k steps), so it
+# relies on real PPO eval games ending far earlier than that bound.
+DEFAULT_MAX_STEPS_PER_EPISODE = 50_000


 class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
@@ -59,10 +66,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        self,
        harness_config: HarnessConfig | None = None,
        max_turns: int = 200,
+        max_micro_actions_per_turn: int = DEFAULT_MAX_MICRO_ACTIONS_PER_TURN,
    ) -> None:
        super().__init__()
        self._config = harness_config or HarnessConfig()
        self._max_turns = max_turns
+        self._max_micro_actions_per_turn = max_micro_actions_per_turn
        self.observation_space = spaces.Box(
            low=-1e6, high=1e6, shape=(OBS_DIM,), dtype=np.float32
        )