diff --git a/scripts/rl-train.sh b/scripts/rl-train.sh
index 20532859..3c3f378d 100755
--- a/scripts/rl-train.sh
+++ b/scripts/rl-train.sh
@@ -86,28 +86,65 @@ case "$cmd" in
     ;;
 
   launch)
+    # Launch as a transient systemd --user .service under heavy-tests.slice.
+    # The slice (CPUWeight=20, MemoryMax=32G, TasksMax=4096) prevents the godot
+    # workers spawned by the python parent from starving sshd/interactive work.
+    # Every child process (flatpak, bwrap, godot-bin) inherits the cgroup, so a
+    # 3000-proc explosion stays contained — exactly the wedge mode seen on
+    # 2026-05-18 and 2026-05-19.
+    #
+    # Unit name includes epoch so re-launches with the same RL_RUN_NAME don't
+    # collide with a stopped-but-not-yet-collected unit.
+    RL_UNIT="rl-train-${RL_RUN_NAME}-$(date +%s)"
     remote "
-      cd ${RL_WORKTREE} || exit 1
-      existing=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1}')
+      set -e
+      cd ${RL_WORKTREE}
+      existing=\$(systemctl --user list-units --type=service --no-legend --state=running 'rl-train-*' 2>/dev/null | awk '{print \$1}' | head -1)
       if [ -n \"\$existing\" ]; then
-        echo 'training already running; run kill first'
-        echo \"\$existing\"
+        echo \"training already running: \$existing — run 'kill' first\"
         exit 1
       fi
-      nohup python3 -m tooling.rl_self_play.train \
-        --device ${RL_DEVICE} --num-envs ${RL_ENVS} \
-        --total-steps ${RL_TOTAL_STEPS} --eval-freq ${RL_EVAL_FREQ} \
-        --eval-episodes ${RL_EVAL_EPS} --max-turns ${RL_MAX_TURNS} \
-        --run-name ${RL_RUN_NAME} > ${LOG_REMOTE} 2>&1 &
-      echo \$! > ${RL_PIDFILE}
+      systemd-run --user \\
+        --slice=heavy-tests.slice \\
+        --unit=${RL_UNIT} \\
+        --collect --quiet \\
+        --working-directory=${RL_WORKTREE} \\
+        --setenv=PYTHONUNBUFFERED=1 \\
+        --property=StandardOutput=append:${LOG_REMOTE} \\
+        --property=StandardError=append:${LOG_REMOTE} \\
+        -- python3 -m tooling.rl_self_play.train \\
+        --device ${RL_DEVICE} --num-envs ${RL_ENVS} \\
+        --total-steps ${RL_TOTAL_STEPS} --eval-freq ${RL_EVAL_FREQ} \\
+        --eval-episodes ${RL_EVAL_EPS} --max-turns ${RL_MAX_TURNS} \\
+        --run-name ${RL_RUN_NAME}
+      echo ${RL_UNIT} > ${RL_PIDFILE}
       sleep 3
-      ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/' \\
-        || (echo 'launch failed; check log'; tail -20 ${LOG_REMOTE})
+      # A unit that died within the grace period is inactive or already
+      # collected; report it explicitly rather than letting set -e abort silently.
+      if ! systemctl --user is-active --quiet ${RL_UNIT}; then
+        echo 'launch failed; check log'
+        tail -20 ${LOG_REMOTE}
+        exit 1
+      fi
+      systemctl --user status ${RL_UNIT} --no-pager --lines=0
+      systemctl --user show ${RL_UNIT} --property=MainPID
     "
     ;;
 
   kill)
+    # Stop all rl-train-* transient services. systemd cascades SIGTERM through
+    # the cgroup, then SIGKILL after TimeoutStopSec, reaping all godot children.
+    # Falls back to pkill for any procs not in a unit (legacy runs / orphans).
+    # --plain keeps the unit name in column 1: without it list-units prefixes
+    # failed units with a status bullet, which awk would pick up instead.
     remote "
+      units=\$(systemctl --user list-units --type=service --no-legend --plain 'rl-train-*' 2>/dev/null | awk '{print \$1}')
+      if [ -n \"\$units\" ]; then
+        echo \"stopping units:\"
+        echo \"\$units\"
+        echo \"\$units\" | xargs -r systemctl --user stop
+      fi
+      # Legacy / out-of-unit sweep
       pkill -f 'rl_self_play.train' 2>/dev/null || true
       sleep 5
       pkill -9 -f 'rl_self_play.train' 2>/dev/null || true