Build the linux .so/wasm once on a worker and let sim/test/AI runners fetch the
prebuilt artifact (keyed by git sha) instead of recompiling — N workers share
one build. Adds the magicciv-artifacts DO Space, rclone in the golden image, and:
- dist:publish build + upload builds/<sha>/{.so,wasm}
- dist:fetch download the prebuilt .so for HEAD's sha
- dist:sync git pull -> fetch prebuilt if published, else build
- dist:models share RL .onnx via the Space (push/pull/ls)
Complements sccache (compile cache) by caching final outputs. Creds via
RCLONE_S3_* env over ssh, never on worker disk/argv; degrades to build-on-worker
when creds/cache absent.
Also hardens the dispatch layer (pre-existing, affected test/build/render too):
- pass -i ~/.ssh/id_mc_fleet on dispatch ssh (don't rely on agent-loaded key)
- guard _dist_first_host against an empty / "fleet down" inventory
- drop ssh -n on heredoc-stdin verbs (it redirected stdin from /dev/null)
Proven end-to-end on DO: publish built a 43.9MB .so + wasm; dist:sync fetched it
in 2.8s (no rebuild).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
420 lines
20 KiB
Bash
Executable file
420 lines
20 KiB
Bash
Executable file
#!/usr/bin/env bash
|
||
# Distributed test/train dispatch — fan the iteration loop across the DigitalOcean
# test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the
# cmd_<verb>_<target> name-dispatch, so no edit to the top-level `run` is needed.
#
#   ./run dist:up <workers> [size] [region]               spin the fleet up
#   ./run dist:sim <games> [turn_limit] [--destroy-after]  fan a sim batch across it
#   ./run dist:train <total_steps> [--destroy-after]       fan an RL sweep across it
#   ./run dist:down                                        tear it down (zero cost)
#
# Requires: TF_VAR_do_token in env, terraform on PATH, and a coordinator with
# GNU coreutils (autoplay-batch.sh uses `realpath -m`).
|
||
|
||
# Terraform configuration directory for the test fleet, relative to the repo
# root; joined onto the repo root by _dist_tf and cmd_dist_check.
_DIST_TF_DIR_REL="infra/terraform/test-fleet"
|
||
|
||
# Print the absolute path of the repository root, resolved as two directories
# above the directory that contains this file.
_dist_repo_root() {
  local here
  here="$(dirname "${BASH_SOURCE[0]}")"
  (cd "$here/../.." && pwd)
}
|
||
|
||
# Run terraform against the test-fleet configuration directory, forwarding every
# argument unchanged. Exit status is terraform's.
_dist_tf() {
  local tf_dir
  tf_dir="$(_dist_repo_root)/$_DIST_TF_DIR_REL"
  terraform -chdir="$tf_dir" "$@"
}
|
||
|
||
# Echo one "<user>@<ip>" per line from the inventory, skipping comment lines
# (optionally indented '#') and blank / whitespace-only lines.
# Arguments: $1 - path to the inventory file
# Outputs:   the surviving lines on stdout; nothing if the file is missing
# Returns:   always 0 (a missing file or "no hosts" is not an error here)
_dist_read_hosts() {
  local inv="$1"
  # [[:space:]] is the POSIX ERE spelling; '\s' is a GNU extension that other
  # grep implementations are not required to honour. '--' protects against an
  # inventory path that starts with '-'.
  grep -vE '^[[:space:]]*(#|$)' -- "$inv" 2>/dev/null || true
}
|
||
|
||
# Block until each worker's cloud-init finishes — it copies the fleet key to the
# build user and git-pulls. DO's boot agent install delays runcmd 1-3 min, so the
# build user isn't ssh-able until then. We ssh as root (authorized immediately) to wait.
#
# For every inventory host: probe ssh as root up to 36 times (5 s apart, 8 s
# connect timeout), then run `cloud-init status --wait` on it.
# Outputs: one " waiting for <ip> cloud-init... ready|UNREACHABLE" line per host.
# Returns: 0 if there is no inventory file or every host became reachable,
#          1 if at least one host never accepted ssh.
_dist_wait_ready() {
  local root inv host ip attempt reachable unreachable=0
  # -n: ssh must not read stdin, which inside the loop is the host list itself.
  local -a ssh_base=(-n -o BatchMode=yes -i "$HOME/.ssh/id_mc_fleet")
  root="$(_dist_repo_root)"
  inv="$root/.local/fleet/inventory"
  [ -f "$inv" ] || return 0

  while IFS= read -r host; do
    ip="${host#*@}"
    printf ' waiting for %s cloud-init... ' "$ip"

    reachable=false
    for ((attempt = 1; attempt <= 36; attempt++)); do
      if ssh "${ssh_base[@]}" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=8 \
        "root@$ip" true 2>/dev/null; then
        reachable=true
        break
      fi
      sleep 5
    done

    if ! $reachable; then
      # Previously this fell through and printed "ready" for a dead host.
      echo "UNREACHABLE"
      echo "  $ip never accepted ssh as root after 36 attempts" >&2
      unreachable=$((unreachable + 1))
      continue
    fi

    # Best effort: the wait's own exit status is deliberately ignored.
    ssh "${ssh_base[@]}" "root@$ip" 'cloud-init status --wait >/dev/null 2>&1 || true' 2>/dev/null
    echo "ready"
  done < <(_dist_read_hosts "$inv")

  [ "$unreachable" -eq 0 ]
}
|
||
|
||
# Print the usage summary for every dist:* verb. The heredoc delimiter is
# quoted, so nothing in the text (e.g. "$0.40") is expanded.
cmd_dist() {
  cat <<'USAGE'
Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first.
./run dist:check offline: fmt + validate + mocked test (no token/spend)
./run dist:image [--cold] (re)build golden image — incremental by default (~3-8min vs ~20 cold)
./run dist:prune [keep=2] delete superseded golden snapshots (~$0.40/mo each)
./run dist:up <workers> [size] [region] e.g. ./run dist:up 10
./run dist:sim <games> [turn_limit] [--destroy-after]
./run dist:train <total_steps> [--destroy-after]
./run dist:test cargo test --workspace on a worker
./run dist:build cargo build + wasm on a worker (wasm rsync'd back)
./run dist:publish build once → upload .so/wasm to the artifact Space (keyed by sha)
./run dist:fetch download the prebuilt .so for HEAD's sha (skip recompile)
./run dist:sync [ref] git pull → fetch prebuilt .so if published, else build
./run dist:models {push <src> <name>|pull <name> <dest>|ls} share RL models via the Space
./run dist:render <res://scene.tscn> <out.png> render a proof scene (software weston, no GPU) → png
./run dist:down
USAGE
}
|
||
|
||
# Offline IaC verification — no DigitalOcean token, no API, no servers, no cost.
# Runs, in order: fmt (style) → init (providers only) → validate (schema
# typecheck) → test (mocked-provider behaviour), stopping at the first failure.
# Returns: 0 when all four steps pass, 1 otherwise.
cmd_dist_check() {
  local tf_dir
  tf_dir="$(_dist_repo_root)/$_DIST_TF_DIR_REL"

  echo "== terraform fmt =="
  if ! terraform -chdir="$tf_dir" fmt -check -recursive; then
    echo "fmt: run 'terraform -chdir=$tf_dir fmt'" >&2
    return 1
  fi

  echo "== terraform init (providers only) =="
  if ! terraform -chdir="$tf_dir" init -backend=false -input=false >/dev/null; then
    return 1
  fi

  echo "== terraform validate (schema typecheck) =="
  if ! terraform -chdir="$tf_dir" validate; then
    return 1
  fi

  echo "== terraform test (mocked digitalocean) =="
  if ! terraform -chdir="$tf_dir" test; then
    return 1
  fi

  echo "dist:check OK — config is valid, no resources touched."
}
|
||
|
||
# (Re)build the golden image. INCREMENTAL by default: builds FROM the newest
# mc-golden snapshot, so provision.sh (idempotent) only redoes changed work
# (~3-8 min). --cold builds from stock Ubuntu (~20 min) — resets accumulated
# layer cruft; run occasionally. Needs ~/.vault/{do_pat_mc,mc_forge_creds}
# and ~/.ssh/id_mc_fleet.pub.
#
# Arguments: --cold (optional, any position) - build from the stock base image
# Exports:   DIGITALOCEAN_TOKEN, PKR_VAR_git_remote, PKR_VAR_fleet_pubkey,
#            PKR_VAR_base_image (read by packer)
# Returns:   0 on a successful packer build; 1 if a prerequisite file is
#            missing or packer init/build fails.
cmd_dist_image() {
  local cold=false arg
  for arg in "$@"; do
    if [ "$arg" = "--cold" ]; then cold=true; fi
  done

  local root pat
  root="$(_dist_repo_root)"
  pat="$(cat ~/.vault/do_pat_mc 2>/dev/null)"
  [ -n "$pat" ] || { echo "no ~/.vault/do_pat_mc" >&2; return 1; }
  # Fail early instead of building with an empty git remote / public key.
  [ -r ~/.vault/mc_forge_creds ] || { echo "no ~/.vault/mc_forge_creds" >&2; return 1; }
  [ -r ~/.ssh/id_mc_fleet.pub ] || { echo "no ~/.ssh/id_mc_fleet.pub" >&2; return 1; }

  export DIGITALOCEAN_TOKEN="$pat"
  # The creds file is expected to define ADMIN_USER, ADMIN_PASS and FORGE_IP.
  # shellcheck disable=SC1090
  . ~/.vault/mc_forge_creds
  export PKR_VAR_git_remote="http://${ADMIN_USER}:${ADMIN_PASS}@${FORGE_IP}:3000/mcadmin/magicciv.git"
  PKR_VAR_fleet_pubkey="$(cat ~/.ssh/id_mc_fleet.pub)"
  export PKR_VAR_fleet_pubkey

  local base="ubuntu-24-04-x64" prev
  if ! $cold; then
    # Newest snapshot whose name starts with "mc-golden"; empty if none exists
    # or if the API call / JSON parse fails (which then falls back to cold).
    prev="$(curl -s -H "Authorization: Bearer $pat" "https://api.digitalocean.com/v2/snapshots?resource_type=droplet&per_page=200" \
      | python3 -c "import sys,json;s=[x for x in json.load(sys.stdin)['snapshots'] if x['name'].startswith('mc-golden')];s.sort(key=lambda x:x['created_at']);print(s[-1]['id'] if s else '')" 2>/dev/null)"
    if [ -n "$prev" ]; then
      base="$prev"
      echo "INCREMENTAL rebuild from snapshot $base (pass --cold for a full rebuild)"
    else
      echo "no prior golden — cold build"
    fi
  else
    echo "COLD rebuild from $base"
  fi
  export PKR_VAR_base_image="$base"

  # Propagate packer's failure; previously the trailing echo made this function
  # return 0 even when the build had failed.
  if ! (cd "$root/infra/packer" && packer init golden-image.pkr.hcl >/dev/null && packer build golden-image.pkr.hcl); then
    echo "dist:image FAILED — packer errored (see above)" >&2
    return 1
  fi
  echo "tip: each rebuild leaves a snapshot (~\$0.40/mo) — './run dist:prune' deletes superseded ones."
}
|
||
|
||
# Delete superseded golden snapshots, keeping the newest N (default 2).
# Arguments: $1 - how many of the newest "mc-golden*" snapshots to keep
#                 (positive integer, default 2)
# Outputs:   one " pruned golden snapshot <id>: http <code>" line per deletion
# Returns:   0 after attempting the deletions (or when there is nothing to
#            prune); 1 on a bad argument or a missing API token.
cmd_dist_prune() {
  local keep="${1:-2}" pat old id
  # keep used to be spliced straight into Python source (s[:-${keep}]): any
  # non-numeric value was executed as code, and 0 silently became s[:-0] == [].
  # Require a positive integer so the command can never delete every snapshot.
  if ! [[ "$keep" =~ ^[0-9]+$ ]] || [ "$((10#$keep))" -lt 1 ]; then
    echo "usage: ./run dist:prune [keep=2] (keep must be an integer >= 1)" >&2
    return 1
  fi
  keep="$((10#$keep))"

  pat="$(cat ~/.vault/do_pat_mc 2>/dev/null)"
  [ -n "$pat" ] || { echo "no ~/.vault/do_pat_mc" >&2; return 1; }

  # Ids of every mc-golden snapshot except the newest $keep, oldest first.
  # keep is handed to Python as argv rather than interpolated into the program.
  old="$(curl -s -H "Authorization: Bearer $pat" "https://api.digitalocean.com/v2/snapshots?resource_type=droplet&per_page=200" \
    | python3 -c "import sys,json;k=int(sys.argv[1]);s=[x for x in json.load(sys.stdin)['snapshots'] if x['name'].startswith('mc-golden')];s.sort(key=lambda x:x['created_at']);[print(x['id']) for x in s[:max(0,len(s)-k)]]" "$keep" 2>/dev/null)"
  [ -n "$old" ] || { echo "nothing to prune (<= $keep golden snapshots)"; return 0; }

  # Word-splitting is intentional: $old is a whitespace-separated id list.
  # shellcheck disable=SC2086
  for id in $old; do
    curl -s -o /dev/null -w " pruned golden snapshot $id: http %{http_code}\n" -X DELETE -H "Authorization: Bearer $pat" "https://api.digitalocean.com/v2/snapshots/$id"
  done
}
|
||
|
||
# Spin the fleet up: terraform init + apply with workers=<n>, then wait for
# every worker's cloud-init to finish.
# Arguments: $1 - worker count (non-negative integer, required)
#            $2 - droplet size, passed as -var size=... (optional)
#            $3 - region, passed as -var region=... (optional)
# Requires:  TF_VAR_do_token in the environment.
# Returns:   0 when the fleet is up and ready; 1 on bad usage or when
#            terraform init/apply or the readiness wait fails.
cmd_dist_up() {
  local n="${1:-}"
  [[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up <workers> [size] [region]" >&2; return 1; }
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"

  local args=(-auto-approve -var "workers=$n")
  if [ -n "${2:-}" ]; then args+=(-var "size=$2"); fi
  if [ -n "${3:-}" ]; then args+=(-var "region=$3"); fi

  # init's stdout is noise, but its failure must not be ignored: apply would
  # otherwise run against an uninitialised working directory.
  _dist_tf init -input=false >/dev/null \
    || { echo "dist:up FAILED — terraform init errored (see above)" >&2; return 1; }
  _dist_tf apply "${args[@]}" \
    || { echo "dist:up FAILED — terraform apply errored (see above)" >&2; return 1; }

  echo "fleet up: $n worker(s) — waiting for cloud-init before they're usable..."
  _dist_wait_ready \
    || { echo "dist:up: fleet created but not every worker became ready (see above)" >&2; return 1; }
  echo "fleet ready. inventory: $(_dist_repo_root)/.local/fleet/inventory"
}
|
||
|
||
# Tear the fleet down by applying workers=0.
# Requires: TF_VAR_do_token in the environment.
# Returns:  0 once terraform has applied workers=0; 1 if the apply failed.
cmd_dist_down() {
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"
  # Do not claim "fleet down" if the apply failed.
  _dist_tf apply -auto-approve -var "workers=0" \
    || { echo "dist:down FAILED — terraform apply errored (see above); workers may still be running" >&2; return 1; }
  # \$ keeps the price literal; an unescaped "$0.40" expands $0 (the script name).
  echo "fleet down (workers=0): zero compute cost, snapshot only (~\$0.40/mo)."
}
|
||
|
||
# Fan a sim batch across the fleet: split <total_games> into contiguous seed
# ranges (ceil(total / workers) each), run tools/autoplay-batch.sh once per
# worker in the background, then wait for all of them.
# Arguments: <total_games>     - non-negative integer (required)
#            [turn_limit]      - non-negative integer (default 300)
#            [--destroy-after] - tear the fleet down afterwards; accepted in any
#                                position and never mistaken for turn_limit
# Outputs:   results under <repo>/.local/iter/<timestamp>/, one
#            dispatch_worker_<i>.log per worker
# Returns:   0 if every worker batch succeeded; 1 on bad usage, a missing or
#            empty inventory, or any failed batch.
cmd_dist_sim() {
  # Separate the flag from the positionals first; reading $2 directly turned
  # `dist:sim 100 --destroy-after` into turn_limit="--destroy-after".
  local destroy=false arg
  local -a positional=()
  for arg in "$@"; do
    if [ "$arg" = "--destroy-after" ]; then
      destroy=true
    else
      positional+=("$arg")
    fi
  done
  local total="${positional[0]:-}" turn="${positional[1]:-300}"
  local usage="usage: ./run dist:sim <total_games> [turn_limit] [--destroy-after]"
  [[ "$total" =~ ^[0-9]+$ ]] || { echo "$usage" >&2; return 1; }
  [[ "$turn" =~ ^[0-9]+$ ]] || { echo "$usage" >&2; return 1; }

  local root inv
  root="$(_dist_repo_root)"
  inv="$root/.local/fleet/inventory"
  [ -f "$inv" ] || { echo "no inventory at $inv — run ./run dist:up <N> first" >&2; return 1; }

  local hosts=() line
  while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv")
  local n=${#hosts[@]}
  [ "$n" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; }

  local stamp results shard
  stamp="$(date +%Y%m%d_%H%M%S)"
  results="$root/.local/iter/$stamp"
  mkdir -p "$results"
  shard=$(( (total + n - 1) / n )) # ceil(total / n)
  echo "distributing $total game(s) over $n worker(s): ~$shard each, turn_limit=$turn"
  echo "results → $results"

  local pids=() i=0 host offset cnt cores
  for host in "${hosts[@]}"; do
    offset=$(( i * shard ))
    cnt=$shard
    if (( offset + cnt > total )); then cnt=$(( total - offset )); fi
    if (( cnt <= 0 )); then break; fi
    # -i: use the fleet key explicitly rather than relying on an agent-loaded
    # key; -n: the probe must not consume the caller's stdin.
    cores="$(ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" nproc 2>/dev/null || echo 8)"
    [[ "$cores" =~ ^[0-9]+$ ]] || cores=8 # unexpected probe output → same fallback
    echo " [$host] seeds $(( offset + 1 ))..$(( offset + cnt )) PARALLEL=$cores"
    AUTOPLAY_HOST="$host" SEED_OFFSET="$offset" PARALLEL="$cores" \
      bash "$root/tools/autoplay-batch.sh" "$cnt" "$turn" "$results" \
      >"$results/dispatch_worker_${i}.log" 2>&1 &
    pids+=($!)
    i=$(( i + 1 ))
  done

  local fail=0 p
  for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done

  local produced
  produced="$(find "$results" -name turn_stats.jsonl -type f 2>/dev/null | wc -l | tr -d ' ')"
  echo "----------------------------------------------------------------"
  echo "distributed sim done: $produced game(s) produced turn_stats under $results"
  [ "$fail" -eq 0 ] || echo "WARNING: $fail worker batch(es) errored — see $results/dispatch_worker_*.log" >&2

  if $destroy; then
    echo "--destroy-after → tearing down"
    cmd_dist_down
  fi
  [ "$fail" -eq 0 ]
}
|
||
|
||
# v1 blocking sweep: one training run per worker (distinct seed + run-name),
# then pull the models back. Detached orchestration is the documented follow-up.
# Arguments: [total_steps]     - non-negative integer (default 1000000)
#            [--destroy-after] - tear the fleet down afterwards; accepted in any
#                                position and never mistaken for total_steps
# Outputs:   <repo>/.local/train/<timestamp>/ with train_worker_<i>.log per
#            worker plus each run's model directory, when present
# Returns:   0 if every remote run succeeded; 1 on bad usage, a missing or
#            empty inventory, or any failed run.
cmd_dist_train() {
  # Strip the flag before reading positionals, so `dist:train --destroy-after`
  # uses the default step count instead of failing the numeric check.
  local destroy=false arg
  local -a positional=()
  for arg in "$@"; do
    if [ "$arg" = "--destroy-after" ]; then
      destroy=true
    else
      positional+=("$arg")
    fi
  done
  local steps="${positional[0]:-1000000}"
  [[ "$steps" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:train <total_steps> [--destroy-after]" >&2; return 1; }

  local root inv
  root="$(_dist_repo_root)"
  inv="$root/.local/fleet/inventory"
  [ -f "$inv" ] || { echo "no inventory at $inv — run ./run dist:up <N> first" >&2; return 1; }

  local hosts=() line
  while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv")
  local n=${#hosts[@]}
  [ "$n" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; }

  local stamp results
  stamp="$(date +%Y%m%d_%H%M%S)"
  results="$root/.local/train/$stamp"
  mkdir -p "$results"
  echo "fanning $n training run(s) × $steps steps (CPU). results → $results"

  local repo_remote="Code/@projects/@magic-civilization"
  local key="$HOME/.ssh/id_mc_fleet"
  local pids=() i=0 host seed run
  for host in "${hosts[@]}"; do
    seed=$(( 42 + i ))
    run="dist-${stamp}-w${i}"
    echo " [$host] run=$run seed=$seed"
    # -n is required for a backgrounded ssh (it must not read the terminal);
    # -i uses the fleet key explicitly rather than relying on an agent-loaded key.
    ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$key" "$host" \
      "cd ~/$repo_remote && python3 -m tooling.rl_self_play.train --run-name '$run' --seed $seed --total-steps $steps --device cpu" \
      >"$results/train_worker_${i}.log" 2>&1 &
    pids+=($!)
    i=$(( i + 1 ))
  done

  local fail=0 p
  for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done

  # Pull each worker's model dir back (same key as the dispatch ssh).
  i=0
  for host in "${hosts[@]}"; do
    run="dist-${stamp}-w${i}"
    if ! rsync -az -e "ssh -o BatchMode=yes -i '$key'" \
      "$host:~/$repo_remote/tooling/rl_self_play/models/$run" "$results/" 2>/dev/null; then
      echo " note: no model dir for $run on $host (check $results/train_worker_${i}.log)"
    fi
    i=$(( i + 1 ))
  done

  echo "----------------------------------------------------------------"
  echo "distributed train done under $results"
  [ "$fail" -eq 0 ] || echo "WARNING: $fail run(s) errored — see $results/train_worker_*.log" >&2

  if $destroy; then
    echo "--destroy-after → tearing down"
    cmd_dist_down
  fi
  [ "$fail" -eq 0 ]
}
|
||
|
||
# ── compute offload (single worker) ──────────────────────────────────────────
# Run heavy build/test compute on a DO worker instead of plum (M2 Air). Workers
# already carry the toolchain (golden image) + repo (cloud-init git pull).
|
||
|
||
# Print the first live "<user>@<ip>" entry from the fleet inventory.
# Returns: 0 with the host on stdout; 1 (printing nothing) when the inventory
#          file is missing or contains no host lines (e.g. "fleet is down").
_dist_first_host() {
  local inventory first
  inventory="$(_dist_repo_root)/.local/fleet/inventory"
  if [ ! -f "$inventory" ]; then
    return 1
  fi
  first="$(_dist_read_hosts "$inventory" | head -n 1)"
  if [ -z "$first" ]; then
    # Inventory present but no live host.
    return 1
  fi
  printf '%s\n' "$first"
}
|
||
|
||
# Pull the given ref on every live worker, then make the GDExtension current:
# fetch the prebuilt .so for that sha from the artifact Space if it exists
# (seconds), else build it. So a mid-session code change reaches the fleet
# without an image rebuild, and N workers share one published build.
# Arguments: $1 - git ref to sync to (default: main)
# Returns:   0 if every worker synced; 1 when there is no inventory, no live
#            host, an unusable ref, or any worker fails.
#
# NOTE(review): the Spaces credentials are interpolated into the remote command
# string, which makes them part of the local ssh argv (and presumably the remote
# shell's) — confirm whether that is acceptable or move them onto stdin.
cmd_dist_sync() {
  local ref="${1:-main}"
  local root inv host line senv
  # ref is embedded inside single quotes in the remote command line below.
  case "$ref" in
    *\'*) echo "dist:sync: ref must not contain a single quote" >&2; return 1 ;;
  esac
  root="$(_dist_repo_root)"
  inv="$root/.local/fleet/inventory"
  [ -f "$inv" ] || { echo "no fleet — run ./run dist:up <N> first" >&2; return 1; }

  # An inventory with no live hosts used to report "synced all workers".
  local hosts=()
  while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv")
  [ "${#hosts[@]}" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; }

  senv="$(_dist_spaces_env 2>/dev/null || true)" # empty → workers just build

  local pids=() p fail=0
  for host in "${hosts[@]}"; do
    echo "[$host] sync → $ref (fetch prebuilt .so, else build)"
    ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" \
      "$senv SPACE='$_DIST_SPACE' SO_PATH='$_DIST_SO_PATH' REF='$ref' bash -s" <<'REMOTE' &
set -e
cd ~/Code/@projects/@magic-civilization
# Two separate statements: in `git fetch ... && git reset ...` a failed fetch is
# exempt from `set -e`, and the worker would carry on with its stale checkout.
git fetch --depth=1 origin "$REF"
git reset --hard FETCH_HEAD
SHA=$(git rev-parse HEAD)
. ~/.cargo/env
if [ -n "${RCLONE_S3_ACCESS_KEY_ID:-}" ] && rclone copyto ":s3:$SPACE/builds/$SHA/libmagic_civ_physics.x86_64.so" "$SO_PATH" 2>/dev/null; then
  echo " [$SHA] fetched prebuilt .so (no rebuild)"
else
  ( cd src/simulator && bash build-gdext.sh )
  echo " [$SHA] built .so (cache miss)"
fi
REMOTE
    pids+=($!)
  done

  for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done
  if [ "$fail" -ne 0 ]; then
    echo "$fail worker(s) failed sync" >&2
    return 1
  fi
  echo "synced all workers to $ref"
}
|
||
|
||
# Offload the Rust test suite to one worker (slow on the M2 Air). On the worker
# it runs `cargo nextest run --workspace` when cargo-nextest is installed and
# `cargo test --workspace` otherwise.
# Returns: 1 if no fleet host is available, else the remote command's status.
cmd_dist_test() {
  local worker checkout remote_script
  if ! worker="$(_dist_first_host)"; then
    echo "no fleet — run ./run dist:up 1 c-8 first" >&2
    return 1
  fi
  checkout="Code/@projects/@magic-civilization"
  remote_script="
set -e
cd ~/$checkout/src/simulator && . ~/.cargo/env
if command -v cargo-nextest >/dev/null 2>&1; then cargo nextest run --workspace; else cargo test --workspace; fi
"
  echo "running cargo tests on $worker ..."
  ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$worker" "$remote_script"
}
|
||
|
||
# Offload the workspace build for fast compile feedback, and bring back the
# platform-independent WASM artifact. The native .so is linux-only and stays
# on the worker (plum builds its own macOS .dylib locally).
# Outputs: <repo>/.local/build/wasm/ populated from the worker, when present
# Returns: 0 when the remote build succeeded (even if no wasm could be pulled
#          back — a note is printed instead); 1 if there is no fleet host or
#          the remote build failed.
cmd_dist_build() {
  local host root repo key="$HOME/.ssh/id_mc_fleet"
  host="$(_dist_first_host)" || { echo "no fleet — run ./run dist:up 1 first" >&2; return 1; }
  root="$(_dist_repo_root)"
  repo="Code/@projects/@magic-civilization"

  echo "building workspace + wasm on $host ..."
  # A failed build used to be ignored: the function went on to pull back a
  # possibly stale wasm and returned 0.
  if ! ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$key" "$host" "
set -e
cd ~/$repo/src/simulator && . ~/.cargo/env
cargo build --workspace
bash build-wasm.sh
"; then
    echo "dist:build FAILED on $host (see above)" >&2
    return 1
  fi

  echo "fetching wasm artifact → plum ..."
  mkdir -p "$root/.local/build/wasm"
  # Same fleet key as the dispatch ssh; if/else instead of `a && b || c`.
  if rsync -az -e "ssh -o BatchMode=yes -i '$key'" \
    "$host:~/$repo/.local/build/wasm/" "$root/.local/build/wasm/" 2>/dev/null; then
    echo "wasm → .local/build/wasm/"
  else
    echo "note: no wasm at .local/build/wasm/ on worker"
  fi
}
|
||
|
||
# Render a proof scene on a worker (software weston + Mesa llvmpipe, no GPU) and
# pull the PNG back to plum. Replaces the apricot SCREENSHOT_HOST flow.
# Arguments: $1 - scene path (res://...tscn), $2 - output png path,
#            $3 - third argument handed to capture-proof.sh (default 180; the
#                 usage text names it timeout_s)
# Returns:   1 on bad usage or no fleet host, else capture-proof.sh's status.
cmd_dist_render() {
  local scene="${1:-}" out="${2:-}"
  if [ -z "$scene" ] || [ -z "$out" ]; then
    echo "usage: ./run dist:render <res://scene.tscn> <out.png> [timeout_s]" >&2
    return 1
  fi

  local worker login
  if ! worker="$(_dist_first_host)"; then
    echo "no fleet — run ./run dist:up 1 first" >&2
    return 1
  fi
  # The remote checkout lives under the home of the inventory entry's user.
  login="${worker%@*}"

  AUTOPLAY_HOST="$worker" \
    PROJECT_ROOT_REMOTE="/home/${login}/Code/@projects/@magic-civilization" \
    bash "$(_dist_repo_root)/tools/capture-proof.sh" "$scene" "$out" "${3:-180}"
}
|
||
|
||
# ── build-artifact Space (magicciv-artifacts on DO Spaces) ───────────────────
# Build once, publish the linux .so/wasm keyed by git sha; sim/test/AI runners
# fetch the prebuilt artifact instead of recompiling. Creds: ~/.vault/do-spaces-uvlava.*
|
||
# Name of the Space (bucket) used as the rclone ":s3:<space>/..." root for both
# builds/<sha>/ artifacts and models/<name>/.
_DIST_SPACE="magicciv-artifacts"
|
||
# Repo-relative path of the linux GDExtension library: the file dist:publish
# uploads and dist:fetch / dist:sync download into place on a worker.
_DIST_SO_PATH="src/game/engine/addons/magic_civ_physics/libmagic_civ_physics.x86_64.so"
|
||
|
||
# Emit an `RCLONE_S3_* ...` env-prefix string (DO Spaces creds from the vault) for
|
||
# embedding in a remote ssh command. Empty (rc 1) if the keys are missing.
|
||
# Emit an `RCLONE_S3_* ...` env-prefix string built from the DO Spaces keys in
# ~/.vault/do-spaces-uvlava.{access,secret}, for embedding in a remote ssh
# command. The keys are wrapped in single quotes; no trailing newline is written.
# Returns: 0 with the string on stdout; 1 (printing nothing) if either key file
#          is missing or empty.
_dist_spaces_env() {
  local access_key secret_key
  access_key="$(cat ~/.vault/do-spaces-uvlava.access 2>/dev/null)"
  secret_key="$(cat ~/.vault/do-spaces-uvlava.secret 2>/dev/null)"
  if [ -z "$access_key" ] || [ -z "$secret_key" ]; then
    return 1
  fi
  printf "RCLONE_S3_PROVIDER=DigitalOcean RCLONE_S3_ENDPOINT=nyc3.digitaloceanspaces.com RCLONE_S3_ACCESS_KEY_ID='%s' RCLONE_S3_SECRET_ACCESS_KEY='%s'" "$access_key" "$secret_key"
}
|
||
|
||
cmd_dist_publish() {
|
||
# On a worker: build gdext + wasm, upload to magicciv-artifacts/builds/<sha>/.
|
||
local host senv
|
||
host="$(_dist_first_host)" || { echo "no fleet — ./run dist:up 1 first" >&2; return 1; }
|
||
senv="$(_dist_spaces_env)" || { echo "no DO Spaces creds in ~/.vault/do-spaces-uvlava.*" >&2; return 1; }
|
||
echo "building + publishing artifacts on $host ..."
|
||
ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" \
|
||
"$senv SO_PATH='$_DIST_SO_PATH' SPACE='$_DIST_SPACE' bash -s" <<'REMOTE'
|
||
set -e
|
||
cd ~/Code/@projects/@magic-civilization
|
||
SHA=$(git rev-parse HEAD)
|
||
. ~/.cargo/env
|
||
( cd src/simulator && bash build-gdext.sh && bash build-wasm.sh )
|
||
rclone copyto "$SO_PATH" ":s3:$SPACE/builds/$SHA/libmagic_civ_physics.x86_64.so"
|
||
[ -d .local/build/wasm ] && rclone copy .local/build/wasm ":s3:$SPACE/builds/$SHA/wasm/" || true
|
||
printf 'sha=%s\nbuilt=%s\n' "$SHA" "$(date -u +%FT%TZ)" | rclone rcat ":s3:$SPACE/builds/$SHA/meta.txt"
|
||
echo "published builds/$SHA/ (.so + wasm)"
|
||
REMOTE
|
||
}
|
||
|
||
# On a worker: fetch the prebuilt .so for the worker's HEAD sha into the addon
# path instead of recompiling.
# Returns: 1 if there is no fleet host or no Spaces credentials, else the
#          remote script's status (it exits 3 when rclone cannot fetch the
#          .so for that sha).
cmd_dist_fetch() {
  local worker spaces_env
  if ! worker="$(_dist_first_host)"; then
    echo "no fleet — ./run dist:up 1 first" >&2
    return 1
  fi
  if ! spaces_env="$(_dist_spaces_env)"; then
    echo "no DO Spaces creds" >&2
    return 1
  fi
  ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$worker" \
    "$spaces_env SO_PATH='$_DIST_SO_PATH' SPACE='$_DIST_SPACE' bash -s" <<'FETCH_SCRIPT'
set -e
cd ~/Code/@projects/@magic-civilization
SHA=$(git rev-parse HEAD)
if rclone copyto ":s3:$SPACE/builds/$SHA/libmagic_civ_physics.x86_64.so" "$SO_PATH" 2>/dev/null; then
echo "FETCHED prebuilt .so for $SHA"
else
echo "MISS: no prebuilt .so for $SHA — run ./run dist:publish"; exit 3
fi
FETCH_SCRIPT
}
|
||
|
||
# Share RL model artifacts via the Space (runs on plum; models are platform-independent).
#   ./run dist:models push <src-dir-or-file> <name>
#   ./run dist:models pull <name> <dest>
#   ./run dist:models ls
# Exports the RCLONE_S3_* variables into the calling shell before running rclone.
# Returns: 1 on missing credentials or bad usage, else rclone's status
#          (`ls` prints "(empty)" instead of failing when rclone errors).
cmd_dist_models() {
  local sub="${1:-}" access_key secret_key
  access_key="$(cat ~/.vault/do-spaces-uvlava.access 2>/dev/null)"
  secret_key="$(cat ~/.vault/do-spaces-uvlava.secret 2>/dev/null)"
  if [ -z "$access_key" ] || [ -z "$secret_key" ]; then
    echo "no DO Spaces creds in ~/.vault/do-spaces-uvlava.*" >&2
    return 1
  fi
  export RCLONE_S3_PROVIDER=DigitalOcean
  export RCLONE_S3_ENDPOINT=nyc3.digitaloceanspaces.com
  export RCLONE_S3_ACCESS_KEY_ID="$access_key"
  export RCLONE_S3_SECRET_ACCESS_KEY="$secret_key"

  case "$sub" in
    push)
      if [ -z "${2:-}" ] || [ -z "${3:-}" ]; then
        echo "usage: ./run dist:models push <src> <name>" >&2
        return 1
      fi
      rclone copy "$2" ":s3:$_DIST_SPACE/models/$3/" -P
      ;;
    pull)
      if [ -z "${2:-}" ] || [ -z "${3:-}" ]; then
        echo "usage: ./run dist:models pull <name> <dest>" >&2
        return 1
      fi
      rclone copy ":s3:$_DIST_SPACE/models/$2/" "$3" -P
      ;;
    ls)
      rclone ls ":s3:$_DIST_SPACE/models/" 2>/dev/null || echo "(empty)"
      ;;
    *)
      echo "usage: ./run dist:models {push <src> <name>|pull <name> <dest>|ls}" >&2
      return 1
      ;;
  esac
}
|