diff --git a/infra/terraform/hetzner-cpu-runner/main.tf b/infra/terraform/hetzner-cpu-runner/main.tf
new file mode 100644
index 00000000..a8fe92f1
--- /dev/null
+++ b/infra/terraform/hetzner-cpu-runner/main.tf
@@ -0,0 +1,56 @@
+# Single disposable CPU runner plus a persistent data volume.
+# var.enabled toggles only the server and its volume attachment; the SSH key
+# and the volume carry no count and are always managed.
+locals {
+  server_count = var.enabled ? 1 : 0
+}
+
+resource "hcloud_ssh_key" "runner" {
+  name       = "${var.name}-key"
+  public_key = file(pathexpand(var.ssh_public_key_path))
+}
+
+# Persistent data volume — deliberately NOT gated on var.enabled, so it lives
+# across server destroy/recreate. This is what makes the server ephemeral:
+# the slow-to-rebuild state (cargo cache, target/, the clone, RL checkpoints)
+# stays here, the compute is disposable.
+resource "hcloud_volume" "data" {
+  name     = "${var.name}-data"
+  size     = var.volume_size
+  location = var.location
+  format   = "ext4"
+}
+
+resource "hcloud_server" "runner" {
+  count       = local.server_count
+  name        = var.name
+  server_type = var.server_type
+  location    = var.location
+  image       = var.image # was hard-coded; var.image defaults to the same "ubuntu-24.04"
+  ssh_keys    = [hcloud_ssh_key.runner.id]
+
+  user_data = templatefile("${path.module}/cloud-init.yaml", {
+    volume_id  = hcloud_volume.data.id
+    git_remote = var.git_remote
+  })
+
+  labels = {
+    project = "magic-civilization"
+    role    = "cpu-runner"
+  }
+
+  # SSH keys are only injected at creation; without this, changing the key
+  # would force a destroy/recreate of a running box. It does NOT guard against
+  # var.enabled = false, which still destroys the server via count.
+  lifecycle {
+    ignore_changes = [ssh_keys]
+  }
+}
+
+# Gated by the same count as the server, so runner[0] exists whenever this does.
+resource "hcloud_volume_attachment" "data" {
+  count     = local.server_count
+  volume_id = hcloud_volume.data.id
+  server_id = hcloud_server.runner[0].id
+  automount = false # cloud-init mounts it deterministically by id
+}
diff --git a/infra/terraform/hetzner-cpu-runner/variables.tf b/infra/terraform/hetzner-cpu-runner/variables.tf
new file mode 100644
index 00000000..3575c518
--- /dev/null
+++ b/infra/terraform/hetzner-cpu-runner/variables.tf
@@ -0,0 +1,74 @@
+variable "hcloud_token" {
+  description = "Hetzner Cloud API token (project-scoped).
Export as TF_VAR_hcloud_token; never commit."
+  type        = string
+  sensitive   = true
+}
+
+# NOTE(review): main.tf in this module does not reference var.workers — confirm
+# whether this fleet-size input is still needed here.
+variable "workers" {
+  description = <<-EOT
+    Fleet size — the iteration-speed lever. 0 = nothing running, zero cost.
+    Set to N to fan distributed sim/test work across N cattle, then back to 0
+    to tear the fleet down. Each worker is identical and disposable; results
+    are rsynced off before destroy, so there is no per-worker state to keep.
+  EOT
+  type        = number
+  default     = 0
+
+  validation {
+    condition     = var.workers >= 0 && var.workers <= 50
+    error_message = "Keep the fleet between 0 and 50 (project-quota / sanity guard)."
+  }
+}
+
+variable "location" {
+  description = "Hetzner location. US: ash (Ashburn VA, ~near NYC), hil (Hillsboro OR). EU: fsn1, nbg1, hel1."
+  type        = string
+  default     = "ash"
+}
+
+variable "server_type" {
+  description = <<-EOT
+    Per-worker size. Distributed fan-out favours many small cheap boxes over one
+    big one (finer shard granularity per euro). Shared-vCPU cpx line is cheapest:
+      cpx31 = 4 vCPU / 8 GB (fine granularity, cheapest unit)
+      cpx41 = 8 vCPU / 16 GB (default; PARALLEL=8 games/worker)
+      cpx51 = 16 vCPU / 32 GB (fewer, fatter workers; also for RL self-play envs)
+  EOT
+  type        = string
+  default     = "cpx41"
+}
+
+variable "image" {
+  description = <<-EOT
+    Boot image. Default is the stock Ubuntu base — workers then run the full
+    toolchain install via cloud-init on first boot (~3-4 min, parallel across the
+    fleet). After you bake a golden snapshot with the Packer template in
+    ../../packer, set this to that snapshot's ID for ~30 s ready-to-work boots.
+  EOT
+  type        = string
+  default     = "ubuntu-24.04"
+}
+
+variable "ssh_public_key_path" {
+  description = "Public key authorised for SSH into every worker (also used by the dispatch script)."
+  type        = string
+  default     = "~/.ssh/id_ed25519.pub"
+}
+
+variable "name" {
+  description = "Resource name prefix: the server is <name>, its SSH key <name>-key and its data volume <name>-data."
+  type        = string
+  default     = "mc-test"
+}
+
+variable "git_remote" {
+  description = "GitLab clone URL (origin) the workers pull source from. Required for cloud-init to fetch the repo."
+  type        = string
+  default     = ""
+}
+
+# NOTE(review): main.tf passes only volume_id and git_remote to cloud-init.yaml,
+# so this value is not consumed by the module as written — confirm.
+variable "git_ref" {
+  description = "Branch/tag/SHA the fleet checks out. Pin to a SHA for reproducible distributed runs."
+  type        = string
+  default     = "main"
+}
+
+# main.tf reads var.enabled and var.volume_size; they are declared here so the
+# module validates. Defaults are conservative starting points — adjust as needed.
+variable "enabled" {
+  description = "Whether the runner server and its volume attachment exist. false keeps only the SSH key and the persistent data volume."
+  type        = bool
+  default     = false
+}
+
+variable "volume_size" {
+  description = "Size in GB of the persistent data volume that survives server destroy/recreate."
+  type        = number
+  default     = 50
+
+  validation {
+    condition     = var.volume_size >= 10
+    error_message = "Hetzner Cloud volumes must be at least 10 GB."
+  }
+}
diff --git a/infra/terraform/hetzner-cpu-runner/versions.tf b/infra/terraform/hetzner-cpu-runner/versions.tf
new file mode 100644
index 00000000..a3837b08
--- /dev/null
+++ b/infra/terraform/hetzner-cpu-runner/versions.tf
@@ -0,0 +1,14 @@
+terraform {
+  required_version = ">= 1.6"
+
+  required_providers {
+    hcloud = {
+      source  = "hetznercloud/hcloud"
+      version = "~> 1.49"
+    }
+  }
+}
+
+provider "hcloud" {
+  token = var.hcloud_token
+}
diff --git a/tooling/claude/dot-claude/hooks/session-orient.sh b/tooling/claude/dot-claude/hooks/session-orient.sh
new file mode 100755
index 00000000..fcd4df8c
--- /dev/null
+++ b/tooling/claude/dot-claude/hooks/session-orient.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# SessionStart bootloader — injects a LIVE project orientation so a fresh session
+# (or a spawned agent) knows current state without manual digging: in-flight
+# objectives, what just landed, unpushed risk, and the tooling entry-points.
+#
+# Contract: read-only, fast (<2s), and NEVER breaks the session — on any error it
+# emits nothing (or valid empty JSON) and exits 0.
+#
+# Modes:
+#   (no args) → emits SessionStart hook JSON (additionalContext)
+#   --human   → prints the orientation as plain markdown (for manual re-orient)
+#
+# State is read LIVE every run (objectives.json + git) — never embedded, so it
+# can't go stale. This is the dynamic counterpart to the static CLAUDE.md router.
+
+ROOT="${CLAUDE_PROJECT_DIR:-$(git rev-parse --show-toplevel 2>/dev/null || pwd)}"
+MODE="${1:-hook}"
+
+python3 - "$ROOT" "$MODE" <<'PY' 2>/dev/null || exit 0
+import json, sys, os, subprocess
+from collections import Counter
+
+root, mode = sys.argv[1], sys.argv[2]
+
+def sh(*a):
+    """Run a command in the project root; return stripped stdout ("" if it cannot run or times out)."""
+    try:  # 2 s per call: the three git calls below stay inside the 10 s hook timeout
+        return subprocess.run(a, cwd=root, capture_output=True, text=True, timeout=2).stdout.strip()
+    except Exception:
+        return ""
+
+lines = []
+
+# ── In-flight objectives (the actionable WIP) ─────────────────────────────────
+try:
+    with open(os.path.join(root, ".project/objectives/objectives.json"), "rb") as fh:
+        d = json.load(fh)  # closed deterministically; bytes input lets json detect the encoding
+    objs = d.get("objectives", [])
+    c = Counter(str(o.get("status")) for o in objs)
+    lines.append(
+        f"**Objectives:** {c.get('done',0)} done · {c.get('partial',0)} partial · "
+        f"{c.get('stub',0)} stub · {c.get('oos',0)} oos"
+    )
+    wip = [o for o in objs if o.get("status") in ("partial", "stub")]
+    if wip:
+        lines.append("**In-flight (partial/stub) — likely where to resume:**")
+        for o in wip[:8]:  # cap the list so the injected context stays short
+            lines.append(f" - `{o.get('id')}` {o.get('title','')} _[{o.get('status')}]_")
+    blocked = d.get("blocked") or []
+    if blocked:
+        ids = ", ".join(str(b.get("id") if isinstance(b, dict) else b) for b in blocked[:6])
+        lines.append(f"**Blocked:** {len(blocked)} — {ids}")
+except Exception:  # missing file, bad JSON or unexpected shape: degrade to a hint, never fail the hook
+    lines.append("_objectives.json unreadable — run `python3 tools/objectives-report.py` to regen._")
+
+# ── What just landed + unpushed risk ──────────────────────────────────────────
+branch = sh("git", "rev-parse", "--abbrev-ref", "HEAD")
+log = sh("git", "log", "--oneline", "-5")
+if log:
+    lines.append("**Last 5 commits:**")
+    lines += [" " + l for l in log.splitlines()]
+unpushed = sh("git", "rev-list", "--count", "@{u}..HEAD")  # "" when git prints nothing, e.g. no upstream
+if unpushed.isdigit() and int(unpushed) > 0:
+    lines.append(f"**⚠ {unpushed} unpushed commits** on `{branch}` — forge may be down; don't blindly re-push or rebase.")
+
+# ── Build-health hint (cheap — don't run cargo in a boot hook) ─────────────────
+lines.append("**Verify before trusting:** `cargo test -p <crate>` (Rust) / headless play loop (sim) / render-proof (UI) — boot does not run tests.")
+
+# ── Tooling entry-points ──────────────────────────────────────────────────────
+lines.append(
+    "**Pointers:** specialist → `specialist-preamble.md` · dispatch/verify → `agents-task-map.md` · "
+    "where-code-goes → `code-layering.md` · current work → `.project/ROADMAP.md` + `.project/objectives/`."
+)
+
+body = "## 🧭 Session orientation (live snapshot)\n\n" + "\n".join(lines) + \
+    "\n\n_Snapshot at boot — grep/read to confirm before acting (verify, don't infer)._"
+
+hook_json = json.dumps({"hookSpecificOutput": {"hookEventName": "SessionStart", "additionalContext": body}})
+print(body if mode == "--human" else hook_json)  # --human: raw markdown; otherwise SessionStart hook JSON
+PY
diff --git a/tooling/claude/dot-claude/settings.json b/tooling/claude/dot-claude/settings.json
index e0e2e59f..a3d9d702 100644
--- a/tooling/claude/dot-claude/settings.json
+++ b/tooling/claude/dot-claude/settings.json
@@ -73,6 +73,19 @@
           }
         ]
       }
+    ],
+    "SessionStart": [
+      {
+        "matcher": "",
+        "hooks": [
+          {
+            "type": "command",
+            "command": ".claude/hooks/session-orient.sh",
+            "timeout": 10,
+            "statusMessage": "Loading live session orientation..."
+          }
+        ]
+      }
     ]
   }
 }