playbook/antigravity-awesome-skills/skills/remote-gpu-trainer/scripts/health_patrol.sh.template

68 lines
4.4 KiB
Bash

#!/usr/bin/env bash
# health_patrol.sh.template — ONE read-only patrol tick for a detached remote GPU job.
#
# Fire on a cadence from the host's recurring runner (Claude Code `/loop 30m`; cron
# `3,33 * * * *` — offset off :00/:30 to dodge platform load spikes; Codex/Cursor
# Automations → references/monitoring_patterns.md §7). This is the §3 **L2 patrol** body:
# one combined ssh round-trip → a decision → a 3-5 line report EVEN IF nothing changed.
#
# READ-ONLY: never edits, restarts, or deletes anything (principles #6/#9). Watches
# LIVENESS only; to make the RESULT outlive the box, pair with an on-box L1 self-
# completion chain (§3 L1). Exit 0 = healthy / in-progress / cleanly done;
# exit 1 = ESCALATE ("崩了") so the loop/cron surfaces the tick loudly.
set -u
# ── PROFILE BLOCK — bind from profiles/<platform>.md §8 SCRIPT OVERRIDES ────────────────
# Every value can be overridden from the environment; the defaults below apply
# when the variable is unset or empty.
HOST="${HOST:-autodl-1}"                                     # ssh alias (profile §1)
RUN_GLOB="${RUN_GLOB:-scripts.train}"                        # pgrep -f pattern for the train process
RESULT_DIR="${RESULT_DIR:-/root/autodl-tmp/runs/results}"    # dir holding one file per finished cell
RUN_LOG="${RUN_LOG:-/root/autodl-tmp/runs/logs/train.log}"   # the PER-RUN log (NOT a tee'd master — see ‡)
DATA_MOUNT="${DATA_MOUNT:-/root/autodl-tmp}"                 # disk to watch (bytes AND inodes)
N_TOTAL="${N_TOTAL:-0}"                                      # expected cell count (0 = don't grade completion)
DISK_PCT_MAX="${DISK_PCT_MAX:-95}"                           # escalate when used% (bytes or inodes) >= this

# The numeric comparisons further down discard their own error output, so a
# non-numeric threshold (e.g. DISK_PCT_MAX=95%) would quietly switch the check
# off and the tick would still report "healthy". Refuse to run instead, and
# exit 1 so the recurring runner surfaces the misconfiguration.
for _cfg in N_TOTAL DISK_PCT_MAX; do
  case "${!_cfg}" in
    *[!0-9]*)
      echo "PATROL $HOST: config error — ${_cfg}='${!_cfg}' must be a non-negative integer; refusing to patrol with a disabled check."
      exit 1
      ;;
  esac
done
unset _cfg
# ── ONE combined READ-ONLY round-trip ──
# The probe script is piped to a bare remote `bash -s`, preceded by a single
# line of %q-quoted assignments. The profile values are deliberately NOT passed
# as `bash -s` arguments, for two reasons:
#   1. ssh joins its command words with spaces and the remote login shell
#      re-parses the result, so a value containing spaces or shell
#      metacharacters would be re-split / re-interpreted on the far side.
#   2. pgrep -f matches against full command lines and only excludes itself,
#      so a patrol shell carrying "$RUN_GLOB" in its own argv would be counted
#      as a live training process and ALIVE could never reach 0.
# Output contract (parsed below): one KEY=value line each for ALIVE, DONE,
# EPOCH, CRASH, DISK, INODE. On ssh / remote failure: report and exit 1.
remote_env="$(printf 'RUN_GLOB=%q RESULT_DIR=%q RUN_LOG=%q DATA_MOUNT=%q' \
  "$RUN_GLOB" "$RESULT_DIR" "$RUN_LOG" "$DATA_MOUNT")"
# `read -d ''` hits end-of-input without finding a NUL and returns non-zero;
# the text has still been stored, hence the `|| true`.
IFS= read -r -d '' remote_probe <<'REMOTE' || true
set -u
RESULT_GLOB='*.json' # CUSTOMIZE: one file per finished cell
CRASH_RE='Traceback|Error|CUDA out of memory|OutOfMemory|Killed' # CUSTOMIZE; QUOTED → | is alternation, never a pipe
# Count processes whose command line matches the pattern (this shell is plain `bash -s`).
echo "ALIVE=$(pgrep -f -- "$RUN_GLOB" 2>/dev/null | wc -l)"
# Count result files by globbing rather than parsing ls; $RESULT_GLOB is left
# unquoted on purpose so it expands. An unmatched glob stays literal and is
# rejected by the -e test.
done_n=0
for f in "$RESULT_DIR"/$RESULT_GLOB; do
  [ -e "$f" ] && done_n=$((done_n + 1))
done
echo "DONE=$done_n"
echo "EPOCH=$(grep -hoE 'Epoch[ =:]*[0-9]+(/[0-9]+)?' "$RUN_LOG" 2>/dev/null | tail -1)"
echo "CRASH=$(grep -cE -- "$CRASH_RE" "$RUN_LOG" 2>/dev/null)" # ‡ scope to per-run log, never run_all.out (§2)
# -P forces one output line per filesystem, so a long device name cannot wrap
# and shift the row that NR==2 picks; column 5 is the use% in both listings.
echo "DISK=$(df -P "$DATA_MOUNT" 2>/dev/null | awk 'NR==2{print $5}')"
echo "INODE=$(df -Pi "$DATA_MOUNT" 2>/dev/null | awk 'NR==2{print $5}')"
REMOTE
OUT="$(printf '%s\n%s\n' "$remote_env" "$remote_probe" \
  | ssh -o ConnectTimeout=15 -o ServerAliveInterval=10 -o ServerAliveCountMax=3 "$HOST" 'bash -s')" \
  || { echo "PATROL $HOST: ssh FAILED — only YOU can see the console (balance / power / preemption). Check it."; exit 1; }
# ── parse ──
# g KEY — print the value part of every "KEY=value" line held in $OUT.
# Lines that do not start with "KEY=" (banners, other keys) are ignored.
g() {
  local key=$1 line
  while IFS= read -r line; do
    case $line in
      "$key="*) printf '%s\n' "${line#"$key="}" ;;
    esac
  done <<<"$OUT"
}
ALIVE=$(g ALIVE)
DONE=$(g DONE)
EPOCH=$(g EPOCH)
CRASH=$(g CRASH)
DISK=$(g DISK)
INODE=$(g INODE)
# Strip the trailing percent sign so the usage figures can be compared as integers.
dp=${DISK%\%}
ip=${INODE%\%}
# ── always report, even if nothing changed (§3-L2) ──
printf '%s\n' "PATROL $HOST: proc=${ALIVE:-?} done=${DONE:-?}/${N_TOTAL} epoch=${EPOCH:-n/a} disk=${DISK:-?} inode=${INODE:-?}"

# ── escalate? crash signature / disk / inode / process-gone-while-incomplete ──
# Each triggered check appends one reason; any reason at all means escalation.
# Numeric tests discard their own stderr so an empty or non-numeric field just
# counts as "condition not met".
reasons=""
note() { reasons="${reasons:+$reasons; }$1"; }

if [ "${CRASH:-0}" -gt 0 ] 2>/dev/null; then
  note "crash x${CRASH} in $(basename "$RUN_LOG")"
fi
if [ "${dp:-0}" -ge "$DISK_PCT_MAX" ] 2>/dev/null; then
  note "disk ${DISK}"
fi
if [ "${ip:-0}" -ge "$DISK_PCT_MAX" ] 2>/dev/null; then
  note "inodes ${INODE}"
fi
if [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null && [ "${N_TOTAL:-0}" -gt 0 ] && [ "${DONE:-0}" -lt "$N_TOTAL" ] 2>/dev/null; then
  note "process gone at ${DONE}/${N_TOTAL} (incomplete)"
fi

if [ -n "$reasons" ]; then
  printf '%s\n' "PATROL: 崩了 — ${reasons}. Triage: ssh $HOST \"grep -B2 -A20 -E 'Traceback' '$RUN_LOG' | head -50\" (§6). Do NOT blind-restart; classify → fixed remediation."
  exit 1
fi

# ── no escalation: pick the single status line for this tick; all exit 0 ──
if [ "${N_TOTAL:-0}" -gt 0 ] && [ "${DONE:-0}" -ge "$N_TOTAL" ] 2>/dev/null && [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null; then
  verdict="PATROL: all ${N_TOTAL} done, process exited → load-verify + pull, THEN teardown (SKILL.md Phase 5 Iron Law)."
elif [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null && [ "${N_TOTAL:-0}" -eq 0 ]; then
  verdict="PATROL: process not running and completion-grading off — set N_TOTAL to auto-classify, or verify by hand."
else
  verdict="PATROL: healthy / in progress — nothing to do."
fi
printf '%s\n' "$verdict"
exit 0