playbook/antigravity-awesome-skills/skills/remote-gpu-trainer/scripts/gpu_health.sh

170 lines
9.0 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# gpu_health.sh — portable pre-flight GPU-health probe for a rented box (see references/gotchas_universal.md U21-U23).
#
# Runs three independent checks and prints ONE PASS / WARN / FAIL summary:
# 1. live sampling — nvidia-smi dmon over a few seconds (power/util/clocks/mem/temp)
# 2. Xid scan — dmesg for hardware-failure Xid codes; Xid 48 / 79 are HARD failures
# 3. throttle scan — SM clock crushed below base while hot, or nvidia-smi throttle reasons
#
# Exit codes (so a launch wrapper can react before it pays for GPU-hours):
# 0 PASS or WARN — safe to launch (WARN = degraded but usable; see stderr notes)
# 2 HARD FAIL — dead/throttling GPU; re-rent a DIFFERENT box, do not launch here
#
# Usage: bash gpu_health.sh [GPU_INDEX] # default 0
# On a rental there is no "reseat the card" — a HARD fail means stop + re-rent (see references/gotchas_universal.md U21-U23).
# NEVER an unquoted pipe inside a grep regex (it reads stdin and hangs).
# Abort on any reference to an unset variable.
set -u

# ---------------------------------------------------------------------------
# Tunables. Each value carries its rationale so nothing below is a bare magic number.
# ---------------------------------------------------------------------------

# GPU to probe, passed to `nvidia-smi -i`; first CLI argument, default 0.
GPU=${1:-0}

# Sample count handed to dmon (-c N = N one-second samples). 5 samples ~= 5 s:
# enough to catch a clock dip without burning metered time on a no-op probe.
SAMPLE_COUNT=5

# Temperature (°C) at or above which the GPU is treated as hot.
# H100/A100-class throttle onset is ~83 °C (U23); at/above this the board
# down-clocks itself, and sustained heat together with a low SM clock is the
# thermal-throttle signature.
TEMP_HOT_C=83

# Percentage of the reference SM clock below which the current SM clock counts
# as "crushed". The CHECK 3 heuristic applies it to the clocks.max.sm value
# reported by nvidia-smi. 70% is a conservative gap: boost variance is normal,
# but a 30%+ drop under load is throttling, not jitter.
SM_CLOCK_FLOOR_FRAC=70
# ---------------------------------------------------------------------------
# Result accumulators. status escalates PASS -> WARN -> FAIL, never downgrades.
# ---------------------------------------------------------------------------
# Overall verdict so far; starts at PASS and is only ever raised by escalate.
STATUS=PASS
# Findings collected so far, one " [LEVEL] message" line per escalate call;
# the summary at the end of the script prints them to stderr.
NOTES=

# escalate LEVEL MESSAGE...
#   Record a finding and raise the overall verdict if LEVEL outranks it.
#   Globals:   NOTES  (appended with " [LEVEL] MESSAGE" plus a newline)
#              STATUS (raised PASS -> WARN -> FAIL, never lowered)
#   Arguments: $1   - level; FAIL and WARN affect STATUS, any other value is
#                     recorded in NOTES only
#              $2.. - message words, joined the way "$*" joins them
escalate() {
  local severity=$1
  shift
  printf -v NOTES '%s [%s] %s\n' "$NOTES" "$severity" "$*"
  case "$severity" in
    FAIL) STATUS=FAIL ;;
    # A WARN must not mask a FAIL that was recorded earlier.
    WARN) [ "$STATUS" = "FAIL" ] || STATUS=WARN ;;
  esac
}
# ---------------------------------------------------------------------------
# Pre-flight: nvidia-smi must exist, and the requested GPU index must resolve.
# ---------------------------------------------------------------------------
# Nothing to probe without the driver tooling: report on stderr and exit 2.
command -v nvidia-smi >/dev/null 2>&1 || {
  echo "FAIL: nvidia-smi not found — no NVIDIA driver on this box." >&2
  exit 2
}

# The requested GPU must be listable by nvidia-smi; otherwise exit 2 as well.
nvidia-smi -i "$GPU" -L >/dev/null 2>&1 || {
  echo "FAIL: GPU index $GPU does not exist (nvidia-smi -L)." >&2
  exit 2
}

# Model name for the banner line below; query errors are discarded, so the
# name may come back empty.
GPU_NAME=$(nvidia-smi -i "$GPU" --query-gpu=name --format=csv,noheader 2>/dev/null)
echo "== gpu_health: GPU $GPU ($GPU_NAME), sampling ${SAMPLE_COUNT}s =="
# ---------------------------------------------------------------------------
# CHECK 1 — live sampling with nvidia-smi dmon.
# -s pucvmet selects: p=power and temperature, u=util(sm/mem), c=clocks(sm/mem), v=power/thermal
# violations, m=mem usage, e=ECC errors, t=PCIe throughput. -c N takes N one-second samples.
# We capture the raw table; later checks parse the peak temp / current SM clock out
# of the per-GPU query API (more robust than column-slicing dmon across driver versions).
# ---------------------------------------------------------------------------
# Capture the raw dmon table. stderr is discarded and a failing dmon is
# tolerated (|| true), so the only failure signal is an empty capture.
DMON_OUT=$(nvidia-smi dmon -i "$GPU" -s pucvmet -c "$SAMPLE_COUNT" 2>/dev/null || true)
if [ -z "$DMON_OUT" ]; then
  # No table at all: downgrade to WARN and rely on the point queries that follow.
  escalate WARN "dmon produced no samples (old driver?); falling back to point queries."
else
  echo "$DMON_OUT"
fi
# One-shot query of GPU temperature, current SM clock and maximum SM clock.
# The CSV row is flattened (commas -> spaces) and split into three variables;
# clocks.max.sm is the reference that the throttle heuristic later scales by
# SM_CLOCK_FLOOR_FRAC. query-gpu fields are stable across drivers, unlike
# dmon column order.
read -r TEMP_C SM_CUR SM_MAX <<<"$(nvidia-smi -i "$GPU" \
  --query-gpu=temperature.gpu,clocks.current.sm,clocks.max.sm \
  --format=csv,noheader,nounits 2>/dev/null | tr ',' ' ')"
# A failed or partial query leaves fields empty; substitute 0 so later
# integer tests always receive an operand.
: "${TEMP_C:=0}" "${SM_CUR:=0}" "${SM_MAX:=0}"
echo " temp=${TEMP_C}C sm_clock=${SM_CUR}MHz sm_max=${SM_MAX}MHz"
# ---------------------------------------------------------------------------
# CHECK 2 — Xid hardware-error scan (see references/gotchas_universal.md U21-U23).
# Xid is the canonical NVIDIA hardware-failure channel in the kernel ring buffer.
# Xid 48 = double-bit (uncorrectable) ECC -> the GPU is effectively DEAD.
# Xid 79 = "GPU has fallen off the bus" -> PCIe link lost; board is gone.
# Other Xids (e.g. 13, 31, 43, 45) are usually app faults, not hardware death -> WARN.
# dmesg may need root; if it is unreadable we cannot clear the GPU, so WARN (not silent PASS).
# IMPORTANT: grep alternation is fully quoted — an unquoted '|' would fork a pipe that
# reads stdin and hangs the probe forever.
# ---------------------------------------------------------------------------
# Read the kernel ring buffer once. A failing dmesg is treated the same as an
# empty one: without a readable log, the absence of Xid lines proves nothing.
# NOTE(review): every "NVRM: Xid" line in dmesg is considered; the scan is not
# filtered by $GPU.
DMESG_OUT="$(dmesg 2>/dev/null)" || DMESG_OUT=""
if [ -z "$DMESG_OUT" ]; then
  escalate WARN "dmesg unreadable (need root?) — cannot rule out an Xid hardware fault. — exit code is non-authoritative; have a human confirm GPU health when dmesg is unreadable."
else
  # Every Xid line is worth surfacing, whatever its code.
  XID_LINES="$(printf '%s\n' "$DMESG_OUT" | grep -iE 'NVRM: Xid' || true)"
  if [ -n "$XID_LINES" ]; then
    # Narrow down to the hard-failure codes 48 and 79, matched by code number
    # in both the "Xid (...): 48," and the "Xid 79" spellings.
    HARD_XID="$(printf '%s\n' "$XID_LINES" | grep -iE 'Xid[^0-9]*[0-9:() ]*[^0-9](48|79)([,. ]|$)' || true)"
    if [ -z "$HARD_XID" ]; then
      # Xid lines exist but none is 48/79: report the most recent one as WARN.
      escalate WARN "Non-fatal Xid present (likely app fault): $(printf '%s' "$XID_LINES" | tail -n1)"
    else
      # Report the most recent hard-failure line as FAIL.
      escalate FAIL "Xid 48/79 detected (dead GPU / off-the-bus): $(printf '%s' "$HARD_XID" | tail -n1)"
    fi
  fi
fi
# ---------------------------------------------------------------------------
# CHECK 3 — thermal / power throttling (see references/gotchas_universal.md U21-U23).
# Two independent signatures, either one trips a HARD fail:
# (a) the kernel-reported clocks-throttle reasons via nvidia-smi -q -d PERFORMANCE
# (HW thermal slowdown / HW power brake / SW thermal slowdown active = throttling now);
# (b) heuristic: SM clock crushed below SM_CLOCK_FLOOR_FRAC% of sm_max WHILE temp >= 83 °C
# — the classic "same code slower than yesterday" silent 25-40% loss.
# On a shared rental the cooling cannot be fixed, so confirmed throttling => re-rent.
# ---------------------------------------------------------------------------
# _gh_is_uint VALUE
#   Succeeds only when VALUE is a non-empty string of decimal digits, so callers
#   can hand it to integer tests and arithmetic without producing shell errors.
_gh_is_uint() {
  case "${1-}" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# _gh_active_throttle_lines
#   Filters `nvidia-smi -q -d PERFORMANCE` text on stdin down to the slowdown /
#   power-brake / thermal reason lines whose value is "Active". The reason list
#   itself is always printed, so lines ending in ": Not Active" are dropped.
#   The alternation is fully quoted: never an unquoted pipe in the regex.
#   Returns: exit status of the last grep (non-zero when nothing is active).
_gh_active_throttle_lines() {
  grep -iE 'slowdown|power brake|hw thermal|sw thermal' \
    | grep -i 'active' \
    | grep -iv ': not active'
}

# _gh_check_clock_vs_temp
#   Heuristic clock-vs-temperature check on the point-query readings.
#   Globals read:    TEMP_C, SM_CUR, SM_MAX, TEMP_HOT_C, SM_CLOCK_FLOOR_FRAC
#   Globals written: SM_FLOOR (SM_CLOCK_FLOOR_FRAC percent of SM_MAX, whole MHz)
#   Effect: escalate FAIL when the GPU is hot and SM_CUR is below SM_FLOOR;
#           escalate WARN when it is hot but the clock is not below the floor
#           (or the current clock is not a number); otherwise do nothing.
#   Readings are validated as plain digits first, so a non-numeric value
#   (for example a placeholder printed instead of a number) never reaches an
#   integer test and cannot leak "integer expression expected" to stderr.
_gh_check_clock_vs_temp() {
  # Without a usable, non-zero maximum there is nothing to compare against.
  if ! _gh_is_uint "$SM_MAX" || [ "$SM_MAX" -le 0 ]; then
    return 0
  fi
  # Integer math only; 10# forces base 10 so a leading zero is not read as octal.
  SM_FLOOR=$(( 10#$SM_MAX * SM_CLOCK_FLOOR_FRAC / 100 ))

  # Both outcomes below require a hot GPU.
  if ! _gh_is_uint "$TEMP_C" || [ "$TEMP_C" -lt "$TEMP_HOT_C" ]; then
    return 0
  fi

  if _gh_is_uint "$SM_CUR" && [ "$SM_CUR" -lt "$SM_FLOOR" ]; then
    # The percentage comes from the constant, so the message stays truthful
    # when SM_CLOCK_FLOOR_FRAC is tuned.
    escalate FAIL "thermal throttle: sm_clock ${SM_CUR}MHz < ${SM_FLOOR}MHz (${SM_CLOCK_FLOOR_FRAC}% of max) while temp ${TEMP_C}C >= ${TEMP_HOT_C}C"
  else
    # Hot but clock still high: borderline, warn so the caller watches it.
    escalate WARN "running hot (${TEMP_C}C >= ${TEMP_HOT_C}C) but SM clock not yet crushed — watch for throttling."
  fi
}

# Signature (a): throttle reasons that nvidia-smi reports as active right now.
# stderr is discarded and failures tolerated; an empty capture means "none found".
PERF_OUT="$(nvidia-smi -i "$GPU" -q -d PERFORMANCE 2>/dev/null || true)"
THROTTLE_ACTIVE="$(printf '%s\n' "$PERF_OUT" | _gh_active_throttle_lines || true)"
if [ -n "$THROTTLE_ACTIVE" ]; then
  # Quote the last active reason, with runs of spaces squeezed to one.
  escalate FAIL "nvidia-smi reports active throttling: $(printf '%s' "$THROTTLE_ACTIVE" | tr -s ' ' | tail -n1)"
fi

# Signature (b): SM clock below the floor while the GPU is hot.
_gh_check_clock_vs_temp
# ---------------------------------------------------------------------------
# Summary + exit. HARD fail => exit 2 so a wrapper aborts the launch.
# ---------------------------------------------------------------------------
# Final report: a separator and the RESULT line go to stdout, the collected
# findings (if any) go to stderr, and the exit code encodes the verdict.
echo "------------------------------------------------------------"
[ -z "$NOTES" ] || printf 'findings:\n%s' "$NOTES" >&2

# FAIL is the only verdict that exits non-zero (2), so a wrapper can abort the launch.
if [ "$STATUS" = "FAIL" ]; then
  echo "RESULT: FAIL — GPU $GPU is unhealthy. Stop this instance and re-rent a different box."
  exit 2
fi

# WARN and everything else (PASS) both exit 0; only the wording differs.
if [ "$STATUS" = "WARN" ]; then
  echo "RESULT: WARN — GPU $GPU usable but degraded; review findings above before a long run."
else
  echo "RESULT: PASS — GPU $GPU healthy (no Xid, no throttling, clocks nominal)."
fi
exit 0