#!/usr/bin/env bash
# gpu_health.sh — portable pre-flight GPU-health probe for a rented box (see references/gotchas_universal.md U21-U23).
#
# Runs three independent checks and prints ONE PASS / WARN / FAIL summary:
#   1. live sampling — nvidia-smi dmon over a few seconds (power/util/clocks/mem/temp)
#   2. Xid scan      — dmesg for hardware-failure Xid codes; Xid 48 / 79 are HARD failures
#   3. throttle scan — SM clock crushed far below its max while hot, or nvidia-smi throttle reasons
#
# Exit codes (so a launch wrapper can react before it pays for GPU-hours):
#   0  PASS or WARN — safe to launch (WARN = degraded but usable; see stderr notes)
#   2  HARD FAIL    — dead/throttling GPU; re-rent a DIFFERENT box, do not launch here
#
# Usage: bash gpu_health.sh [GPU_INDEX]   # default 0
# On a rental there is no "reseat the card" — a HARD fail means stop + re-rent (see references/gotchas_universal.md U21-U23).
# NEVER put an unquoted pipe inside a grep regex: the shell turns it into a pipeline stage
# that reads stdin and hangs.

# Abort on any reference to an unset variable instead of silently expanding to "".
set -u

# ---------------------------------------------------------------------------
# Tunable constants — every magic number is documented here, no voodoo.
# ---------------------------------------------------------------------------
GPU="${1:-0}"            # which GPU to probe (nvidia-smi index); first CLI argument, default 0
SAMPLE_COUNT=5           # dmon sample COUNT (-c N = N one-second samples); 5 samples ~= 5 s,
                         # enough to catch a clock dip without burning metered time on a no-op probe.
TEMP_HOT_C=83            # H100/A100-class throttle onset ~83 °C (U23). At/above this the
                         # board down-clocks itself; sustained >83 °C while SM clock is low
                         # is the thermal-throttle signature.
SM_CLOCK_FLOOR_FRAC=70   # treat SM clock < 70% of the board's *max* SM clock (clocks.max.sm,
                         # the only reference clock this script queries) as "crushed".
                         # 70% chosen as a conservative gap: boost variance is normal,
                         # but a 30%+ drop while the board is hot is throttling, not jitter.

# ---------------------------------------------------------------------------
# Result accumulators. STATUS escalates PASS -> WARN -> FAIL, never downgrades.
# ---------------------------------------------------------------------------
STATUS="PASS"
NOTES=""   # human-readable findings, one per line, emitted to stderr

# escalate <LEVEL> <message...> — raise overall status and record the reason.
#   Globals:   STATUS (read and written), NOTES (appended to)
#   Arguments: $1  level tag; only FAIL and WARN influence STATUS, any other
#                  tag is recorded in NOTES but leaves STATUS untouched
#              $2+ message words, joined with spaces into one NOTES line
escalate() {
  local level="$1"
  shift
  NOTES="${NOTES} [${level}] $*"$'\n'
  # FAIL beats WARN beats PASS; only ever climb the ladder.
  if [ "$level" = "FAIL" ]; then
    STATUS="FAIL"
  elif [ "$level" = "WARN" ] && [ "$STATUS" != "FAIL" ]; then
    STATUS="WARN"
  fi
}

# ---------------------------------------------------------------------------
# Pre-flight: nvidia-smi must exist, and the requested GPU index must resolve.
# Either failure is reported on stderr and exits 2 (the HARD FAIL code) right
# away, before any check runs.
# ---------------------------------------------------------------------------
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "FAIL: nvidia-smi not found — no NVIDIA driver on this box." >&2
  exit 2
fi
if ! nvidia-smi -i "$GPU" -L >/dev/null 2>&1; then
  echo "FAIL: GPU index $GPU does not exist (nvidia-smi -L)." >&2
  exit 2
fi

# Board name is only used for the banner; an empty name is tolerated.
GPU_NAME="$(nvidia-smi -i "$GPU" --query-gpu=name --format=csv,noheader 2>/dev/null)"
echo "== gpu_health: GPU $GPU ($GPU_NAME), sampling ${SAMPLE_COUNT}s =="

# ---------------------------------------------------------------------------
# CHECK 1 — live sampling with nvidia-smi dmon.
# -s pucvmet selects: p=power+temperature, u=util(sm/mem), c=clocks(sm/mem),
# v=power/thermal violations, m=mem usage, e=ECC/PCIe-replay errors,
# t=PCIe Rx/Tx throughput. -c N takes N one-second samples.
# The raw table is only printed for the operator; the numeric checks below read
# temp / SM clock from the per-GPU query API instead (more robust than
# column-slicing dmon across driver versions).
# ---------------------------------------------------------------------------
# '|| true' keeps a dmon failure from aborting the probe; empty output becomes a WARN.
DMON_OUT="$(nvidia-smi dmon -i "$GPU" -s pucvmet -c "$SAMPLE_COUNT" 2>/dev/null || true)"
if [ -n "$DMON_OUT" ]; then
  echo "$DMON_OUT"
else
  escalate WARN "dmon produced no samples (old driver?); falling back to point queries."
fi

# Point-in-time query: temperature, current SM clock, and max SM clock (the reference
# the throttle heuristic compares against).
# query-gpu fields are stable across drivers, unlike dmon column order.

# uint_or_zero <value> — print <value> when it is a non-empty string of decimal
# digits, otherwise print 0. Keeps "[N/A]" / "[Not Supported]" / empty output from
# reaching the integer comparisons later in the script.
uint_or_zero() {
  case "${1:-}" in
    '' | *[!0-9]*) printf '0\n' ;;
    *) printf '%s\n' "$1" ;;
  esac
}

# Split on commas only: a multi-word value such as "[Not Supported]" must stay in
# its own field instead of shifting the fields after it.
IFS=',' read -r TEMP_C SM_CUR SM_MAX <<EOF
$(nvidia-smi -i "$GPU" \
    --query-gpu=temperature.gpu,clocks.current.sm,clocks.max.sm \
    --format=csv,noheader,nounits 2>/dev/null)
EOF

# Drop the padding nvidia-smi puts after each comma, then normalise to integers.
RAW_TEMP_C="${TEMP_C//[[:space:]]/}"
RAW_SM_CUR="${SM_CUR//[[:space:]]/}"
RAW_SM_MAX="${SM_MAX//[[:space:]]/}"
TEMP_C="$(uint_or_zero "$RAW_TEMP_C")"
SM_CUR="$(uint_or_zero "$RAW_SM_CUR")"
SM_MAX="$(uint_or_zero "$RAW_SM_MAX")"

# Unreadable telemetry means the clock/temperature heuristic cannot run, so say so
# with a WARN rather than letting the probe report a clean PASS.
if [ "$TEMP_C" != "$RAW_TEMP_C" ] || [ "$SM_CUR" != "$RAW_SM_CUR" ] || [ "$SM_MAX" != "$RAW_SM_MAX" ]; then
  escalate WARN "telemetry unreadable (temp='${RAW_TEMP_C}' sm_clock='${RAW_SM_CUR}' sm_max='${RAW_SM_MAX}') — clock/temperature throttle heuristic cannot clear this GPU."
fi
echo " temp=${TEMP_C}C sm_clock=${SM_CUR}MHz sm_max=${SM_MAX}MHz"

# ---------------------------------------------------------------------------
# CHECK 2 — Xid hardware-error scan (see references/gotchas_universal.md U21-U23).
# Xid is the canonical NVIDIA hardware-failure channel in the kernel ring buffer.
#   Xid 48 = double-bit (uncorrectable) ECC  -> the GPU is effectively DEAD.
#   Xid 79 = "GPU has fallen off the bus"    -> PCIe link lost; board is gone.
# Other Xids (e.g. 13, 31, 43, 45) are usually app faults, not hardware death -> WARN.
# dmesg may need root; if it is unreadable we cannot clear the GPU, so WARN (not silent PASS).
# IMPORTANT: grep alternation is fully quoted — an unquoted '|' would fork a pipe that
# reads stdin and hangs the probe forever.
# ---------------------------------------------------------------------------

# hard_xid_lines — filter stdin (NVRM Xid log lines) down to those whose Xid CODE
# is 48 or 79. The code is the number right after "Xid (<bus id>):" (or after a
# bare "Xid "). The whole parenthesised bus id is skipped as a unit, so hex bus
# ids such as PCI:0000:3b:00 still match, and a bus number or pid that happens to
# be 48/79 does not. Always returns 0; "no match" is simply empty output.
hard_xid_lines() {
  grep -iE 'Xid( *\([^)]*\))?[: ]+(48|79)([^0-9]|$)' || true
}

if DMESG_OUT="$(dmesg 2>/dev/null)" && [ -n "$DMESG_OUT" ]; then
  # Any Xid line at all is worth surfacing.
  XID_LINES="$(printf '%s\n' "$DMESG_OUT" | grep -iE 'NVRM: Xid' || true)"
  if [ -n "$XID_LINES" ]; then
    HARD_XID="$(printf '%s\n' "$XID_LINES" | hard_xid_lines)"
    if [ -n "$HARD_XID" ]; then
      # Report the most recent hard-failure line.
      escalate FAIL "Xid 48/79 detected (dead GPU / off-the-bus): $(printf '%s' "$HARD_XID" | tail -n1)"
    else
      escalate WARN "Non-fatal Xid present (likely app fault): $(printf '%s' "$XID_LINES" | tail -n1)"
    fi
  fi
else
  escalate WARN "dmesg unreadable (need root?) — cannot rule out an Xid hardware fault. — exit code is non-authoritative; have a human confirm GPU health when dmesg is unreadable."
fi

# ---------------------------------------------------------------------------
# CHECK 3 — thermal / power throttling (see references/gotchas_universal.md U21-U23).
# Two independent signatures, either one trips a HARD fail:
#   (a) the throttle reasons reported by nvidia-smi -q -d PERFORMANCE
#       (HW thermal slowdown / HW power brake / SW thermal slowdown active = throttling now);
#   (b) heuristic: SM clock crushed below SM_CLOCK_FLOOR_FRAC% of sm_max WHILE temp >= TEMP_HOT_C
#       — the classic "same code slower than yesterday" silent 25–40% loss.
# On a shared rental the cooling cannot be fixed, so confirmed throttling => re-rent.
# This section implements signature (a).
# ---------------------------------------------------------------------------

# active_throttle_reasons — filter stdin (nvidia-smi -q -d PERFORMANCE output) down to
# slowdown / power-brake / thermal reason lines whose value is "Active".
# The reason list is always printed, so "Not Active" rows must be dropped.
# Quoted alternation again: never an unquoted pipe in the regex.
# Always returns 0; "nothing active" is simply empty output.
active_throttle_reasons() {
  grep -iE 'slowdown|power brake|hw thermal|sw thermal' \
    | grep -i 'active' \
    | grep -iv ': not active' || true
}

PERF_OUT="$(nvidia-smi -i "$GPU" -q -d PERFORMANCE 2>/dev/null || true)"
THROTTLE_ACTIVE="$(printf '%s\n' "$PERF_OUT" | active_throttle_reasons)"
if [ -n "$THROTTLE_ACTIVE" ]; then
  # tr -s squeezes the column padding so the note stays on one readable line.
  escalate FAIL "nvidia-smi reports active throttling: $(printf '%s' "$THROTTLE_ACTIVE" | tr -s ' ' | tail -n1)"
fi

# Heuristic clock-vs-temp check — only meaningful when we read real numbers.
# Integer math only (clocks are whole MHz); guards against a zero sm_max.

# is_uint <value> — succeed only for a non-empty string of decimal digits.
is_uint() {
  case "${1:-}" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# check_clock_vs_temp — compare the current SM clock with a fraction of the max SM
# clock while the board is hot.
#   Globals read:    SM_MAX, SM_CUR, TEMP_C, TEMP_HOT_C, SM_CLOCK_FLOOR_FRAC
#   Globals written: SM_FLOOR (the "crushed" threshold in MHz)
#   Calls:           escalate FAIL when hot AND clock below the floor,
#                    escalate WARN when hot but the clock is not below the floor
#                    (or the current clock is not a usable number).
#   Does nothing when SM_MAX is not a positive integer or TEMP_C is not an integer.
check_clock_vs_temp() {
  if ! is_uint "${SM_MAX:-}" || [ "$SM_MAX" -le 0 ]; then
    return 0
  fi
  SM_FLOOR=$(( SM_MAX * SM_CLOCK_FLOOR_FRAC / 100 ))
  if ! is_uint "${TEMP_C:-}" || [ "$TEMP_C" -lt "$TEMP_HOT_C" ]; then
    return 0
  fi
  # From here on the board is at or above TEMP_HOT_C.
  if is_uint "${SM_CUR:-}" && [ "$SM_CUR" -lt "$SM_FLOOR" ]; then
    # Report the configured fraction, not a hard-coded 70, so the note stays true
    # when SM_CLOCK_FLOOR_FRAC is retuned.
    escalate FAIL "thermal throttle: sm_clock ${SM_CUR}MHz < ${SM_FLOOR}MHz (${SM_CLOCK_FLOOR_FRAC}% of max) while temp ${TEMP_C}C >= ${TEMP_HOT_C}C"
  else
    # Hot but clock still high: borderline, warn so the caller watches it.
    escalate WARN "running hot (${TEMP_C}C >= ${TEMP_HOT_C}C) but SM clock not yet crushed — watch for throttling."
  fi
}

check_clock_vs_temp

# ---------------------------------------------------------------------------
# Summary + exit. HARD fail => exit 2 so a wrapper aborts the launch.
# The findings go to stderr; the single RESULT line goes to stdout.
# PASS and WARN both exit 0.
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------"
if [ -n "$NOTES" ]; then
  # NOTES already ends each entry with a newline, hence no trailing \n in the format.
  printf 'findings:\n%s' "$NOTES" >&2
fi
case "$STATUS" in
  FAIL)
    echo "RESULT: FAIL — GPU $GPU is unhealthy. Stop this instance and re-rent a different box."
    exit 2
    ;;
  WARN)
    echo "RESULT: WARN — GPU $GPU usable but degraded; review findings above before a long run."
    exit 0
    ;;
  *)
    echo "RESULT: PASS — GPU $GPU healthy (no Xid, no throttling, clocks nominal)."
    exit 0
    ;;
esac