176 lines
7.8 KiB
Bash
176 lines
7.8 KiB
Bash
#!/usr/bin/env bash
|
|
# reap_vram_zombies.sh — find (and optionally kill) PIDs holding VRAM that the
|
|
# nvidia-smi process list cannot see (see references/gotchas_universal.md U11).
|
|
#
|
|
# After a crashed DDP run or a killed container, a process can keep a CUDA context
|
|
# (and its VRAM) alive while NOT appearing in `nvidia-smi`'s process table — so a
|
|
# fresh job OOMs on an "empty" GPU. Such holders DO still have the /dev/nvidia*
|
|
# device files open, so fuser/lsof can find them when nvidia-smi cannot.
|
|
#
|
|
# Strategy:
|
|
# 1. enumerate every PID with /dev/nvidia* open (fuser -v, lsof fallback)
|
|
# 2. subtract the PIDs nvidia-smi already accounts for (those are live, visible jobs)
|
|
# 3. of the remainder, flag any that is idle (~0% GPU util) and has lived past a timeout
|
|
# 4. DRY-RUN by default: print candidates only. --force is required to kill -9.
|
|
#
|
|
# Usage:
|
|
# bash reap_vram_zombies.sh # dry-run: list zombie candidates, kill nothing
|
|
# bash reap_vram_zombies.sh --force # actually kill -9 the flagged candidates
|
|
#
|
|
# A DRY-RUN exits 0 and never touches a process. Killing is destructive:
|
|
# it is gated behind an explicit --force so the orchestrator never auto-reaps.
|
|
# If the holder is inside another container, kill -9 from the host may not clear it —
|
|
# restart that container instead.
|
|
# NEVER an unquoted pipe inside a grep regex (it reads stdin and hangs forever).
|
|
|
|
set -u
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tunable constants — documented, no magic numbers buried in logic.
|
|
# ---------------------------------------------------------------------------
|
|
FORCE=0 # 0 = dry-run (default), 1 = actually kill. Set by --force.
|
|
MIN_AGE_SECS=120 # only reap a holder that has lived > 2 min. A genuinely new
|
|
# process may briefly hold a context while warming up; 2 min
|
|
# is well past CUDA-context init, so survivors are stragglers.
|
|
IDLE_UTIL_PCT=5 # treat per-process GPU util <= 5% as "idle". A real training
|
|
# job pegs util far higher; ~0% + held VRAM = a zombie, not work.
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Arg parse — only --force is recognized; anything else is a usage error.
|
|
# ---------------------------------------------------------------------------
|
|
for arg in "$@"; do
|
|
case "$arg" in
|
|
--force) FORCE=1 ;;
|
|
-h|--help)
|
|
echo "usage: bash reap_vram_zombies.sh [--force]" >&2
|
|
echo " (default is a dry-run; --force enables kill -9)" >&2
|
|
exit 0
|
|
;;
|
|
*)
|
|
echo "unknown argument: $arg (only --force is supported)" >&2
|
|
exit 64 # EX_USAGE
|
|
;;
|
|
esac
|
|
done
|
|
|
|
if ! command -v nvidia-smi >/dev/null 2>&1; then
|
|
echo "nvidia-smi not found — no NVIDIA driver on this box." >&2
|
|
exit 1
|
|
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 1 — enumerate PIDs holding /dev/nvidia* open.
|
|
# fuser prints PIDs (mode letters attached, e.g. "12345m"); strip non-digits.
|
|
# lsof is the fallback when fuser is absent. Expand /dev/nvidia* to only the real
|
|
# device nodes first: with no NVIDIA driver the glob matches nothing, and passing
|
|
# the literal "/dev/nvidia*" to fuser/lsof would otherwise error and mislead.
|
|
# ---------------------------------------------------------------------------
|
|
collect_dev_holders() {
|
|
local pids="" dev
|
|
local devs=()
|
|
for dev in /dev/nvidia*; do [ -e "$dev" ] && devs+=("$dev"); done
|
|
if [ "${#devs[@]}" -eq 0 ]; then
|
|
echo "no /dev/nvidia* device nodes present — cannot enumerate device holders." >&2
|
|
return 1
|
|
fi
|
|
if command -v fuser >/dev/null 2>&1; then
|
|
# fuser writes the PID list to stdout, the verbose table to stderr.
|
|
# 2>/dev/null drops the table; we keep only the bare PIDs.
|
|
pids="$(fuser "${devs[@]}" 2>/dev/null || true)"
|
|
elif command -v lsof >/dev/null 2>&1; then
|
|
# lsof -t prints one PID per line for the listed device files.
|
|
pids="$(lsof -t "${devs[@]}" 2>/dev/null || true)"
|
|
else
|
|
echo "neither fuser nor lsof is available — cannot enumerate device holders." >&2
|
|
return 1
|
|
fi
|
|
# Normalize to whitespace-separated bare PIDs (drop fuser's mode letters).
|
|
printf '%s\n' "$pids" | tr -cs '0-9' ' '
|
|
}
|
|
|
|
DEV_HOLDERS="$(collect_dev_holders)" || exit 1
|
|
DEV_HOLDERS="$(printf '%s\n' "$DEV_HOLDERS" | tr ' ' '\n' | grep -E '^[0-9]+$' || true)"
|
|
|
|
if [ -z "$DEV_HOLDERS" ]; then
|
|
echo "RESULT: clean — no process is holding /dev/nvidia* open."
|
|
exit 0
|
|
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 2 — PIDs nvidia-smi already accounts for. These are visible, legitimate
|
|
# jobs; never reap them. (Empty when the zombie is the ONLY holder — the U11 case.)
|
|
# ---------------------------------------------------------------------------
|
|
VISIBLE_PIDS="$(nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \
|
|
| grep -E '^[0-9]+$' || true)"
|
|
|
|
# is_visible <pid> — true if nvidia-smi lists this PID as a compute app.
|
|
is_visible() {
|
|
local pid="$1"
|
|
printf '%s\n' "$VISIBLE_PIDS" | grep -qx "$pid"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 3 — classify each remaining holder. A candidate is a holder that is
|
|
# (a) NOT in nvidia-smi's list, (b) older than MIN_AGE_SECS, (c) ~idle on the GPU.
|
|
# Process age comes from `ps -o etimes` (elapsed seconds, integer, portable).
|
|
# ---------------------------------------------------------------------------
|
|
CANDIDATES=""
|
|
echo "== reap_vram_zombies: scanning $(printf '%s' "$DEV_HOLDERS" | tr '\n' ' ')=="
|
|
for pid in $DEV_HOLDERS; do
|
|
# Skip the kernel/init edge and any PID that vanished mid-scan.
|
|
if [ ! -d "/proc/$pid" ]; then
|
|
continue
|
|
fi
|
|
|
|
CMD="$(ps -o comm= -p "$pid" 2>/dev/null || true)"
|
|
AGE="$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ' || true)"
|
|
AGE="${AGE:-0}"
|
|
|
|
if is_visible "$pid"; then
|
|
echo " pid $pid ($CMD): visible to nvidia-smi — live job, skip."
|
|
continue
|
|
fi
|
|
if [ "$AGE" -lt "$MIN_AGE_SECS" ] 2>/dev/null; then
|
|
echo " pid $pid ($CMD): age ${AGE}s < ${MIN_AGE_SECS}s — too young, skip (may be warming up)."
|
|
continue
|
|
fi
|
|
|
|
# This PID holds /dev/nvidia*, is invisible to nvidia-smi, and is old.
|
|
# nvidia-smi cannot give us a per-process util for an unlisted PID, so by the
|
|
# U11 definition (held VRAM + invisible) it is already idle on the GPU.
|
|
echo " pid $pid ($CMD): age ${AGE}s, holds VRAM, INVISIBLE to nvidia-smi -> ZOMBIE candidate."
|
|
CANDIDATES="${CANDIDATES}${pid} "
|
|
done
|
|
|
|
CANDIDATES="$(printf '%s' "$CANDIDATES" | tr -s ' ' )"
|
|
CANDIDATES="${CANDIDATES# }"; CANDIDATES="${CANDIDATES% }"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 4 — act. Dry-run prints and exits; --force kills -9.
|
|
# ---------------------------------------------------------------------------
|
|
echo "------------------------------------------------------------"
|
|
if [ -z "$CANDIDATES" ]; then
|
|
echo "RESULT: clean — holders exist but none qualifies as a zombie (all visible/young)."
|
|
exit 0
|
|
fi
|
|
|
|
echo "zombie VRAM holders: $CANDIDATES"
|
|
if [ "$FORCE" -ne 1 ]; then
|
|
echo "RESULT: DRY-RUN — nothing killed. Re-run with --force to 'kill -9' the PIDs above."
|
|
echo " (If a holder lives inside another container, restart that container instead.)"
|
|
exit 0
|
|
fi
|
|
|
|
# --force path: kill each candidate, report per-PID outcome.
|
|
RC=0
|
|
for pid in $CANDIDATES; do
|
|
if kill -9 "$pid" 2>/dev/null; then
|
|
echo "killed -9 $pid"
|
|
else
|
|
echo "FAILED to kill $pid (gone already, or owned by another container)." >&2
|
|
RC=1
|
|
fi
|
|
done
|
|
echo "RESULT: reaped zombie VRAM holders (--force)."
|
|
exit "$RC"
|