#!/usr/bin/env bash
# Queue iterator for multi-ablation deployment — platform-agnostic.
#
# Network-accel hook so every child (incl. the tracker client) inherits it. Failures of the hook are
# silenced (stderr dropped, status ignored), so a clean box without /etc/network_turbo just proceeds;
# set PROXY_HOOK=":" to make the hook an explicit no-op.
# See references/gotchas_universal.md and references/china-network.md.
PROXY_HOOK="${PROXY_HOOK:-source /etc/network_turbo}"
# PROXY_HOOK is an OPERATOR-supplied profile snippet (source a file / module load / export ...), eval'd
# on purpose to run an arbitrary setup hook. Set it ONLY from your own trusted profile, never from
# untrusted or remote-derived input.
eval "${PROXY_HOOK}" 2>/dev/null || true
#
# Each queue line: <config_yaml_path> <task> [epochs]    (epochs defaults to 20)
# Calls $RUN_ONE per line — defaults to $DURABLE_DIR/run_one.sh (the durable/shared mount from
# profiles/<platform>.md §8). Export DURABLE_DIR, or set RUN_ONE directly if run_one.sh lives elsewhere.
#
# Usage: ./run_queue.sh <queue_file> [start_index]
#   start_index defaults to 1 (run all). Pass N to RESUME from ablation N (principle #8 — see
#   references/parallel_ablation.md §5).
#
# IMPORTANT: tmux/bash loads THIS script into memory at launch. Editing it mid-flight does NOT affect
# the running queue; only a NEW launch sees changes. Never overwrite it while a queue reads it
# (references/gotchas_universal.md, never-mutate-inputs-under-a-live-run; principle #6).
# nounset is enabled here, i.e. only after the PROXY_HOOK eval earlier in the file has run.
set -u

# Per-cell runner = <durable mount>/run_one.sh; export DURABLE_DIR (profile §8) or set RUN_ONE directly.
RUN_ONE="${RUN_ONE:-${DURABLE_DIR:-/root/autodl-fs}/run_one.sh}"

# Print the one-line usage synopsis to stderr.
usage() {
  echo "Usage: $0 <queue_file> [start_index]" >&2
}

# Arg-count guard FIRST — under `set -u`, QUEUE="$1" below would abort with an unbound-variable
# error before the Usage check could run. Guard so the Usage message is reachable.
if [ "$#" -lt 1 ]; then
  usage
  exit 1
fi

QUEUE="$1"
START="${2:-1}"   # 1-based CELL index to resume from; an empty/absent arg means "run all"

# An empty path also fails the -f test, so one check covers both cases.
if [ ! -f "$QUEUE" ]; then
  echo "ERROR: queue file not found: '$QUEUE'" >&2
  usage
  exit 1
fi

# START is compared with `[ "$i" -lt "$START" ]` in the loop. A non-numeric value makes that test
# error out (treated as false) on every cell, so a typo would silently re-run the queue from cell 1
# instead of resuming. Reject it up front.
case "$START" in
  *[!0-9]*)
    echo "ERROR: start_index must be a non-negative integer, got '$START'" >&2
    usage
    exit 1
    ;;
esac

HOSTNAME_SHORT=$(hostname -s)

# Print the number of ablation CELLS in queue file $1: every line that is neither blank nor a
# #-comment (leading whitespace allowed). `--` keeps a file name starting with '-' from being
# parsed as grep options.
count_queue_cells() {
  grep -cvE '^[[:space:]]*(#|$)' -- "$1"
}

# Count ablation CELLS only (skip #-comments + blank lines) so $TOTAL and the resume index are
# CELL numbers, not raw line numbers — `start_index=N` then resumes from ablation N regardless of how
# many comment/blank lines precede it (the loop below increments i only after the same skip guards).
TOTAL=$(count_queue_cells "$QUEUE")
i=0               # cells seen so far (1-based index of the current cell inside the loop)
fail=0            # number of cells whose runner exited non-zero
failed_names=()   # names of those cells, in queue order

echo "=== Queue $(basename "$QUEUE"): $TOTAL ablations, starting from $START on $HOSTNAME_SHORT ==="

# Run every ablation cell listed in $QUEUE, starting at cell number $START.
# Globals read:    QUEUE, START, TOTAL, RUN_ONE
# Globals updated: i (cells seen), fail (cells that exited non-zero), failed_names (their names)
# Outputs:         STARTING / FINISHED banners on stdout, plus whatever the runner prints
run_queue_cells() {
  local cfg task epochs n_epochs name rc

  # `|| [ -n "$cfg" ]` keeps a final queue line that has no trailing newline: read returns
  # non-zero at EOF but has still filled in the fields.
  while IFS=$' \t' read -r cfg task epochs || [ -n "$cfg" ]; do
    # Tolerate CRLF queue files: the carriage return sticks to whichever field came last.
    cfg=${cfg%$'\r'}
    task=${task%$'\r'}
    epochs=${epochs%$'\r'}

    # Skip comment/blank lines BEFORE counting so i (and the START resume index) count CELLS, not lines.
    [ -n "$cfg" ] || continue
    case "$cfg" in \#*) continue ;; esac   # skip #-prefixed comment lines
    i=$((i + 1))
    if [ "$i" -lt "$START" ]; then continue; fi

    n_epochs="${epochs:-20}"
    name=$(basename -- "$cfg" .yaml)

    echo "================================================================"
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$i/$TOTAL] STARTING $name ($task, ${n_epochs}ep)"
    echo "================================================================"

    # </dev/null: the loop's stdin IS the queue file; without this the runner (or anything it
    # spawns) could read from stdin and swallow the remaining cells.
    bash "$RUN_ONE" "$cfg" "$task" "$n_epochs" </dev/null
    rc=$?
    if [ "$rc" -ne 0 ]; then
      fail=$((fail + 1))
      failed_names+=("$name")
    fi

    echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$i/$TOTAL] FINISHED $name (exit=$rc)"
  done < "$QUEUE"
}

run_queue_cells

# Final summary banner: one line stating either that every cell exited 0 or how many failed and
# which ones, framed by separator rules.
# NOTE: the success line prints $TOTAL, the number of cells in the whole queue file; on a resumed
# run (start_index > 1) fewer cells than that were actually executed.
echo "================================================================"
if [ "$fail" -eq 0 ]; then
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- all $TOTAL cell(s) exited 0 ==="
else
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- $fail cell(s) FAILED: ${failed_names[*]} ==="
fi
echo "================================================================"
# Propagate failure: a queue with any failed cell must NOT exit 0, or tmux/patrol
# automation reads "QUEUE DONE" as success and a broken ablation hides for hours.
[ "$fail" -eq 0 ] || exit 1