#!/usr/bin/env bash
# Queue iterator for multi-ablation deployment — platform-agnostic.
#
# Usage: ./run_queue.sh <queue_file> [start_index]
#   queue_file   Text file with one ablation cell per line:
#                    <cfg> <task> [epochs]        (epochs defaults to 20)
#                Fields are separated by spaces/tabs. Blank lines and lines whose
#                first non-blank character is '#' are skipped and NOT counted.
#   start_index  1-based CELL number to start from (default 1 = run all). Pass N to
#                RESUME from ablation N (principle #8 — see
#                references/parallel_ablation.md §5).
#
# Environment:
#   PROXY_HOOK   Shell snippet eval'd once at startup (default: source /etc/network_turbo).
#   RUN_ONE      Script invoked per cell as: bash "$RUN_ONE" <cfg> <task> <epochs>.
#                Defaults to $DURABLE_DIR/run_one.sh.
#   DURABLE_DIR  Directory holding run_one.sh (default /root/autodl-fs); the
#                durable/shared mount described in §8 of the platform profile.
#
# Exit status: 0 if every executed cell exited 0; 1 on a usage/preflight error or
# if any cell failed.
#
# IMPORTANT: treat this script and the queue file as immutable while a queue is
# running. Script edits are only guaranteed to be picked up by a NEW launch, and
# the queue file is read line by line as the run progresses. Never overwrite
# either under a live run (references/gotchas_universal.md,
# never-mutate-inputs-under-a-live-run; principle #6).

# Network-accel hook so every child (incl. the tracker client) inherits it; set
# PROXY_HOOK=":" for a no-op on a clean box.
# See references/gotchas_universal.md and references/china-network.md.
#
# PROXY_HOOK is an OPERATOR-supplied profile snippet (source a file / module load /
# export ...), eval'd on purpose to run an arbitrary setup hook. Set it ONLY from
# your own trusted profile, never from untrusted or remote-derived input.
# Failures and stderr noise are deliberately ignored (best effort), and the hook
# runs BEFORE `set -u` so it may reference unset variables.
PROXY_HOOK="${PROXY_HOOK:-source /etc/network_turbo}"
eval "${PROXY_HOOK}" 2>/dev/null || true

set -u

# = $DURABLE_DIR/run_one.sh; export DURABLE_DIR or set RUN_ONE directly.
RUN_ONE="${RUN_ONE:-${DURABLE_DIR:-/root/autodl-fs}/run_one.sh}"

# Print the usage line to stderr and abort with status 1.
usage() {
  echo "Usage: $0 <queue_file> [start_index]" >&2
  exit 1
}

# Emit the current local time in the log-line format used below.
timestamp() {
  date +'%Y-%m-%d %H:%M:%S'
}

# Arg-count guard FIRST — under `set -u`, QUEUE="$1" below would abort with an
# unbound-variable error before the usage message could be printed.
if [ "$#" -lt 1 ]; then
  usage
fi

QUEUE="$1"
START="${2:-1}"

if [ -z "$QUEUE" ] || [ ! -f "$QUEUE" ]; then
  echo "ERROR: queue file not found: '$QUEUE'" >&2
  usage
fi

# A non-numeric start_index would make the `-lt` test below error out (and
# evaluate false) on every cell, silently running the whole queue from cell 1.
case "$START" in
  *[!0-9]*)
    echo "ERROR: start_index must be a non-negative integer, got '$START'" >&2
    usage
    ;;
esac

# Fail fast instead of letting every cell fail one by one with "No such file".
if [ ! -r "$RUN_ONE" ] || [ -d "$RUN_ONE" ]; then
  echo "ERROR: RUN_ONE script not readable: $RUN_ONE (export DURABLE_DIR or set RUN_ONE)" >&2
  exit 1
fi

HOSTNAME_SHORT=$(hostname -s)

# Count ablation CELLS only (skip #-comments + blank lines) so $TOTAL and the
# resume index are CELL numbers, not raw line numbers — `start_index=N` then
# resumes from ablation N regardless of how many comment/blank lines precede it
# (the loop below increments i only after the same skip guards).
TOTAL=$(grep -cvE '^[[:space:]]*(#|$)' -- "$QUEUE")
TOTAL="${TOTAL:-0}"

i=0
fail=0
failed_names=()

echo "=== Queue $(basename "$QUEUE"): $TOTAL ablations, starting from $START on $HOSTNAME_SHORT ==="

# A resume index past the end runs nothing yet still reports success; say so.
if [ "$START" -gt "$TOTAL" ]; then
  echo "WARNING: start_index $START exceeds $TOTAL cell(s); nothing will run" >&2
fi

# The queue is read on FD 3, not stdin: a child that reads stdin (ssh, python,
# an interactive prompt, ...) would otherwise swallow the remaining queue lines.
# `|| [ -n ... ]` keeps a final line that lacks a trailing newline, which
# `grep -c` above counts but a bare `read` would drop.
while IFS=$' \t' read -r -u 3 cfg task epochs || [ -n "${cfg:-}" ]; do
  # Skip comment/blank lines BEFORE counting so i (and the START resume index)
  # count CELLS, not lines.
  if [ -z "$cfg" ]; then
    continue
  fi
  case "$cfg" in
    \#*) continue ;; # skip #-prefixed comment lines
  esac

  i=$((i + 1))
  if [ "$i" -lt "$START" ]; then
    continue
  fi

  EPOCHS="${epochs:-20}"
  NAME=$(basename "$cfg" .yaml)

  echo "================================================================"
  echo "[$(timestamp)] [$i/$TOTAL] STARTING $NAME ($task, ${EPOCHS}ep)"
  echo "================================================================"

  # `3<&-` closes the queue descriptor for the child only.
  RC=0
  bash "$RUN_ONE" "$cfg" "$task" "$EPOCHS" 3<&- || RC=$?

  if [ "$RC" -ne 0 ]; then
    fail=$((fail + 1))
    failed_names+=("$NAME")
  fi
  echo "[$(timestamp)] [$i/$TOTAL] FINISHED $NAME (exit=$RC)"
done 3< "$QUEUE"

echo "================================================================"
if [ "$fail" -eq 0 ]; then
  echo "[$(timestamp)] === QUEUE DONE on $HOSTNAME_SHORT -- all $TOTAL cell(s) exited 0 ==="
else
  echo "[$(timestamp)] === QUEUE DONE on $HOSTNAME_SHORT -- $fail cell(s) FAILED: ${failed_names[*]} ==="
fi
echo "================================================================"

# Propagate failure: a queue with any failed cell must NOT exit 0, or tmux/patrol
# automation reads "QUEUE DONE" as success and a broken ablation hides for hours.
[ "$fail" -eq 0 ] || exit 1