playbook/antigravity-awesome-skills/plugins/antigravity-awesome-skills-.../skills/remote-gpu-trainer/scripts/download_loop.sh

68 lines
3.0 KiB
Bash

#!/usr/bin/env bash
# Per-dir resumable download loop — robust to mid-transfer connection drops.
#
# Each dir is pulled in its own session, so one network blip never loses the rest,
# and re-running skips already-complete dirs. A single `scp -r` of a huge tree dies
# on any blip and does NOT resume — see references/gotchas_universal.md (transfer
# resets). This uses rsync --partial, which resumes a half-pulled dir in place.
#
# Usage (override any var from the environment):
# LOCAL_TARGET=/path/to/local/final_ckpts \
# REMOTE_ALIAS=my-gpu-1 \
# REMOTE_PATH=/durable/final_ckpts \
# bash download_loop.sh
#
# NOTE: `du -sb` is GNU coreutils. With a non-GNU local `du` (macOS/Windows) the
# size lookup fails, so the size-skip heuristic never triggers and every existing
# dir is simply re-synced by rsync; the download itself is unaffected.
set -u

# ---- Configuration: every value can be overridden from the environment ----
LOCAL_TARGET="${LOCAL_TARGET:-/path/to/local/final_ckpts}"
REMOTE_ALIAS="${REMOTE_ALIAS:-my-gpu-1}"
REMOTE_PATH="${REMOTE_PATH:-/root/autodl-fs/final_ckpts}" # override from your profile (durable mount)
MIN_DIR_SIZE_BYTES="${MIN_DIR_SIZE_BYTES:-2000000000}" # 2 GB = "looks complete"

# Everything below uses paths relative to LOCAL_TARGET (./, "$d", marker files),
# so failing to enter it must abort the run.
mkdir -p "$LOCAL_TARGET"
cd "$LOCAL_TARGET" || { echo "ERROR: cannot enter local target $LOCAL_TARGET" >&2; exit 1; }

echo "Listing remote dirs in $REMOTE_ALIAS:$REMOTE_PATH ..."
# Capture the listing AND its exit status separately. Feeding the ssh output
# straight into an array through process substitution would discard ssh's exit
# code, so an unreachable host or a wrong path would yield an empty array that
# reads as "nothing to download" -- a silent success right before a pre-teardown
# pull (principle #3). Fail loud on a listing error instead.
#
# REMOTE_PATH is embedded in a single-quoted remote command, so any single quote
# inside it is rewritten as '\'' (close quote, escaped quote, reopen quote);
# otherwise such a path would terminate the quoting early on the remote side.
sq="'"
sq_escaped="'\\''"
remote_path_q=${REMOTE_PATH//$sq/$sq_escaped}
remote_listing=$(ssh -o ConnectTimeout=15 "$REMOTE_ALIAS" "ls -1 -- '$remote_path_q'")
ssh_rc=$?
if [ "$ssh_rc" -ne 0 ]; then
  echo "ERROR: could not list $REMOTE_ALIAS:$REMOTE_PATH (ssh/ls exit $ssh_rc) -- refusing to treat an unreachable host as an empty download." >&2
  exit 1
fi

# Split the listing into one array element per line. `IFS= read -r` keeps names
# with spaces or backslashes intact, and blank lines (including the single empty
# line a here-string produces for an empty listing) are dropped, so an empty
# listing yields 0 elements. A read loop is used rather than `mapfile`, which
# only exists in bash >= 4.
remote_dirs=()
while IFS= read -r entry; do
  if [ -n "$entry" ]; then
    remote_dirs+=("$entry")
  fi
done <<< "$remote_listing"

n_total=${#remote_dirs[@]}
echo "Found $n_total remote dirs"
if [ "$n_total" -eq 0 ]; then
  echo "Remote dir is reachable but empty -- nothing to download."
  exit 0
fi

ok=0; skip=0; fail=0
for d in "${remote_dirs[@]}"; do
  # In-progress marker, kept next to (not inside) the pulled dir. It is created
  # before rsync starts and removed only after rsync succeeds, so a leftover
  # marker means an earlier pull of this entry did not finish.
  marker=".download_loop_incomplete.$d"

  if [ -d "$d" ]; then
    # The size check is only a heuristic ("big enough, so probably complete").
    # It is bypassed when a marker exists, because rsync --partial leaves
    # partial data in place and an interrupted pull can exceed the threshold.
    # Dirs without a marker (e.g. pulled by other means) still rely on the
    # heuristic alone; set MIN_DIR_SIZE_BYTES above any real dir size to force
    # an rsync pass over everything.
    if [ ! -e "$marker" ]; then
      # `--` keeps a name that starts with '-' from being parsed as a du option.
      # If du fails (e.g. no -b support), size is empty and counts as 0 below.
      size=$(du -sb -- "$d" 2>/dev/null | cut -f1)
      if [ "${size:-0}" -ge "$MIN_DIR_SIZE_BYTES" ]; then
        echo "SKIP $d (already complete)"
        skip=$((skip+1)); continue
      fi
    fi
    echo "RETRY $d (partial — rsync will resume in place)"
  fi

  echo "DOWNLOADING $d ..."
  : > "$marker" || echo "WARN: could not create in-progress marker for $d" >&2
  # One rsync session per entry, so a dropped connection only fails that entry.
  # NOTE(review): names containing spaces rely on rsync's remote-argument
  # handling; rsync releases before 3.2.4 re-split such names on the remote
  # side unless -s/--protect-args is passed -- confirm for the rsync in use.
  if rsync -az --partial -e 'ssh -o ConnectTimeout=15 -o ServerAliveInterval=60 -o ServerAliveCountMax=120' \
    "$REMOTE_ALIAS:$REMOTE_PATH/$d" ./ ; then
    rm -f -- "$marker"
    echo "OK $d"; ok=$((ok+1))
  else
    # Marker is left behind on purpose so the next run cannot size-skip this entry.
    echo "FAIL $d"; fail=$((fail+1))
  fi
done

echo
echo "=== Done === OK: $ok SKIP: $skip FAIL: $fail (of $n_total expected)"
echo "Local dirs now: $(find . -mindepth 1 -maxdepth 1 -type d | wc -l)"
# A non-zero exit tells callers (and wrappers that retry) that at least one entry failed.
if [ "$fail" -ne 0 ]; then
  echo "Re-run to retry the failed dirs (resumable)."
  exit 1
fi