68 lines
2.9 KiB
Bash
68 lines
2.9 KiB
Bash
#!/usr/bin/env bash
#
# Memory + CPU + GPU profiler for AutoDL training; prints one TSV row per
# sample (5-second resolution by default).
# Catches val-phase memory spikes that can cgroup-wedge an instance.
#
# Usage: bash mem_monitor.sh > /root/autodl-tmp/runs/logs/mem.tsv 2>&1 &
#        Run in tmux session (separate from training tmux).
#
# Environment:
#   TRAIN_PROC            `pgrep -f` pattern selecting the training process
#                         tracked by the main_* columns, e.g. train.py or
#                         accelerate (default: src.train).
#   MEM_MONITOR_INTERVAL  Seconds to sleep between samples (default: 5).
#
# Output: TSV with columns:
#   timestamp cgroup_gb cpu_pct main_pid main_rss_gb main_threads main_fds
#   n_python total_python_rss_gb wandb_pid wandb_rss_gb gpu_util_pct gpu_mem_mb

# Only -u: every probe below is best-effort, so neither -e nor pipefail is
# enabled. A missing tool or a process that exits mid-sample must yield a 0
# in its column instead of stopping the monitor.
set -u

TRAIN_PROC="${TRAIN_PROC:-src.train}"
MEM_MONITOR_INTERVAL="${MEM_MONITOR_INTERVAL:-5}"

# Print $1 / $2 with two decimals. Values are handed to awk with -v instead of
# being spliced into the program text, so an empty value counts as 0 rather
# than producing an awk syntax error.
scaled() {
  awk -v value="$1" -v divisor="$2" 'BEGIN { printf "%.2f", value / divisor }'
}

# Print the current cgroup memory usage in bytes (0 when unavailable).
# Tries the cgroup v2 file first, then the cgroup v1 equivalent.
read_cgroup_bytes() {
  local path bytes
  for path in /sys/fs/cgroup/memory.current \
              /sys/fs/cgroup/memory/memory.usage_in_bytes; do
    if [[ -r "$path" ]]; then
      bytes=$(cat "$path" 2>/dev/null)
      if [[ -n "$bytes" ]]; then
        printf '%s\n' "$bytes"
        return 0
      fi
    fi
  done
  printf '0\n'
}

# Print "<user+system> <total>" cumulative tick counters taken from the
# aggregate "cpu" line of /proc/stat. Total is the sum of fields 2..9
# (user through steal); the guest fields are skipped because they are already
# accounted in user/nice. %.0f keeps large counters out of exponent notation.
read_cpu_ticks() {
  awk '$1 == "cpu" {
         total = 0
         for (i = 2; i <= 9; i++) total += $i
         printf "%.0f %.0f\n", $2 + $4, total
         exit
       }' /proc/stat 2>/dev/null
}

# Print the numeric value of field $2 (e.g. VmRSS, Threads) from
# /proc/$1/status, or 0 when the process is gone or the field is absent.
status_field() {
  local value
  value=$(awk -v key="$2:" '$1 == key { print $2; exit }' \
            "/proc/$1/status" 2>/dev/null)
  printf '%s\n' "${value:-0}"
}

# Print the first PID reported by `pgrep -f` for pattern $1 (nothing if no
# process matches). "--" keeps a pattern starting with "-" from being parsed
# as an option.
first_pid() {
  pgrep -f -- "$1" | head -n 1
}

# Print all arguments as one tab-separated line.
emit_row() {
  local IFS=$'\t'
  printf '%s\n' "$*"
}

# Print the header, then sample forever.
main() {
  local ts cgroup_gb cpu_pct cpu_busy cpu_total d_busy d_total
  local prev_busy=0 prev_total=0
  local main_pid main_rss_gb main_threads main_fds
  local n_python total_python_rss_gb
  local wandb_pid wandb_rss_gb
  local gpu_info gpu_util gpu_mem

  # Header
  printf 'timestamp\tcgroup_gb\tcpu_pct\tmain_pid\tmain_rss_gb\tmain_threads\tmain_fds\tn_python\ttotal_python_rss_gb\twandb_pid\twandb_rss_gb\tgpu_util_pct\tgpu_mem_mb\n'

  while true; do
    ts=$(date '+%Y-%m-%d %H:%M:%S')

    # cgroup current memory (bytes -> GB)
    cgroup_gb=$(scaled "$(read_cgroup_bytes)" 1073741824)

    # CPU usage (user + system) as a percentage of all ticks elapsed since the
    # previous sample. The first row has no previous sample, so it reports the
    # average since boot.
    read -r cpu_busy cpu_total <<<"$(read_cpu_ticks)"
    cpu_busy=${cpu_busy:-0}
    cpu_total=${cpu_total:-0}
    d_busy=$(( cpu_busy - prev_busy ))
    d_total=$(( cpu_total - prev_total ))
    if (( d_total > 0 )); then
      cpu_pct=$(awk -v busy="$d_busy" -v total="$d_total" \
                  'BEGIN { printf "%.1f", 100 * busy / total }')
    else
      cpu_pct=0.0
    fi
    prev_busy=$cpu_busy
    prev_total=$cpu_total

    # Main training process: PID, RSS, thread count, open file descriptors
    # (pattern overridable via $TRAIN_PROC).
    main_pid=$(first_pid "$TRAIN_PROC")
    if [[ -n "$main_pid" ]]; then
      main_rss_gb=$(scaled "$(status_field "$main_pid" VmRSS)" 1048576)
      main_threads=$(status_field "$main_pid" Threads)
      # Yields 0 when the fd directory is unreadable or the process has exited.
      main_fds=$(find "/proc/${main_pid}/fd" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l)
    else
      main_pid=0; main_rss_gb=0; main_threads=0; main_fds=0
    fi

    # Python processes: count and total RSS.
    # NOTE: the count matches "python" anywhere in the full command line,
    # while the RSS sum matches it in the process name only, so the two
    # columns can cover different sets of processes.
    n_python=$(pgrep -f python | wc -l)
    total_python_rss_gb=$(ps -eo rss,comm \
      | awk '$2 ~ /python/ { sum += $1 } END { printf "%.2f", sum / 1048576 }')

    # wandb process
    wandb_pid=$(first_pid wandb-service)
    if [[ -n "$wandb_pid" ]]; then
      wandb_rss_gb=$(scaled "$(status_field "$wandb_pid" VmRSS)" 1048576)
    else
      wandb_pid=0; wandb_rss_gb=0
    fi

    # GPU util + memory; only the first line nvidia-smi prints is used.
    gpu_info=$(nvidia-smi --query-gpu=utilization.gpu,memory.used \
                 --format=csv,noheader,nounits 2>/dev/null | head -n 1)
    if [[ "$gpu_info" == *,* ]]; then
      gpu_util=${gpu_info%%,*}
      gpu_mem=${gpu_info#*,}
      gpu_util=${gpu_util// /}
      gpu_mem=${gpu_mem// /}
    else
      # No "util, mem" pair (tool missing, or a message instead of data).
      gpu_util=
      gpu_mem=
    fi
    gpu_util=${gpu_util:-0}
    gpu_mem=${gpu_mem:-0}

    emit_row "$ts" "$cgroup_gb" "$cpu_pct" "$main_pid" "$main_rss_gb" \
      "$main_threads" "$main_fds" "$n_python" "$total_python_rss_gb" \
      "$wandb_pid" "$wandb_rss_gb" "$gpu_util" "$gpu_mem"

    # Fall back to the default interval if the override is not a valid
    # duration, so a typo cannot turn the monitor into a busy loop.
    sleep "$MEM_MONITOR_INTERVAL" 2>/dev/null || sleep 5
  done
}

main "$@"