# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "inspect-ai>=0.3.0",
#     "inspect-evals",
#     "vllm>=0.4.0",
#     "torch>=2.0.0",
#     "transformers>=4.40.0",
# ]
# ///

"""
Entry point script for running inspect-ai evaluations with vLLM or HuggingFace Transformers backend.

This script runs evaluations on custom HuggingFace models using local GPU inference,
separate from inference provider scripts (which use external APIs).

Usage (standalone):
    uv run scripts/inspect_vllm_uv.py --model "meta-llama/Llama-3.2-1B" --task "mmlu"

Model backends:
    - vllm: Fast inference with vLLM (recommended for large models)
    - hf: HuggingFace Transformers backend (broader model compatibility)
"""

from __future__ import annotations

# Standard library only: inspect-ai is driven as an external `inspect` command
# through subprocess rather than imported into this process.
import argparse
import os
import subprocess
import sys
from typing import Optional


def setup_environment() -> None:
    """Copy ``HF_TOKEN`` into the other HuggingFace token variables.

    Does nothing when ``HF_TOKEN`` is unset or empty. Variables that already
    have a value are left untouched.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        return

    # setdefault keeps any value the caller exported explicitly.
    for alias in ("HUGGING_FACE_HUB_TOKEN", "HF_HUB_TOKEN"):
        os.environ.setdefault(alias, token)


def run_inspect_vllm(
    model_id: str,
    task: str,
    limit: Optional[int] = None,
    max_connections: int = 4,
    temperature: float = 0.0,
    tensor_parallel_size: int = 1,
    gpu_memory_utilization: float = 0.8,
    dtype: str = "auto",
    trust_remote_code: bool = False,
    log_level: str = "info",
) -> None:
    """
    Run an inspect-ai evaluation with the vLLM backend via ``inspect eval``.

    Builds the command line, echoes it, and runs it as a child process.

    Args:
        model_id: HuggingFace model ID; passed to inspect as ``vllm/<model_id>``
        task: inspect-ai task to execute (e.g., "mmlu", "gsm8k")
        limit: Limit number of samples to evaluate; ``None`` omits ``--limit``
        max_connections: Maximum concurrent connections
        temperature: Sampling temperature
        tensor_parallel_size: Number of GPUs for tensor parallelism
        gpu_memory_utilization: GPU memory fraction
        dtype: Data type (auto, float16, bfloat16)
        trust_remote_code: Allow remote code execution
        log_level: Logging level

    Raises:
        SystemExit: with the child's exit code when the evaluation fails, or
            with 127 when the ``inspect`` executable cannot be found.
    """
    setup_environment()

    cmd = [
        "inspect",
        "eval",
        task,
        "--model",
        f"vllm/{model_id}",
        "--log-level",
        log_level,
        "--max-connections",
        str(max_connections),
        # vLLM supports temperature=0 unlike HF inference providers
        "--temperature",
        str(temperature),
    ]

    # Engine settings are not `inspect eval` options; they are handed to the
    # model provider as `-M key=value` model args. Only non-default values are
    # sent so vLLM keeps its own defaults otherwise.
    # NOTE(review): assumes the installed inspect-ai vllm provider forwards
    # these keys to the vLLM engine unchanged - confirm against its docs.
    model_args = []
    if tensor_parallel_size != 1:
        model_args.append(f"tensor_parallel_size={tensor_parallel_size}")
    if gpu_memory_utilization != 0.8:
        model_args.append(f"gpu_memory_utilization={gpu_memory_utilization}")
    if dtype != "auto":
        model_args.append(f"dtype={dtype}")
    if trust_remote_code:
        model_args.append("trust_remote_code=true")
    for model_arg in model_args:
        cmd.extend(["-M", model_arg])

    # `is not None` so an explicit 0 is forwarded instead of silently
    # turning into a full, unrestricted run.
    if limit is not None:
        cmd.extend(["--limit", str(limit)])

    print(f"Running: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)
    except FileNotFoundError:
        # The `inspect` entry point is missing from PATH.
        print(
            "Could not find the 'inspect' executable; is inspect-ai installed?",
            file=sys.stderr,
        )
        sys.exit(127)
    except subprocess.CalledProcessError as exc:
        print(f"Evaluation failed with exit code {exc.returncode}", file=sys.stderr)
        sys.exit(exc.returncode)

    print("Evaluation complete.")


def run_inspect_hf(
    model_id: str,
    task: str,
    limit: Optional[int] = None,
    max_connections: int = 1,
    temperature: float = 0.001,
    device: str = "auto",
    dtype: str = "auto",
    trust_remote_code: bool = False,
    log_level: str = "info",
) -> None:
    """
    Run an inspect-ai evaluation with the HuggingFace Transformers backend.

    Use this when vLLM doesn't support the model architecture. Builds the
    ``inspect eval`` command line, echoes it, and runs it as a child process.

    Args:
        model_id: HuggingFace model ID; passed to inspect as ``hf/<model_id>``
        task: inspect-ai task to execute
        limit: Limit number of samples; ``None`` omits ``--limit``
        max_connections: Maximum concurrent connections (keep low for memory)
        temperature: Sampling temperature
        device: Device to use (auto, cuda, cpu)
        dtype: Data type
        trust_remote_code: Allow remote code execution
        log_level: Logging level

    Raises:
        SystemExit: with the child's exit code when the evaluation fails, or
            with 127 when the ``inspect`` executable cannot be found.
    """
    setup_environment()

    cmd = [
        "inspect",
        "eval",
        task,
        "--model",
        f"hf/{model_id}",
        "--log-level",
        log_level,
        "--max-connections",
        str(max_connections),
        "--temperature",
        str(temperature),
    ]

    # Model-loading settings are not `inspect eval` options; they are handed
    # to the model provider as `-M key=value` model args. "auto"/False mean
    # "let the provider decide", so nothing is sent for them.
    model_args = []
    if device != "auto":
        model_args.append(f"device={device}")
    if dtype != "auto":
        # NOTE(review): assumes the hf provider forwards unrecognised model
        # args to transformers' from_pretrained, where the weight dtype is
        # named `torch_dtype` - confirm for the installed versions.
        model_args.append(f"torch_dtype={dtype}")
    if trust_remote_code:
        model_args.append("trust_remote_code=true")
    for model_arg in model_args:
        cmd.extend(["-M", model_arg])

    # `is not None` so an explicit 0 is forwarded instead of silently
    # turning into a full, unrestricted run.
    if limit is not None:
        cmd.extend(["--limit", str(limit)])

    print(f"Running: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)
    except FileNotFoundError:
        # The `inspect` entry point is missing from PATH.
        print(
            "Could not find the 'inspect' executable; is inspect-ai installed?",
            file=sys.stderr,
        )
        sys.exit(127)
    except subprocess.CalledProcessError as exc:
        print(f"Evaluation failed with exit code {exc.returncode}", file=sys.stderr)
        sys.exit(exc.returncode)

    print("Evaluation complete.")


def main() -> None:
    """Parse command-line options and run the evaluation on the chosen backend.

    ``--max-connections`` and ``--temperature`` default to ``None`` on the
    command line so each backend can apply its own default: 4 / 0.0 for vllm,
    1 / 0.001 for hf. A ``--max-connections`` of 0 also falls back to the
    backend default, while an explicit ``--temperature 0`` is honoured.
    """
    usage_examples = (
        "\n"
        "Examples:\n"
        "# Run MMLU with vLLM backend\n"
        "uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-1B --task mmlu\n"
        "\n"
        "# Run with HuggingFace Transformers backend\n"
        "uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-1B --task mmlu --backend hf\n"
        "\n"
        "# Run with limited samples for testing\n"
        "uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-1B --task mmlu --limit 10\n"
        "\n"
        "# Run on multiple GPUs with tensor parallelism\n"
        "uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-70B --task mmlu --tensor-parallel-size 4\n"
        "\n"
        "Available tasks (from inspect-evals):\n"
        "- mmlu: Massive Multitask Language Understanding\n"
        "- gsm8k: Grade School Math\n"
        "- hellaswag: Common sense reasoning\n"
        "- arc_challenge: AI2 Reasoning Challenge\n"
        "- truthfulqa: TruthfulQA benchmark\n"
        "- winogrande: Winograd Schema Challenge\n"
        "- humaneval: Code generation (HumanEval)\n"
        "\n"
    )

    cli = argparse.ArgumentParser(
        description="Run inspect-ai evaluations with vLLM or HuggingFace Transformers on custom models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Required selectors.
    cli.add_argument(
        "--model",
        required=True,
        help="HuggingFace model ID (e.g., meta-llama/Llama-3.2-1B)",
    )
    cli.add_argument(
        "--task",
        required=True,
        help="inspect-ai task to execute (e.g., mmlu, gsm8k)",
    )
    cli.add_argument(
        "--backend",
        choices=["vllm", "hf"],
        default="vllm",
        help="Model backend (default: vllm)",
    )

    # Evaluation controls; None means "use the backend's default".
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of samples to evaluate",
    )
    cli.add_argument(
        "--max-connections",
        type=int,
        default=None,
        help="Maximum concurrent connections (default: 4 for vllm, 1 for hf)",
    )
    cli.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Sampling temperature (default: 0.0 for vllm, 0.001 for hf)",
    )

    # Backend-specific model loading options.
    cli.add_argument(
        "--tensor-parallel-size",
        type=int,
        default=1,
        help="Number of GPUs for tensor parallelism (vLLM only, default: 1)",
    )
    cli.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.8,
        help="GPU memory fraction to use (vLLM only, default: 0.8)",
    )
    cli.add_argument(
        "--dtype",
        default="auto",
        choices=["auto", "float16", "bfloat16", "float32"],
        help="Data type for model weights (default: auto)",
    )
    cli.add_argument(
        "--device",
        default="auto",
        help="Device for HF backend (auto, cuda, cpu)",
    )
    cli.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Allow executing remote code from model repository",
    )
    cli.add_argument(
        "--log-level",
        default="info",
        choices=["debug", "info", "warning", "error"],
        help="Logging level (default: info)",
    )

    opts = cli.parse_args()

    # Keyword arguments both runners accept.
    shared = {
        "model_id": opts.model,
        "task": opts.task,
        "limit": opts.limit,
        "dtype": opts.dtype,
        "trust_remote_code": opts.trust_remote_code,
        "log_level": opts.log_level,
    }

    if opts.backend == "vllm":
        run_inspect_vllm(
            max_connections=opts.max_connections or 4,
            temperature=0.0 if opts.temperature is None else opts.temperature,
            tensor_parallel_size=opts.tensor_parallel_size,
            gpu_memory_utilization=opts.gpu_memory_utilization,
            **shared,
        )
        return

    run_inspect_hf(
        max_connections=opts.max_connections or 1,
        temperature=0.001 if opts.temperature is None else opts.temperature,
        device=opts.device,
        **shared,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()