# playbook/antigravity-awesome-skills/skills/hugging-face-community-evals/scripts/inspect_eval_uv.py

# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "inspect-ai>=0.3.0",
#     "inspect-evals",
#     "openai",
# ]
# ///

"""
Entry point script for running inspect-ai evaluations against Hugging Face inference providers.
"""

from __future__ import annotations

import argparse
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional


def _inspect_evals_tasks_root() -> Optional[Path]:
    """Locate the directory of the installed ``inspect_evals`` package.

    Returns:
        The directory containing the package's ``__file__``, or ``None`` when
        importing the package or reading its location raises any exception.
    """
    try:
        # Imported lazily so the script still starts when the package is absent.
        import inspect_evals as pkg

        package_file = Path(pkg.__file__)
    except Exception:
        # Best effort only: any failure means "no default tasks root".
        return None
    return package_file.parent


def _normalize_task(task: str) -> str:
    """Reduce a lighteval-style ``suite|task|shots`` string to its task name.

    The second ``|``-separated field is returned when it is non-empty; in
    every other case (no separator, or an empty second field) the input is
    returned unchanged.
    """
    _, separator, remainder = task.partition("|")
    if not separator:
        # No pipe at all: already a plain task name.
        return task
    # Second field is everything up to the next pipe (or the end of the string).
    name = remainder.split("|", 1)[0]
    return name if name else task


def main() -> None:
    """Parse CLI arguments and run ``inspect eval`` as a subprocess.

    The function:

    1. Parses ``--model``, ``--task``, ``--limit``, ``--tasks-root`` and
       ``--sandbox``.
    2. Copies ``HF_TOKEN`` (when set) into ``HUGGING_FACE_HUB_TOKEN`` and
       ``HF_HUB_TOKEN`` unless those variables are already defined.
    3. Builds the ``inspect eval`` command line and runs it, using the tasks
       root as the working directory when one is available.

    Raises:
        subprocess.CalledProcessError: If ``inspect eval`` exits with a
            non-zero status. A short message is written to stderr first and
            the exception is then re-raised.
    """
    parser = argparse.ArgumentParser(description="Inspect-ai job runner")
    parser.add_argument("--model", required=True, help="Model ID on Hugging Face Hub")
    parser.add_argument("--task", required=True, help="inspect-ai task to execute")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of samples to evaluate")
    parser.add_argument(
        "--tasks-root",
        default=None,
        help="Optional path to inspect task files. Defaults to the installed inspect_evals package.",
    )
    parser.add_argument(
        "--sandbox",
        default="local",
        help="Sandbox backend to use (default: local for HF jobs without Docker).",
    )
    args = parser.parse_args()

    # Ensure downstream libraries can read the token passed as a secret.
    # setdefault keeps any value the environment already provides.
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        os.environ.setdefault("HUGGING_FACE_HUB_TOKEN", hf_token)
        os.environ.setdefault("HF_HUB_TOKEN", hf_token)

    task = _normalize_task(args.task)

    if args.tasks_root:
        tasks_root: Optional[Path] = Path(args.tasks_root)
        if not tasks_root.is_dir():
            # An explicit path that cannot serve as a working directory is
            # reported instead of being dropped silently; the run then falls
            # back to the current directory.
            print(
                f"Warning: --tasks-root {tasks_root} is not an existing directory; "
                "running from the current directory instead.",
                file=sys.stderr,
            )
            tasks_root = None
    else:
        tasks_root = _inspect_evals_tasks_root()
        if tasks_root is not None and not tasks_root.is_dir():
            tasks_root = None

    cmd = [
        "inspect",
        "eval",
        task,
        "--model",
        f"hf-inference-providers/{args.model}",
        "--log-level",
        "info",
        # Allow a single connection at a time.
        # NOTE(review): the previous comment described this as a batch-size
        # reduction to avoid OOM errors (default 32); --max-connections looks
        # like a cap on concurrent model connections -- confirm the intent.
        "--max-connections",
        "1",
        # Set a small positive temperature (HF doesn't allow temperature=0)
        "--temperature",
        "0.001",
    ]

    if args.sandbox:
        cmd.extend(["--sandbox", args.sandbox])

    # Compare against None rather than truthiness so an explicit `--limit 0`
    # is forwarded instead of silently turning into an unlimited run.
    if args.limit is not None:
        cmd.extend(["--limit", str(args.limit)])

    try:
        subprocess.run(cmd, check=True, cwd=tasks_root)
        print("Evaluation complete.")
    except subprocess.CalledProcessError as exc:
        location = f" (cwd={tasks_root})" if tasks_root is not None else ""
        print(f"Evaluation failed with exit code {exc.returncode}{location}", file=sys.stderr)
        raise


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()