69 lines
2.5 KiB
Python
69 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Structural retrieval-reachability check for the remote-gpu-trainer skill.
|
|
|
|
For each scenario in cases.jsonl, assert that the answer is actually PRESENT in the
|
|
skill, at the documented location, with the expected entry IDs / keywords intact:
|
|
|
|
- every `expect_files` path exists
|
|
- every `expect_ids` appears as a `### <ID>` header in one of those files
|
|
- every `expect_grep` keyword appears (case-insensitive) in one of those files
|
|
|
|
This is the cheap, no-API-key tier: it does NOT prove an agent *navigates* there
|
|
(that is the agentic tier — see RESULTS.md), and it does NOT prove the platform
|
|
FACTS are correct on a live box (see the README "Verification status"). What it
|
|
DOES catch is drift: a renamed/removed entry ID, a moved section, a deleted file,
|
|
or a fact rewritten away from a key term — i.e. a regression in the skill's known
|
|
load-bearing capabilities.
|
|
|
|
Usage: python evals/run_evals.py # exits 1 if any case fails
|
|
"""
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Directory containing this script; the cases.jsonl scenario file sits beside it.
_EVALS_DIR = Path(__file__).resolve().parent

# REPO is the parent of the script's directory; `expect_files` entries are
# resolved relative to it.
REPO = _EVALS_DIR.parent
CASES = _EVALS_DIR / "cases.jsonl"
|
|
|
|
|
|
def header_present(text, id_):
    """Return True if *text* contains a ``### <id_>`` header line.

    A header is a line that starts with ``###``, followed by one or more
    spaces/tabs on the same line, followed by the literal ID. The ID must end
    at that point, so ``O1`` matches ``### O1 ...`` but not ``### O10 ...``.

    Args:
        text: Contents of one file to search.
        id_: Entry ID to look for; it is regex-escaped and matched literally.

    Returns:
        bool: whether such a header line exists anywhere in *text*.
    """
    # `[ \t]+` instead of `\s+`: under (?m), `\s+` also consumes newlines, so a
    # bare `###` line followed by a line starting with the ID would wrongly
    # count as a header.
    # `(?!\w)` instead of `\b`: identical for IDs ending in a word character,
    # but `\b` can never match after an ID ending in punctuation (e.g. "C++")
    # when whitespace or end-of-line follows, so such headers were missed.
    pattern = r"(?m)^###[ \t]+" + re.escape(id_) + r"(?!\w)"
    return re.search(pattern, text) is not None
|
|
|
|
|
|
def main():
    """Check every scenario in the cases file and print a PASS/FAIL report.

    Each non-blank line of CASES is parsed as one JSON object. For a case:

    - every path in ``expect_files`` (relative to REPO) must be a readable file;
    - every ID in ``expect_ids`` must appear as a ``### <ID>`` header in at
      least one of those files;
    - every keyword in ``expect_grep`` must occur, case-insensitively, in the
      combined text of those files.

    The three ``expect_*`` keys are optional and default to empty lists; the
    ``id`` key is required and is used to label the report line.

    Returns:
        int: 1 if at least one case failed, otherwise 0.
    """
    raw_lines = CASES.read_text(encoding="utf-8").splitlines()
    cases = [json.loads(line) for line in raw_lines if line.strip()]

    passed = failed = 0
    for case in cases:
        problems = []
        blobs = []

        for rel in case.get("expect_files", []):
            path = REPO / rel
            # is_file() rather than exists(): a directory (or other non-file)
            # passes exists() but cannot be read, which would abort the whole
            # run with an exception instead of recording a failed case.
            if path.is_file():
                blobs.append(path.read_text(encoding="utf-8"))
            else:
                problems.append(f"missing file: {rel}")

        # Keywords are matched against all expected files at once, lowercased
        # so the comparison is case-insensitive.
        low = "\n".join(blobs).lower()

        for entry_id in case.get("expect_ids", []):
            if not any(header_present(blob, entry_id) for blob in blobs):
                problems.append(f"missing entry id: {entry_id}")

        for kw in case.get("expect_grep", []):
            if kw.lower() not in low:
                problems.append(f"missing keyword: {kw!r}")

        if problems:
            status = "FAIL"
            failed += 1
        else:
            status = "PASS"
            passed += 1

        print(f"[{status}] {case['id']}")
        for problem in problems:
            print(f" - {problem}")

    print(f"\n{passed}/{passed + failed} cases reachable" + ("" if not failed else f" ({failed} FAILED)"))
    return 1 if failed else 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    exit_code = main()
    sys.exit(exit_code)
|