126 lines
4.9 KiB
JavaScript
126 lines
4.9 KiB
JavaScript
/**
 * Eval suite structural validator.
 *
 * Checks that every scenario in evals/evals.json is structurally sound:
 *   - Required fields are present
 *   - IDs are sequential, starting at 1
 *   - mode is one of the supported modes
 *   - expected_output references at least one risk code (R1-R6 or T1-T6)
 *   - no_risk_codes / no_health_score are `true` when present and are never
 *     set together
 *
 * This does not execute skills against prompts — it validates that the eval
 * definitions themselves are complete and internally consistent, catching
 * authoring errors (missing fields, duplicate IDs, orphaned scenarios) before
 * they silently accumulate.
 *
 * Usage: node scripts/run-evals.mjs
 */
|
|
|
|
import { readFileSync } from "node:fs";
|
|
import path from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { VALID_MODES } from "./assemble-prompt.mjs";
|
|
import { PRODUCTION_RISK_COUNT, TEST_RISK_COUNT } from "./frontmatter.mjs";
|
|
|
|
// Locate evals/evals.json relative to this script (one directory above the
// script's own directory) so the result does not depend on the current
// working directory.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const root = path.resolve(__dirname, "..");

const evalsData = JSON.parse(
  readFileSync(path.join(root, "evals/evals.json"), "utf8"),
);
const evals = evalsData?.evals;

// Fail fast with a descriptive message. Without this guard a missing or
// mistyped "evals" key surfaces later as an opaque "cannot read properties of
// undefined" / "not iterable" error, and a string value would be silently
// iterated character by character.
if (!Array.isArray(evals)) {
  throw new TypeError(
    `evals/evals.json: expected a top-level "evals" array, got ${evals === null ? "null" : typeof evals}`,
  );
}

// Every scenario must define these keys.
const REQUIRED_FIELDS = ["id", "name", "prompt", "expected_output", "mode"];

// R1..R<PRODUCTION_RISK_COUNT> followed by T1..T<TEST_RISK_COUNT>.
const RISK_CODES = [
  ...Array.from({ length: PRODUCTION_RISK_COUNT }, (_, i) => `R${i + 1}`),
  ...Array.from({ length: TEST_RISK_COUNT }, (_, i) => `T${i + 1}`),
];

// Collected findings: any error makes the run fail; warnings are only
// reported.
const errors = [];
const warnings = [];
|
|
|
|
// ── Sequential ID check ────────────────────────────────────────────────────
//
// Each scenario's id must strictly equal its 1-based position in the array.
// Because the comparison is positional and strict, this also rejects
// duplicate ids, gaps, and ids stored as strings.

for (let i = 0; i < evals.length; i++) {
  const expectedId = i + 1;
  // Optional chaining so a null entry is recorded as an ID mismatch instead of
  // aborting the whole run with a TypeError.
  const actualId = evals[i]?.id;
  if (actualId !== expectedId) {
    errors.push(`Eval at index ${i}: expected id ${expectedId}, got ${JSON.stringify(actualId)}`);
  }
}
|
|
|
|
// ── Per-eval field and content checks ─────────────────────────────────────

// Fields whose content checks below only run on strings. A present value of
// any other type is reported explicitly; otherwise it would skip every
// content check and pass validation unnoticed (e.g. `"mode": 5`).
const STRING_FIELDS = ["name", "prompt", "expected_output", "mode"];

// One whole-token matcher per risk code, so that "R1" is not satisfied by
// "R10", "ERR1" or similar text that merely contains the code as a substring.
const RISK_CODE_PATTERNS = RISK_CODES.map((code) => new RegExp(`\\b${code}\\b`));

// A field counts as missing when it is absent or falsy; 0 is accepted as a
// present value.
const isMissing = (value) => !value && value !== 0;

for (const [index, ev] of [...evals].entries()) {
  // Entries that are not plain objects cannot be inspected field by field
  // (`in` throws on primitives, property access throws on null).
  if (ev === null || typeof ev !== "object" || Array.isArray(ev)) {
    errors.push(`Eval at index ${index}: expected an object, got ${JSON.stringify(ev)}`);
    continue;
  }

  const label = `Eval ${ev.id} "${ev.name ?? "<unnamed>"}"`;

  for (const field of REQUIRED_FIELDS) {
    if (isMissing(ev[field])) {
      errors.push(`${label}: missing required field '${field}'`);
    }
  }

  for (const field of STRING_FIELDS) {
    const value = ev[field];
    // Missing values were already reported above; only flag wrong types here.
    if (!isMissing(value) && typeof value !== "string") {
      errors.push(`${label}: '${field}' must be a string (got ${JSON.stringify(value)})`);
    }
  }

  if (typeof ev.prompt === "string" && ev.prompt.trim().length === 0) {
    errors.push(`${label}: 'prompt' is empty`);
  }

  if (typeof ev.expected_output === "string" && ev.expected_output.trim().length === 0) {
    errors.push(`${label}: 'expected_output' is empty`);
  }

  if (typeof ev.mode === "string" && !VALID_MODES.includes(ev.mode)) {
    errors.push(`${label}: 'mode' must be one of ${VALID_MODES.join(", ")} (got '${ev.mode}')`);
  }

  // expected_output should reference at least one risk code so reviewers know
  // which risk the scenario is testing. This is a warning, not an error.
  if (typeof ev.expected_output === "string") {
    const referencesRiskCode = RISK_CODE_PATTERNS.some((pattern) => pattern.test(ev.expected_output));
    if (!referencesRiskCode) {
      warnings.push(`${label}: expected_output does not reference any risk code (${RISK_CODES.join(", ")})`);
    }
  }

  // no_risk_codes and no_health_score are optional flags that put the live
  // runner into a false-positive classification mode. They are mutually
  // exclusive because allowing both would make the verdict indeterminate
  // (the no_health_score branch exits before risk-code analysis runs).
  if ("no_risk_codes" in ev && ev.no_risk_codes !== true) {
    errors.push(`${label}: 'no_risk_codes' must be true when present (got ${JSON.stringify(ev.no_risk_codes)})`);
  }
  if ("no_health_score" in ev && ev.no_health_score !== true) {
    errors.push(`${label}: 'no_health_score' must be true when present (got ${JSON.stringify(ev.no_health_score)})`);
  }
  if (ev.no_risk_codes && ev.no_health_score) {
    errors.push(`${label}: 'no_risk_codes' and 'no_health_score' are mutually exclusive`);
  }
}
|
|
|
|
// ── Report ─────────────────────────────────────────────────────────────────

// Error messages embed the scenario's name, which is free text. Classify
// messages with anchored patterns rather than plain substring search, so a
// scenario named e.g. "handler body is empty" cannot flip an unrelated summary
// line to FAIL.
//   - ID errors always start with "Eval at index <n>: expected id ".
//   - Field errors always end with the fixed "missing required field '<f>'"
//     or "'<f>' is empty" suffix.
const ID_ERROR_PATTERN = /^Eval at index \d+: expected id /;
const FIELD_ERROR_PATTERN = /: (?:missing required field '[^']*'|'[^']*' is empty)$/;

const idCheckPass = !errors.some((e) => ID_ERROR_PATTERN.test(e));
const fieldCheckPass = !errors.some((e) => FIELD_ERROR_PATTERN.test(e));
const riskCodePass = warnings.length === 0;

console.log("\nEval Suite Structural Validation");
console.log("=================================");
console.log(`Total scenarios : ${evals.length}`);
console.log(`Sequential IDs : ${idCheckPass ? "PASS" : "FAIL"}`);
console.log(`Required fields : ${fieldCheckPass ? "PASS" : "FAIL"}`);
console.log(`Risk code refs : ${riskCodePass ? "PASS" : `${warnings.length} warning(s)`}`);

if (errors.length > 0) {
  console.error("\nErrors:");
  for (const e of errors) console.error(` ✗ ${e}`);
}

if (warnings.length > 0) {
  console.warn("\nWarnings:");
  for (const w of warnings) console.warn(` ⚠ ${w}`);
}

if (errors.length === 0) {
  console.log(`\nAll structural checks passed (${evals.length} scenarios).`);
} else {
  // Set the exit code instead of calling process.exit(): exit() terminates
  // immediately and can drop output that is still buffered when stdout/stderr
  // are pipes (e.g. in CI). Warnings alone do not fail the run.
  process.exitCode = 1;
}
|