playbook/brooks-lint/scripts/eval-utils.mjs

44 lines
1.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Shared eval classification utilities.
* Used by run-evals-live.mjs (runtime) and validate-repo.test.mjs (tests).
*/
// Only R1-R6 / T1-T6 are valid codes; \d+ would also match typos like R10 or
// stray text like "R20", polluting true/false-positive classification.
const RISK_CODE_RE = /\b([RT][1-6])\b/g;

/**
 * Collect the distinct risk codes mentioned in a piece of text.
 *
 * A code is the letter R or T followed by a single digit 1-6, delimited by
 * word boundaries on both sides (so "R10" or "XR1" do not count).
 *
 * @param {string} text - Text to scan; must provide a `match` method.
 * @returns {Set<string>} Unique codes in order of first appearance; empty when
 *   nothing matches.
 */
export function extractRiskCodes(text) {
  // With a global regex, match() yields every hit, or null when there are none.
  const hits = text.match(RISK_CODE_RE);
  if (hits == null) {
    return new Set();
  }
  return new Set(hits);
}
/**
 * Grade an AI response against an eval scenario.
 *
 * @param {object} scenario - Eval scenario. Fields read here:
 *   `no_health_score` (truthy: the response must not contain a Health Score),
 *   `no_risk_codes` (truthy: only risk-code overlap is checked), and
 *   `expected_output` (text whose risk codes form the expected set).
 * @param {string} aiText - The AI response to grade.
 * @returns {"pass"|"partial"|"fail"|"false-positive-pass"} Classification.
 */
export function classify(scenario, aiText) {
  const scoreReported = /Health\s+Score[:\s]+\d+/i.test(aiText);

  // Health-score-only scenarios are decided here; risk codes are never read.
  if (scenario.no_health_score) {
    if (scoreReported) {
      return "fail";
    }
    return "false-positive-pass";
  }

  const wanted = extractRiskCodes(scenario.expected_output);
  const seen = extractRiskCodes(aiText);

  // Risk-code-only scenarios: any code present in both sets is a failure.
  // NOTE(review): this implies expected_output lists the codes that must NOT
  // be reported for such scenarios - confirm against the scenario files.
  if (scenario.no_risk_codes) {
    for (const code of seen) {
      if (wanted.has(code)) {
        return "fail";
      }
    }
    return "false-positive-pass";
  }

  // The four-part structure must appear entirely in English or entirely in
  // Chinese; a mix of the two keyword sets does not count.
  const mentionsAll = (patterns) => patterns.every((re) => re.test(aiText));
  const ironLawPresent =
    mentionsAll([/\bSymptom\b/, /\bSource\b/, /\bConsequence\b/, /\bRemedy\b/]) ||
    mentionsAll([/症状/, /根源/, /后果/, /修复/]);

  let matched = 0;
  for (const code of wanted) {
    if (seen.has(code)) {
      matched += 1;
    }
  }
  const missed = wanted.size - matched;

  // Both "pass" and "partial" require the four-part structure.
  if (!ironLawPresent) {
    return "fail";
  }
  if (missed === 0 && scoreReported) {
    return "pass";
  }
  return matched > 0 ? "partial" : "fail";
}