playbook/brooks-lint/scripts/benchmark.mjs

/**
 * Parser-fidelity benchmark for brooks-lint.
 *
 * Reads evals/benchmark-corpus.json — a FROZEN corpus of real, model-generated
 * brooks-lint reports, each paired with an independently graded ground-truth
 * finding inventory. Runs the shipped report-parse.mjs / sarif.mjs against every
 * report and measures how faithfully the parser reproduces what the report says.
 *
 * Because the parser is deterministic and the corpus is frozen, the numbers are
 * exactly reproducible: anyone can re-run `npm run benchmark` and get the same
 * result. This benchmarks the PARSER (the SARIF/CI-gate plumbing), not the model
 * — model quality is measured separately by the 57-scenario suite (npm run evals:live).
 *
 * Exit code: 0 if every report is parsed faithfully and emits valid SARIF; 1 otherwise.
 */
import { readFileSync } from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { parseFindings, countFindings } from "./report-parse.mjs";
import { reportToSarif } from "./sarif.mjs";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const root = path.resolve(__dirname, "..");
const VALID_LEVELS = new Set(["error", "warning", "note"]);

/**
 * Normalise a list of risk codes and drop everything that is not R1–R6 / T1–T6.
 *
 * Each entry is stringified, uppercased and trimmed before being matched.
 * Duplicates are kept and input order is preserved. A null/undefined list
 * yields an empty array.
 */
function validCodes(codes) {
  const codePattern = /^[RT][1-6]$/;
  const list = codes ?? [];
  return list.flatMap((raw) => {
    const normalized = String(raw).toUpperCase().trim();
    return codePattern.test(normalized) ? [normalized] : [];
  });
}

/**
 * Count occurrences of each code → { code: n }.
 *
 * Counting happens in a Map and is converted to a plain object at the end, so
 * keys that collide with inherited Object.prototype members ("constructor",
 * "toString", "__proto__", …) are tallied like any other key instead of
 * picking up the inherited value as their starting count.
 *
 * @param {Iterable<string>} codes - codes to tally (duplicates expected).
 * @returns {Record<string, number>} own-property count per distinct code.
 */
function multiset(codes) {
  const counts = new Map();
  for (const code of codes) {
    counts.set(code, (counts.get(code) ?? 0) + 1);
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is
  // stored as a regular key rather than being routed to the prototype setter.
  return Object.fromEntries(counts);
}

/**
 * Grade a single corpus sample by running the parser and the SARIF converter
 * over its report and comparing the output with the sample's recorded truth.
 *
 * The returned row carries:
 *  - countMatch: whether critical/warning/suggestion counts all equal the truth;
 *  - sarifValid: whether the SARIF document passes the structural checks below;
 *  - tp / fp / fn: per-code multiset overlap between parsed and truth codes;
 *  - display strings and de-duplicated, sorted code lists for reporting.
 */
export function scoreReport(sample) {
  const findings = parseFindings(sample.report);
  const parsedCounts = countFindings(sample.report);
  const truth = sample.truth;

  const severities = ["critical", "warning", "suggestion"];
  const countMatch = severities.every((level) => parsedCounts[level] === truth[level]);

  // Codes are tallied with multiplicity: two findings sharing a code must both
  // show up, otherwise the missing one is counted as a false negative.
  const parsedCodes = validCodes(findings.map((finding) => finding.riskCode));
  const truthCodes = validCodes(truth.codes);
  const parsedTally = multiset(parsedCodes);
  const truthTally = multiset(truthCodes);
  const everyCode = new Set([...Object.keys(parsedTally), ...Object.keys(truthTally)]);

  let tp = 0;
  let fp = 0;
  let fn = 0;
  for (const code of everyCode) {
    const seen = parsedTally[code] ?? 0;
    const expected = truthTally[code] ?? 0;
    if (seen > expected) {
      tp += expected;
      fp += seen - expected;
    } else {
      tp += seen;
      fn += expected - seen;
    }
  }

  const sarif = reportToSarif(sample.report, { mode: sample.mode, toolVersion: "bench" });
  const knownRuleIds = new Set(sarif.runs[0].tool.driver.rules.map((rule) => rule.id));
  const results = sarif.runs[0].results;

  // Valid means: declared version 2.1.0, one result per parsed finding, and
  // every result has a recognised level and a ruleId listed in the driver rules.
  let sarifValid = false;
  if (sarif.version === "2.1.0" && results.length === findings.length) {
    sarifValid = results.every(
      (result) => VALID_LEVELS.has(result.level) && knownRuleIds.has(result.ruleId),
    );
  }

  const asTriple = (counts) => `${counts.critical}/${counts.warning}/${counts.suggestion}`;
  const uniqueSorted = (codes) => Array.from(new Set(codes)).sort();

  return {
    id: sample.id,
    mode: sample.mode,
    isFP: sample.isFP,
    countMatch,
    sarifValid,
    tp,
    fp,
    fn,
    truth: asTriple(truth),
    parser: asTriple(parsedCounts),
    truthCodes: uniqueSorted(truthCodes),
    parserCodes: uniqueSorted(parsedCodes),
  };
}

/**
 * Run scoreReport over every sample in the corpus and roll the rows up into
 * corpus-wide figures: number of samples, how many matched severity counts
 * exactly, how many produced valid SARIF, summed tp/fp/fn, and the derived
 * risk-code precision and recall.
 *
 * When a denominator (tp+fp or tp+fn) is zero it is replaced by 1, so the
 * corresponding ratio is reported as 0 rather than NaN.
 */
export function summarize(corpus) {
  const rows = corpus.samples.map((sample) => scoreReport(sample));

  let exact = 0;
  let sarifOk = 0;
  let tp = 0;
  let fp = 0;
  let fn = 0;
  for (const row of rows) {
    if (row.countMatch) exact += 1;
    if (row.sarifValid) sarifOk += 1;
    tp += row.tp;
    fp += row.fp;
    fn += row.fn;
  }

  const ratio = (hits, total) => hits / (total || 1);

  return {
    rows,
    n: rows.length,
    exact,
    sarifOk,
    tp,
    fp,
    fn,
    precision: ratio(tp, tp + fp),
    recall: ratio(tp, tp + fn),
  };
}

// CLI entry point: runs only when this file is executed directly (not when it
// is imported for its exports). Loads the frozen corpus, prints the per-report
// table and the aggregate figures, then sets the process exit status.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  const corpus = JSON.parse(readFileSync(path.join(root, "evals/benchmark-corpus.json"), "utf8"));
  const s = summarize(corpus);
  const hasSamples = s.n > 0;

  console.log("\nBrooks-Lint Parser-Fidelity Benchmark");
  console.log("=====================================");
  console.log(`Corpus: ${s.n} real model-generated reports (frozen) across ${new Set(s.rows.map((r) => r.mode)).size} modes`);
  console.table(s.rows.map((r) => ({
    id: r.id, mode: r.mode, FP: r.isFP ? "Y" : "",
    truth: r.truth, parser: r.parser, countMatch: r.countMatch,
    codes: r.parserCodes.join(",") || "-", sarif: r.sarifValid ? "ok" : "BAD",
  })));

  // Avoid printing "NaN%" when the corpus has no samples (0 / 0).
  const exactPct = hasSamples ? `${(100 * s.exact / s.n).toFixed(1)}%` : "n/a";
  console.log(`Exact severity-count match : ${s.exact}/${s.n} (${exactPct})`);
  console.log(`Risk-code precision        : ${(100 * s.precision).toFixed(1)}%   recall: ${(100 * s.recall).toFixed(1)}%   (tp=${s.tp} fp=${s.fp} fn=${s.fn})`);
  console.log(`SARIF 2.1.0 validity       : ${s.sarifOk}/${s.n}`);

  if (corpus.strictness?.length) {
    console.log("\nStrictness preset scoring (recorded single-run, fixed 2C/3W/1S findings):");
    console.table(corpus.strictness.map((x) => ({ preset: x.preset, expected: x.expected, modelScore: x.score, match: x.score === x.expected, leadsWithTopFixes: x.leadsWithTopFixes })));
  }

  // An empty corpus verifies nothing, so it must not be reported as a
  // "100%" pass: 0 === 0 would otherwise satisfy both equality checks.
  if (!hasSamples) {
    console.error("Corpus contains no samples — nothing was verified.");
  }
  const ok = hasSamples && s.exact === s.n && s.sarifOk === s.n;
  console.log(`\n${ok ? "PASS" : "FAIL"} — parser fidelity ${ok ? "100%" : "below threshold"} on the frozen corpus.`);

  // Set the exit code instead of calling process.exit(): the process then ends
  // on its own once pending stdout/stderr writes have been flushed, so the
  // report above is never truncated when output is piped.
  if (!ok) process.exitCode = 1;
}