363 lines
13 KiB
Python
Executable File
363 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Render a Bumblebee NDJSON scan into a human-readable Markdown report.
|
|
|
|
Bumblebee emits one JSON record per line. Record types we know about:
|
|
|
|
- package — an inventory record for a discovered package
|
|
- finding — an exposure-catalog match (high signal)
|
|
- scan_summary — emitted once at end of run, contains counts/duration
|
|
- diagnostic — non-fatal warnings (skipped roots, parse errors)
|
|
|
|
Unknown record types are bucketed under "other" and counted but not
|
|
rendered in detail. The script never imports anything outside the
|
|
standard library — it has to run on whatever Python 3 ships with the
|
|
developer's machine.
|
|
|
|
Usage:
|
|
python3 render_report.py <input.ndjson> <output.md>
|
|
|
|
Exit codes:
|
|
0 success
|
|
1 usage error
|
|
2 input file unreadable or empty
|
|
3 no records parsed (likely malformed file)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
SEVERITY_ORDER = ["critical", "high", "medium", "low", "info", "unknown"]
|
|
|
|
|
|
def severity_rank(value: str | None) -> int:
|
|
"""Return a sort key for severities; unknown values sort last."""
|
|
if not value:
|
|
return len(SEVERITY_ORDER)
|
|
try:
|
|
return SEVERITY_ORDER.index(value.lower())
|
|
except ValueError:
|
|
return len(SEVERITY_ORDER)
|
|
|
|
|
|
def load_records(path: Path) -> list[dict[str, Any]]:
|
|
"""Parse NDJSON, tolerating blank lines and trailing whitespace.
|
|
|
|
Malformed lines are reported on stderr but do not abort the run —
|
|
Bumblebee can interleave records from multiple goroutines and a single
|
|
truncated line should not lose the rest of the report.
|
|
"""
|
|
records: list[dict[str, Any]] = []
|
|
with path.open("r", encoding="utf-8") as fh:
|
|
for lineno, raw in enumerate(fh, start=1):
|
|
line = raw.strip()
|
|
if not line:
|
|
continue
|
|
try:
|
|
records.append(json.loads(line))
|
|
except json.JSONDecodeError as exc:
|
|
print(
|
|
f"warning: skipping malformed line {lineno}: {exc}",
|
|
file=sys.stderr,
|
|
)
|
|
return records
|
|
|
|
|
|
def group_by_kind(records: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
|
|
groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
|
|
for rec in records:
|
|
kind = rec.get("record_type") or rec.get("type") or "unknown"
|
|
groups[kind].append(rec)
|
|
return groups
|
|
|
|
|
|
def render_findings(findings: list[dict[str, Any]]) -> str:
|
|
"""Findings are the most important section — render them first.
|
|
|
|
Each finding carries the matched package's identity (Bumblebee uses
|
|
`normalized_name` / `source_file` per docs/state-model.md) plus
|
|
catalog metadata. We look up by the canonical Bumblebee names first
|
|
and keep a small set of fallbacks so the helper still works against
|
|
older schemas or hand-rolled fixtures.
|
|
"""
|
|
if not findings:
|
|
return "## Findings\n\nNo exposure-catalog matches.\n"
|
|
|
|
# Sort by severity (critical first) then by package name.
|
|
sorted_findings = sorted(
|
|
findings,
|
|
key=lambda f: (
|
|
severity_rank(_get(f, "severity", "catalog_severity")),
|
|
_get(f, "normalized_name", "package", "name", "package_name") or "",
|
|
),
|
|
)
|
|
|
|
out = [f"## Findings\n\n**{len(sorted_findings)} match(es) against exposure catalog.**\n"]
|
|
for finding in sorted_findings:
|
|
severity = _get(finding, "severity", "catalog_severity") or "unknown"
|
|
catalog_id = _get(finding, "catalog_id", "advisory_id", "id") or "—"
|
|
catalog_name = _get(finding, "catalog_name", "advisory", "name") or ""
|
|
ecosystem = _get(finding, "ecosystem") or "?"
|
|
pkg = _get(finding, "normalized_name", "package", "name", "package_name") or "?"
|
|
version = _get(finding, "version", "matched_version") or "?"
|
|
# Bumblebee emits `source_file` (and often a `project_path`);
|
|
# legacy / demo records may use `source_path`. Render both when
|
|
# available — responders need that traceability.
|
|
source_file = _get(finding, "source_file", "source_path", "evidence_path", "path") or ""
|
|
project_path = _get(finding, "project_path") or ""
|
|
finding_type = _get(finding, "finding_type") or ""
|
|
|
|
out.append(f"### [{severity.upper()}] {pkg}@{version} ({ecosystem})")
|
|
out.append("")
|
|
out.append(f"- Catalog entry: `{catalog_id}`" + (f" — {catalog_name}" if catalog_name else ""))
|
|
if finding_type:
|
|
out.append(f"- Finding type: {finding_type}")
|
|
if source_file:
|
|
out.append(f"- Source file: `{source_file}`")
|
|
if project_path and project_path != source_file:
|
|
out.append(f"- Project path: `{project_path}`")
|
|
confidence = _get(finding, "confidence")
|
|
if confidence:
|
|
out.append(f"- Confidence: {confidence}")
|
|
root_kind = _get(finding, "root_kind")
|
|
if root_kind:
|
|
out.append(f"- Root kind: {root_kind}")
|
|
evidence = _get(finding, "evidence")
|
|
if evidence and isinstance(evidence, (str, int, float)):
|
|
out.append(f"- Evidence: {evidence}")
|
|
out.append("")
|
|
return "\n".join(out)
|
|
|
|
|
|
def render_inventory(packages: list[dict[str, Any]]) -> str:
|
|
if not packages:
|
|
return "## Inventory\n\nNo package records emitted (findings-only mode?).\n"
|
|
|
|
by_ecosystem: Counter[str] = Counter()
|
|
by_root_kind: Counter[str] = Counter()
|
|
by_confidence: Counter[str] = Counter()
|
|
for pkg in packages:
|
|
by_ecosystem[pkg.get("ecosystem", "unknown")] += 1
|
|
by_root_kind[pkg.get("root_kind", "unknown")] += 1
|
|
by_confidence[pkg.get("confidence", "unknown")] += 1
|
|
|
|
lines = [
|
|
"## Inventory",
|
|
"",
|
|
f"Total package records: **{len(packages):,}**",
|
|
"",
|
|
"### By ecosystem",
|
|
"",
|
|
"| Ecosystem | Count |",
|
|
"| --- | ---: |",
|
|
]
|
|
for eco, count in by_ecosystem.most_common():
|
|
lines.append(f"| {eco} | {count:,} |")
|
|
|
|
lines += [
|
|
"",
|
|
"### By root kind",
|
|
"",
|
|
"| Root kind | Count |",
|
|
"| --- | ---: |",
|
|
]
|
|
for kind, count in by_root_kind.most_common():
|
|
lines.append(f"| {kind} | {count:,} |")
|
|
|
|
lines += [
|
|
"",
|
|
"### By confidence",
|
|
"",
|
|
"| Confidence | Count |",
|
|
"| --- | ---: |",
|
|
]
|
|
for conf, count in by_confidence.most_common():
|
|
lines.append(f"| {conf} | {count:,} |")
|
|
lines.append("")
|
|
return "\n".join(lines)
|
|
|
|
|
|
def render_summary(summary_records: list[dict[str, Any]]) -> str:
|
|
"""Render scan_summary record(s).
|
|
|
|
Bumblebee's real scan_summary (per docs/state-model.md) is flat —
|
|
`scan_time`, `end_time`, `status`, `package_records_emitted`,
|
|
`findings_emitted`, `diagnostics_count`, `roots`, plus HTTP-sink
|
|
counters when applicable. We render those canonical fields first
|
|
and still fall back to the older `counts`/`totals` shape for
|
|
backwards compatibility with hand-rolled fixtures.
|
|
"""
|
|
if not summary_records:
|
|
return "## Scan summary\n\n_No `scan_summary` record found — the run may not have completed cleanly._\n"
|
|
|
|
# Bumblebee canonical scan_summary fields (per docs/state-model.md).
|
|
# Order matters: identity → status → timing → counts → delivery → versioning.
|
|
canonical_fields = (
|
|
"endpoint_id",
|
|
"profile",
|
|
"run_id",
|
|
"status",
|
|
"scan_time",
|
|
"end_time",
|
|
"timed_out",
|
|
"package_records_emitted",
|
|
"package_records_suppressed",
|
|
"findings_emitted",
|
|
"diagnostics_count",
|
|
"http_batches_attempted",
|
|
"http_batches_succeeded",
|
|
"http_batches_failed",
|
|
"http_last_status",
|
|
"scanner_version",
|
|
"schema_version",
|
|
)
|
|
|
|
out = ["## Scan summary", ""]
|
|
for idx, rec in enumerate(summary_records, start=1):
|
|
if len(summary_records) > 1:
|
|
out.append(f"### Summary {idx}")
|
|
out.append("")
|
|
|
|
status = rec.get("status")
|
|
if status and status != "complete":
|
|
out.append(f"> **Status `{status}`** — this run is not a trustworthy complete snapshot. Treat as raw evidence only.")
|
|
out.append("")
|
|
|
|
# Canonical fields
|
|
for key in canonical_fields:
|
|
if key in rec and rec[key] not in (None, ""):
|
|
out.append(f"- **{key}**: `{rec[key]}`")
|
|
|
|
# Roots can be a list of objects; render compactly.
|
|
roots = rec.get("roots")
|
|
if roots:
|
|
if isinstance(roots, list):
|
|
out.append(f"- **roots**: {len(roots)} root(s) scanned")
|
|
for r in roots[:20]:
|
|
if isinstance(r, dict):
|
|
# Common shape: {kind, path}
|
|
kind = r.get("kind") or r.get("root_kind") or "?"
|
|
path = r.get("path") or r.get("root") or "?"
|
|
out.append(f" - `{kind}`: `{path}`")
|
|
else:
|
|
out.append(f" - `{r}`")
|
|
if len(roots) > 20:
|
|
out.append(f" - _… {len(roots) - 20} more truncated_")
|
|
else:
|
|
out.append(f"- **roots**: `{roots}`")
|
|
|
|
# Legacy / fixture shape: nested counts dict
|
|
counts = rec.get("counts") or rec.get("totals") or {}
|
|
if counts:
|
|
out.append("- **counts** (legacy):")
|
|
for k, v in counts.items():
|
|
out.append(f" - {k}: {v}")
|
|
|
|
# Surface any remaining fields we didn't recognize so the helper
|
|
# never silently drops data when the schema gains new keys.
|
|
rendered = set(canonical_fields) | {"roots", "counts", "totals", "record_type", "type", "record_id"}
|
|
extras = {k: v for k, v in rec.items() if k not in rendered and v not in (None, "", [], {})}
|
|
if extras:
|
|
out.append("- **other fields**:")
|
|
for k, v in extras.items():
|
|
out.append(f" - {k}: `{v}`")
|
|
|
|
out.append("")
|
|
return "\n".join(out)
|
|
|
|
|
|
def render_diagnostics(diagnostics: list[dict[str, Any]]) -> str:
|
|
if not diagnostics:
|
|
return ""
|
|
out = ["## Diagnostics", "", f"{len(diagnostics)} diagnostic record(s) emitted on stdout."]
|
|
out.append("")
|
|
for diag in diagnostics[:50]: # Cap output — log file has full detail.
|
|
level = diag.get("level", "info")
|
|
msg = diag.get("message") or diag.get("msg") or json.dumps(diag, sort_keys=True)
|
|
path = diag.get("path") or diag.get("source_path") or ""
|
|
suffix = f" — `{path}`" if path else ""
|
|
out.append(f"- **{level}**: {msg}{suffix}")
|
|
if len(diagnostics) > 50:
|
|
out.append("")
|
|
out.append(f"_… {len(diagnostics) - 50} more diagnostic record(s) truncated. See the `.log` file for the full list._")
|
|
out.append("")
|
|
return "\n".join(out)
|
|
|
|
|
|
def _get(record: dict[str, Any], *keys: str) -> Any:
|
|
"""Return the first non-empty value found for any of the candidate keys."""
|
|
for key in keys:
|
|
value = record.get(key)
|
|
if value not in (None, "", [], {}):
|
|
return value
|
|
return None
|
|
|
|
|
|
def build_report(records: list[dict[str, Any]], source_path: Path) -> str:
|
|
groups = group_by_kind(records)
|
|
findings = groups.get("finding", []) + groups.get("findings", [])
|
|
packages = groups.get("package", []) + groups.get("packages", [])
|
|
summaries = groups.get("scan_summary", [])
|
|
diagnostics = groups.get("diagnostic", []) + groups.get("diagnostics", [])
|
|
|
|
other_kinds = {
|
|
kind: len(items)
|
|
for kind, items in groups.items()
|
|
if kind not in {"finding", "findings", "package", "packages", "scan_summary", "diagnostic", "diagnostics"}
|
|
}
|
|
|
|
generated = datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
header = [
|
|
"# Bumblebee Scan Report",
|
|
"",
|
|
f"- Source: `{source_path}`",
|
|
f"- Generated: `{generated}`",
|
|
f"- Total records: **{len(records):,}**",
|
|
]
|
|
if other_kinds:
|
|
header.append(f"- Other record types: {', '.join(f'{k} ({v})' for k, v in sorted(other_kinds.items()))}")
|
|
header.append("")
|
|
|
|
sections = [
|
|
"\n".join(header),
|
|
render_findings(findings),
|
|
render_summary(summaries),
|
|
render_inventory(packages),
|
|
render_diagnostics(diagnostics),
|
|
]
|
|
return "\n".join(s for s in sections if s).rstrip() + "\n"
|
|
|
|
|
|
def main(argv: list[str]) -> int:
|
|
if len(argv) != 3:
|
|
print(f"usage: {Path(argv[0]).name} <input.ndjson> <output.md>", file=sys.stderr)
|
|
return 1
|
|
|
|
input_path = Path(argv[1])
|
|
output_path = Path(argv[2])
|
|
|
|
if not input_path.exists() or input_path.stat().st_size == 0:
|
|
print(f"error: {input_path} is missing or empty", file=sys.stderr)
|
|
return 2
|
|
|
|
records = load_records(input_path)
|
|
if not records:
|
|
print("error: no JSON records parsed from input", file=sys.stderr)
|
|
return 3
|
|
|
|
report = build_report(records, input_path)
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
output_path.write_text(report, encoding="utf-8")
|
|
print(f"wrote {output_path} ({len(records)} records)")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main(sys.argv))
|