292 lines
12 KiB
JavaScript
292 lines
12 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
// Merges per-lane partial markdown files into one consolidated file per competitor.
|
|
//
|
|
// The subagent fan-out writes one partial per lane to: {OUTPUT_DIR}/partials/{slug}.{lane}.md
//   lane ∈ { marketing, discussion, social, news, technical, battle }
|
|
//
|
|
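// Each partial has its own YAML frontmatter + sections. The marketing partial owns
// the canonical frontmatter (pricing, features, etc.); the other lanes contribute
// Mentions / Benchmarks / Findings bullets, may fill canonical frontmatter fields the
// marketing partial left empty, and the battle lane supplies the Battle Card body. The merge: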
|
|
// 1. Starts from marketing.md's frontmatter as the canonical header
|
|
//   2. Appends body sections in the canonical order (Product, Pricing, Features,
//      Positioning, Comparison, Battle Card, Mentions, Benchmarks, Research Findings)
|
|
// 3. Unions all Mentions bullets across lanes, dedups by URL, sorts by date desc
|
|
// 4. Unions all Research Findings bullets across lanes
|
|
// 5. Unions all Benchmarks bullets
|
|
// 6. Writes the consolidated file to {OUTPUT_DIR}/{slug}.md
|
|
//
|
|
// Usage: node merge_partials.mjs <research-dir>
|
|
|
|
import { readdirSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
|
import { join } from 'path';
|
|
import { parseFrontmatter, parseBody, parseSections } from './md_utils.mjs';
|
|
|
|
// Lane suffixes recognized in partial filenames ({slug}.{lane}.md). Files whose
// lane is not listed here are ignored when partials are grouped.
const LANES = ['marketing', 'discussion', 'social', 'news', 'technical', 'battle'];

// CLI arguments: the first positional argument is the research directory.
const args = process.argv.slice(2);
const wantsHelp = args.includes('--help') || args.includes('-h');

// Print usage when help is requested (exit 0) or when no directory is given (exit 1).
// The lane list is derived from LANES so the help text cannot drift from the code.
if (wantsHelp || args.length === 0) {
  console.error(`Usage: node merge_partials.mjs <research-dir>

Reads {dir}/partials/{slug}.{lane}.md files and writes consolidated
{dir}/{slug}.md per competitor. Lanes: ${LANES.join(', ')}.`);
  process.exit(wantsHelp ? 0 : 1);
}

const dir = args[0];
const partialsDir = join(dir, 'partials');
|
|
|
|
/**
 * Collect the list items found in a section's text, normalized to "- ..." form.
 *
 * Each line is trimmed first, so nested/indented items are flattened. Lines that
 * are not list items (prose, headings, blank lines) are skipped.
 *
 * Recognized markers: "- ", "* ", "+ ", and ordered markers "1. " / "1) ".
 *
 * @param {string} sectionText - Raw text of one markdown section; may be empty/nullish.
 * @returns {string[]} Bullet lines, each starting with "- ", in source order.
 */
function extractBullets(sectionText) {
  if (!sectionText) return [];
  const bullets = [];
  for (const raw of sectionText.split('\n')) {
    const line = raw.trim();
    if (line.startsWith('- ')) {
      // Already in the normalized form — keep verbatim.
      bullets.push(line);
      continue;
    }
    // Alternate bullet markers and ordered-list markers are rewritten to "- ".
    // The required whitespace after the marker keeps "**bold**" lines from matching.
    const alt = line.match(/^(?:[*+]|\d+[.)])\s+(.*)$/);
    if (alt) bullets.push(`- ${alt[1]}`);
  }
  return bullets;
}
|
|
|
|
/**
 * Normalize a Mentions bullet to the canonical format that `compile_report.mjs`
 * parses: `- **[SourceType]** Title | Snippet (source: URL, YYYY-MM-DD)`.
 *
 * Lane subagents deviate in practice — at least three variants have been observed:
 *   A) discussion-style: `- **HN** — [Title](url) — snippet`
 *   B) news-style:       `- **2025-08-06** — [News] Outlet — "title" — url`
 *   C) canonical:        `- **[SourceType]** Title | Snippet (source: URL, YYYY-MM-DD)`
 * Rather than fighting prompt drift, normalize at merge time so downstream stays clean.
 *
 * @param {string} line - One bullet line (already trimmed, starting with "- ").
 * @returns {string} The canonical bullet, or `line` unchanged when it is already
 *   canonical or cannot be parsed (data is never dropped).
 */
function normalizeMentionBullet(line) {
  // Already canonical — nothing to do. The type may contain spaces or punctuation
  // (e.g. `[Hacker News]`), and a line only counts as canonical when it carries the
  // `(source:` tail; a bracketed type alone is not enough, because such lines still
  // need their URL moved into the canonical position by the patterns below.
  if (/^-\s*\*\*\[[^\]]+\]\*\*.*\(source:/.test(line)) return line;

  // First URL on the line, minus trailing punctuation picked up by \S+.
  const urlMatch = line.match(/https?:\/\/\S+/);
  const url = urlMatch ? urlMatch[0].replace(/[).,\]\s]+$/, '') : '';
  // First ISO date anywhere on the line.
  const dateMatch = line.match(/\b(\d{4}-\d{2}-\d{2})\b/);
  const date = dateMatch ? dateMatch[1] : '';

  // Pattern A — `- **SourceType** — [Title](url) — snippet` (e.g. discussion lane).
  // **SourceType** is bold, with or without the brackets used in canonical form.
  let m = line.match(/^-\s*\*\*([^*]+)\*\*\s*[—\-]\s*\[([^\]]+)\]\(([^)]+)\)\s*(?:[—\-]\s*(.*))?$/);
  if (m) {
    const [, rawType, title, linkUrl, snippet] = m;
    const sourceType = rawType.trim().replace(/^\[|\]$/g, '');
    const snippetStr = snippet && snippet.trim() ? ` | ${snippet.trim()}` : '';
    const dateStr = date ? `, ${date}` : '';
    return `- **[${sourceType}]** ${title.trim()}${snippetStr} (source: ${linkUrl}${dateStr})`;
  }

  // Pattern B — `- **YYYY-MM-DD** — [SourceType] Outlet — "title" — url` (e.g. news lane).
  m = line.match(/^-\s*\*\*(\d{4}-\d{2}-\d{2})\*\*\s*[—\-]\s*\[(\w+)\]\s+([^—]+?)\s*[—\-]\s*"?([^"]+?)"?\s*(?:[—\-]\s*(\S+))?\s*$/);
  if (m) {
    const [, dateStr, sourceType, outlet, title, trailingUrl] = m;
    // Only trust the trailing token when it is actually a URL; otherwise fall back
    // to the first URL found anywhere on the line.
    const finalUrl = trailingUrl && trailingUrl.startsWith('http') ? trailingUrl : url;
    const snippet = outlet.trim();
    return `- **[${sourceType}]** ${title.trim()}${snippet ? ` | ${snippet}` : ''} (source: ${finalUrl || ''}, ${dateStr})`;
  }

  // Pattern C — generic fallback: find any `**X**` tag + URL and format canonically.
  m = line.match(/^-\s*\*\*([^*]+)\*\*\s*(.*)/);
  if (m && url) {
    const rawType = m[1].trim().replace(/^\[|\]$/g, '');
    let sourceType = rawType;
    let rest = m[2];
    // If the leading token is a date, try to pull a later [Type] tag off the rest.
    if (/^\d{4}-\d{2}-\d{2}$/.test(rawType)) {
      const innerType = rest.match(/\[(\w+)\](\()?/);
      if (innerType) {
        sourceType = innerType[1];
        // A bare `[Type]` tag (not the text of a markdown link) must not be reused
        // as the title below, so drop it from the remaining text.
        if (!innerType[2]) rest = rest.replace(innerType[0], '');
      }
    }
    // Title: bracketed text if any, otherwise the remaining free text (URL and
    // decoration stripped, capped at 100 chars).
    const linkTextM = rest.match(/\[([^\]]+)\]/);
    const title = linkTextM
      ? linkTextM[1]
      : rest.replace(url, '').replace(/[—"]+/g, '').replace(/^\W+|\W+$/g, '').slice(0, 100);
    const dateStr = date ? `, ${date}` : '';
    return `- **[${sourceType}]** ${title.trim()} (source: ${url}${dateStr})`;
  }

  // Last resort — leave line untouched (preserves data even if un-parseable).
  return line;
}
|
|
|
|
/**
 * Extract the URL from a canonical bullet's `(source: URL, DATE)` tail.
 *
 * The URL is read up to the first comma or closing parenthesis after `(source:`.
 *
 * @param {string} bullet - A bullet line, ideally in canonical form.
 * @returns {string|null} The trimmed URL, or null when there is no `(source:` tail
 *   or the URL slot is blank (e.g. `(source: , 2025-08-06)`).
 */
function urlOf(bullet) {
  const m = bullet.match(/\(source:\s*([^,)]+)/);
  const url = m ? m[1].trim() : '';
  // A whitespace-only capture means "no URL"; report it the same way as a missing tail.
  return url || null;
}
|
|
|
|
/**
 * Extract the ISO date from a canonical bullet's `(source: URL, YYYY-MM-DD)` tail.
 *
 * The date is the first `, YYYY-MM-DD` found inside the source parenthetical, so a
 * URL that itself contains commas does not hide the date.
 *
 * @param {string} bullet - A bullet line, ideally in canonical form.
 * @returns {string} The date as `YYYY-MM-DD`, or '' when the bullet has none.
 */
function dateOf(bullet) {
  // `[^)]*?` keeps the search inside the parenthetical and expands lazily until a
  // comma followed by an ISO date is reached.
  const m = bullet.match(/\(source:[^)]*?,\s*(\d{4}-\d{2}-\d{2})/);
  return m ? m[1] : '';
}
|
|
|
|
// List the partial files. A missing partials directory is an expected "nothing to
// do" condition (exit 0); any other read failure (permissions, I/O) is a real error
// and is reported as such instead of being mistaken for an empty run.
let files;
try {
  files = readdirSync(partialsDir);
} catch (err) {
  const isMissing = err && (err.code === 'ENOENT' || err.code === 'ENOTDIR');
  if (!isMissing) {
    console.error(`Cannot read partials directory ${partialsDir}: ${err && err.message ? err.message : err}`);
    process.exit(1);
  }
  console.error(`No partials directory at ${partialsDir} — nothing to merge.`);
  process.exit(0);
}

// Group partials by slug: slug -> { [lane]: { fm, body } }.
const bySlug = new Map();
for (const f of files) {
  // Filename shape is {slug}.{lane}.md; the greedy slug lets slugs contain dots.
  const m = f.match(/^(.+)\.([a-z]+)\.md$/);
  if (!m) continue;
  const [, slug, lane] = m;
  // Ignore files whose lane suffix is not one of the known lanes.
  if (!LANES.includes(lane)) continue;
  if (!bySlug.has(slug)) bySlug.set(slug, {});
  const content = readFileSync(join(partialsDir, f), 'utf-8');
  bySlug.get(slug)[lane] = { fm: parseFrontmatter(content), body: parseBody(content) };
}
|
|
|
|
// Frontmatter whitelist — canonical fields only. Non-marketing lane subagents
// sometimes leak ad-hoc meta fields (notes, searches_run, lane, etc.) into their
// partial's frontmatter; those are debug/summary fields, not canonical data, and are
// dropped by only copying the fields listed here.
const CANONICAL_FIELDS = [
  'competitor_name', 'website', 'pricing_url',
  'tagline', 'positioning', 'product_description', 'target_customer',
  'pricing_model', 'pricing_tiers', 'key_features', 'integrations',
  'headquarters', 'founded', 'employee_estimate', 'funding_info',
  'strategic_diff',
];

// Subagents drift on canonical field names too. Common aliases observed in real runs:
// `competitor` → `competitor_name` (browsaur marketing subagent), `homepage` → `website`,
// `price_tiers` → `pricing_tiers`. Accept aliases silently.
//
// NOTE: a bare `pricing` key is mapped to `pricing_model`, NOT `pricing_tiers`. In practice
// subagents use `pricing` for a pricing *model* or prose summary ("usage-based", "$0.005/req")
// far more often than for an enumerated tier list, so routing it to `pricing_tiers` corrupted
// the structured tier data the overview/matrix render from. Use `price_tiers`/`pricing_tiers`
// explicitly for tiers.
const FIELD_ALIASES = {
  'competitor': 'competitor_name',
  'name': 'competitor_name',
  'company': 'competitor_name',
  'homepage': 'website',
  'url': 'website',
  'price_tiers': 'pricing_tiers',
  'pricing': 'pricing_model',
};

/**
 * Look up a canonical field in a frontmatter object, honoring FIELD_ALIASES.
 *
 * @param {object} fm - Parsed frontmatter of one partial.
 * @param {string} key - Canonical field name.
 * @returns {*} The first truthy value under the canonical key or one of its aliases
 *   (aliases are tried in FIELD_ALIASES order), or undefined when none is set.
 */
function canonicalValue(fm, key) {
  if (fm[key]) return fm[key];
  for (const [alias, canonical] of Object.entries(FIELD_ALIASES)) {
    if (canonical === key && fm[alias]) return fm[alias];
  }
  return undefined;
}

// Merge every slug that has a marketing partial into {dir}/{slug}.md.
let merged = 0;
for (const [slug, lanes] of bySlug.entries()) {
  const marketing = lanes.marketing;
  if (!marketing || !marketing.fm) {
    console.error(`[skip] ${slug}: no marketing partial — cannot form canonical frontmatter`);
    continue;
  }

  // Union body sections: heading -> array of section bodies, in LANES order
  // (so index 0 is the marketing lane's copy whenever marketing has that section).
  const allSections = {};
  for (const lane of LANES) {
    if (!lanes[lane]) continue;
    const secs = parseSections(lanes[lane].body);
    for (const [k, v] of Object.entries(secs)) {
      if (!allSections[k]) allSections[k] = [];
      allSections[k].push(v);
    }
  }

  // Normalize → dedup Mentions by URL, sort by date desc
  const rawBullets = (allSections['Mentions'] || []).flatMap(s => extractBullets(s));
  const mentionBullets = rawBullets.map(normalizeMentionBullet);
  const seenUrls = new Set();
  const dedupedMentions = [];
  for (const b of mentionBullets) {
    const u = urlOf(b);
    const key = u || b; // fallback to bullet text if no URL
    if (seenUrls.has(key)) continue;
    seenUrls.add(key);
    dedupedMentions.push(b);
  }
  // Dated bullets first (newest to oldest); undated bullets keep their relative order after them.
  dedupedMentions.sort((a, b) => {
    const da = dateOf(a), db = dateOf(b);
    if (da && db) return db.localeCompare(da);
    if (da) return -1;
    if (db) return 1;
    return 0;
  });

  // Dedup Benchmarks by URL
  const benchmarkBullets = (allSections['Benchmarks'] || []).flatMap(s => extractBullets(s));
  const seenBench = new Set();
  const dedupedBench = [];
  for (const b of benchmarkBullets) {
    const m = b.match(/https?:\/\/\S+/);
    // \S+ also swallows trailing ")", "," or "." from the surrounding markup; strip
    // it so the same URL written as "(url)" and "url," yields one dedup key.
    const key = m ? m[0].replace(/[).,\]\s]+$/, '') : b;
    if (seenBench.has(key)) continue;
    seenBench.add(key);
    dedupedBench.push(b);
  }

  // Dedup Findings loosely (by exact text)
  const findingBullets = (allSections['Research Findings'] || []).flatMap(s => extractBullets(s));
  const dedupedFindings = [...new Set(findingBullets)];

  // Prefer the first lane's copy (marketing, when present) for
  // Product/Pricing/Features/Positioning.
  const first = (key) => {
    const arr = allSections[key] || [];
    return arr.length ? arr[0] : '';
  };

  // Rebuild frontmatter from the marketing partial, whitelisted fields only.
  const mergedFm = {};
  for (const k of CANONICAL_FIELDS) {
    const v = canonicalValue(marketing.fm, k);
    if (v) mergedFm[k] = v;
  }
  // Other lanes may fill in canonical gaps (e.g. funding_info from news, strategic_diff from technical).
  for (const lane of LANES) {
    if (lane === 'marketing' || !lanes[lane] || !lanes[lane].fm) continue;
    for (const k of CANONICAL_FIELDS) {
      if (!mergedFm[k]) {
        const v = canonicalValue(lanes[lane].fm, k);
        if (v) mergedFm[k] = v;
      }
    }
  }

  // NOTE(review): values are written via string interpolation. This presumes
  // parseFrontmatter returns each value as already-serialized text; if it returns
  // arrays/objects for list fields they would not round-trip — confirm against md_utils.
  const fmLines = Object.entries(mergedFm).map(([k, v]) => `${k}: ${v}`).join('\n');

  // Comparison heading may be "Comparison vs Exa" etc — find any key starting with "Comparison"
  const comparisonKey = Object.keys(allSections).find(k => k.startsWith('Comparison'));

  // Battle lane is format-drifty: subagents emit `## Battle Card`, `# Battle Card: X vs Y`
  // (h1 — not picked up by parseSections), or skip the wrapper and lead with `## Landmines`.
  // Treat the ENTIRE battle partial body as the Battle Card section regardless of heading style,
  // so sales enablement content always lands in the merged file.
  let battleCardBody = '';
  if (lanes.battle && lanes.battle.body) {
    const body = lanes.battle.body.trim();
    // Strip the FIRST heading line if it mentions "Battle Card" — handles h1/h2/h3 and any
    // suffix (e.g. `## Battle Card — Serper`, `# Battle Card: Tavily`). Otherwise the
    // canonical `## Battle Card` wrapper added below produces duplicate headings.
    battleCardBody = body.replace(/^#{1,3}\s+Battle\s*Card\b[^\n]*\n+/m, '').trim();
  }

  // Render "## Heading\nbody\n", or '' when the body is empty so the section is omitted.
  const section = (heading, body) => (body ? `## ${heading}\n${body}\n` : '');

  // Empty sections are filtered out; the frontmatter header is assembled separately
  // so that filtering cannot remove the blank line that follows the closing `---`.
  const sections = [
    section('Product', first('Product')),
    section('Pricing', first('Pricing')),
    section('Features', first('Features')),
    section('Positioning', first('Positioning')),
    comparisonKey && allSections[comparisonKey].length ? `## ${comparisonKey}\n${allSections[comparisonKey][0]}\n` : '',
    section('Battle Card', battleCardBody),
    section('Mentions', dedupedMentions.join('\n')),
    section('Benchmarks', dedupedBench.join('\n')),
    section('Research Findings', dedupedFindings.join('\n')),
  ].filter(Boolean);
  const header = ['---', ...(fmLines ? [fmLines] : []), '---', ''];
  const out = [...header, ...sections].join('\n');

  writeFileSync(join(dir, `${slug}.md`), out);
  merged += 1;
  console.error(`[ok] ${slug}: ${dedupedMentions.length} mentions, ${dedupedBench.length} benchmarks, ${dedupedFindings.length} findings`);
}

// Machine-readable summary on stdout (progress and skips go to stderr).
console.log(JSON.stringify({ merged, competitors: bySlug.size }));
|