playbook/antigravity-awesome-skills/skills/competitor-analysis/scripts/merge_partials.mjs

292 lines
12 KiB
JavaScript

#!/usr/bin/env node
// Merges per-lane partial markdown files into one consolidated file per competitor.
//
// The per-lane subagent fan-out writes partials to: {OUTPUT_DIR}/partials/{slug}.{lane}.md
// lane ∈ { marketing, discussion, social, news, technical, battle }
//
// Each partial has its own YAML frontmatter + sections. The marketing partial owns
// the canonical frontmatter (pricing, features, etc.); other lanes contribute only
// Mentions / Benchmarks / Findings bullets. The merge:
// 1. Starts from marketing.md's frontmatter as the canonical header
// 2. Appends body sections in the canonical order (Product, Pricing, Features,
// Positioning, Comparison, Battle Card, Mentions, Benchmarks, Research Findings)
// 3. Unions all Mentions bullets across lanes, dedups by URL, sorts by date desc
// 4. Unions all Research Findings bullets across lanes
// 5. Unions all Benchmarks bullets
// 6. Writes the consolidated file to {OUTPUT_DIR}/{slug}.md
//
// Usage: node merge_partials.mjs <research-dir>
import { readdirSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
import { parseFrontmatter, parseBody, parseSections } from './md_utils.mjs';
// Lane names recognised in partial filenames ({slug}.{lane}.md). Declared before the
// usage text so the help message is generated from this list and cannot drift from it.
const LANES = ['marketing', 'discussion', 'social', 'news', 'technical', 'battle'];

// CLI handling: the only positional argument is the research directory.
// `--help` / `-h` prints usage and exits 0; a missing argument prints usage and exits 1.
const args = process.argv.slice(2);
const wantsHelp = args.includes('--help') || args.includes('-h');
if (wantsHelp || args.length === 0) {
  console.error(`Usage: node merge_partials.mjs <research-dir>
Reads {dir}/partials/{slug}.{lane}.md files and writes consolidated
{dir}/{slug}.md per competitor. Lanes: ${LANES.join(', ')}.`);
  process.exit(wantsHelp ? 0 : 1);
}

// Merged files are written to `dir`; partials are read from its `partials` subdirectory.
const dir = args[0];
const partialsDir = join(dir, 'partials');
/**
 * Collect the list items found in a section's text.
 *
 * Each line is trimmed first. Lines starting with "- " are kept verbatim (after
 * trimming); numbered items ("1. text") are rewritten to "- text". Every other
 * line is ignored.
 *
 * @param {string} sectionText - raw section body; a falsy value yields an empty list
 * @returns {string[]} bullets, all in "- ..." form, in source order
 */
function extractBullets(sectionText) {
  if (!sectionText) return [];
  return sectionText.split('\n').flatMap((rawLine) => {
    const text = rawLine.trim();
    if (text.startsWith('- ')) return [text];
    // Numbered-list entry: drop the "N." prefix and re-emit as a dash bullet.
    const numbered = text.match(/^\d+\.\s+(.*)$/);
    return numbered ? [`- ${numbered[1]}`] : [];
  });
}
// Normalize Mentions bullet lines to the canonical format that `compile_report.mjs`
// parses: `- **[SourceType]** Title | Snippet (source: URL, YYYY-MM-DD)`.
//
// Lane subagents deviate in practice — we've observed at least these shapes:
//   canonical:        `- **[SourceType]** Title | Snippet (source: URL, YYYY-MM-DD)`
//   discussion-style: `- **HN** — [Title](url) — snippet`
//   news-style:       `- **2025-08-06** — [News] Outlet — "title" — url`
// Rather than fighting prompt drift, normalize at merge time so downstream stays clean.
/**
 * Rewrite one Mentions bullet into canonical form.
 *
 * Matching is attempted in this order: already canonical (returned as-is),
 * discussion-style, news-style, then a generic fallback that needs a leading
 * `**tag**` and a URL somewhere on the line. A line that matches nothing is
 * returned unchanged so no data is lost.
 *
 * @param {string} line - a single bullet line beginning with "- "
 * @returns {string} the canonical bullet, or `line` unchanged if it cannot be parsed
 */
function normalizeMentionBullet(line) {
  // Already canonical — nothing to do.
  if (/^-\s*\*\*\[\w+\]\*\*/.test(line)) return line;

  // First URL on the line, with trailing punctuation (e.g. the `)` of a markdown link) removed.
  const urlMatch = line.match(/https?:\/\/\S+/);
  const url = urlMatch ? urlMatch[0].replace(/[).,\]\s]+$/, '') : '';
  // First ISO date anywhere on the line.
  const dateMatch = line.match(/\b(\d{4}-\d{2}-\d{2})\b/);
  const date = dateMatch ? dateMatch[1] : '';

  // Discussion style — `- **SourceType** — [Title](url) — snippet`.
  // **SourceType** is bold but without the brackets we want in canonical form.
  let m = line.match(/^-\s*\*\*([^*]+)\*\*\s*[—\-]\s*\[([^\]]+)\]\(([^)]+)\)\s*(?:[—\-]\s*(.*))?$/);
  if (m) {
    const [, rawType, title, linkUrl, snippet] = m;
    const sourceType = rawType.trim().replace(/^\[|\]$/g, '');
    const snippetStr = snippet && snippet.trim() ? ` | ${snippet.trim()}` : '';
    const dateStr = date ? `, ${date}` : '';
    return `- **[${sourceType}]** ${title.trim()}${snippetStr} (source: ${linkUrl}${dateStr})`;
  }

  // News style — `- **YYYY-MM-DD** — [SourceType] Outlet — "title" — url`.
  // The outlet name is emitted in the snippet position.
  m = line.match(/^-\s*\*\*(\d{4}-\d{2}-\d{2})\*\*\s*[—\-]\s*\[(\w+)\]\s+([^—]+?)\s*[—\-]\s*"?([^"]+?)"?\s*(?:[—\-]\s*(\S+))?\s*$/);
  if (m) {
    const [, dateStr, sourceType, outlet, title, trailingUrl] = m;
    // Prefer the trailing token when it is a URL; otherwise use the first URL on the line.
    const finalUrl = trailingUrl && trailingUrl.startsWith('http') ? trailingUrl : url;
    const snippet = outlet.trim();
    return `- **[${sourceType}]** ${title.trim()}${snippet ? ` | ${snippet}` : ''} (source: ${finalUrl || ''}, ${dateStr})`;
  }

  // Generic fallback — any leading `**X**` tag plus a URL somewhere on the line.
  m = line.match(/^-\s*\*\*([^*]+)\*\*\s*(.*)/);
  if (m && url) {
    const rawType = m[1].trim().replace(/^\[|\]$/g, '');
    let sourceType = rawType;
    let rest = m[2];
    // If the leading token is a date, try to pull a later [Type] tag off the rest.
    if (/^\d{4}-\d{2}-\d{2}$/.test(rawType)) {
      const innerType = rest.match(/\[(\w+)\](\()?/);
      if (innerType) {
        sourceType = innerType[1];
        // A bare `[Type]` tag has been consumed as the source type, so remove it from
        // the text; otherwise it would also be picked up as the title below and the
        // real headline would be dropped. A `[text](url)` link is left in place.
        if (!innerType[2]) rest = rest.replace(innerType[0], '');
      }
    }
    // Title preference: markdown link text, then any bracketed text, then the free text
    // with the URL, dashes/quotes and surrounding punctuation removed (capped at 100 chars).
    const linkTextM = rest.match(/\[([^\]]+)\]\(/) || rest.match(/\[([^\]]+)\]/);
    const title = linkTextM
      ? linkTextM[1]
      : rest.replace(url, '').replace(/[—"]+/g, '').replace(/^\W+|\W+$/g, '').slice(0, 100);
    const dateStr = date ? `, ${date}` : '';
    return `- **[${sourceType}]** ${title.trim()} (source: ${url}${dateStr})`;
  }

  // Last resort — leave line untouched (preserves data even if un-parseable).
  return line;
}
/**
 * Pull the URL out of a canonical bullet's `(source: URL, DATE)` clause.
 *
 * The capture runs from after `(source:` up to the first comma or closing
 * parenthesis, so a URL that itself contains either character is cut short there.
 *
 * @param {string} bullet - a bullet line
 * @returns {string|null} the trimmed capture (possibly ''), or null when no
 *   `(source: ...` clause is present
 */
function urlOf(bullet) {
  const found = bullet.match(/\(source:\s*([^,)]+)/);
  if (!found) return null;
  const [, captured] = found;
  return captured.trim();
}
/**
 * Pull the YYYY-MM-DD date out of a canonical bullet's `(source: URL, DATE)` clause.
 *
 * Only a date that follows the first comma inside the source clause is recognised;
 * dates elsewhere on the line are ignored.
 *
 * @param {string} bullet - a bullet line
 * @returns {string} the date string, or '' when the clause carries no date
 */
function dateOf(bullet) {
  const found = bullet.match(/\(source:\s*[^,)]+,\s*(\d{4}-\d{2}-\d{2})/);
  if (!found) return '';
  return found[1];
}
// List the partials directory. If it cannot be read there is nothing to merge,
// which is reported on stderr and treated as a successful no-op (exit 0).
let files;
try {
  files = readdirSync(partialsDir);
} catch {
  console.error(`No partials directory at ${partialsDir} — nothing to merge.`);
  process.exit(0);
}

// Group partials by slug: slug -> { [lane]: { fm, body } }.
// Filenames must look like `{slug}.{lane}.md` with a lowercase-letter lane that is
// listed in LANES; anything else in the directory is skipped.
const bySlug = new Map();
for (const fileName of files) {
  if (!fileName.endsWith('.md')) continue;
  const nameParts = fileName.match(/^(.+)\.([a-z]+)\.md$/);
  if (!nameParts) continue;
  const [, slug, lane] = nameParts;
  if (!LANES.includes(lane)) continue;

  let laneMap = bySlug.get(slug);
  if (laneMap === undefined) {
    laneMap = {};
    bySlug.set(slug, laneMap);
  }
  const text = readFileSync(join(partialsDir, fileName), 'utf-8');
  laneMap[lane] = { fm: parseFrontmatter(text), body: parseBody(text) };
}
// Frontmatter keys carried into the merged file — whitelist canonical fields only.
// Non-marketing lane subagents sometimes leak ad-hoc meta fields (notes, searches_run,
// lane, etc.) into their partial's frontmatter; those are debug/summary fields, not
// canonical data, and are dropped. The order here is the order keys are written.
const CANONICAL_FIELDS = [
  'competitor_name', 'website', 'pricing_url',
  'tagline', 'positioning', 'product_description', 'target_customer',
  'pricing_model', 'pricing_tiers', 'key_features', 'integrations',
  'headquarters', 'founded', 'employee_estimate', 'funding_info',
  'strategic_diff',
];

// Subagents drift on canonical field names too. Common aliases observed in real runs:
// `competitor` → `competitor_name` (browsaur marketing subagent), `homepage` → `website`,
// `price_tiers` → `pricing_tiers`. Accept aliases silently.
//
// NOTE: a bare `pricing` key is mapped to `pricing_model`, NOT `pricing_tiers`. In practice
// subagents use `pricing` for a pricing *model* or prose summary ("usage-based", "$0.005/req")
// far more often than for an enumerated tier list, so routing it to `pricing_tiers` corrupted
// the structured tier data the overview/matrix render from. Use `price_tiers`/`pricing_tiers`
// explicitly for tiers.
const FIELD_ALIASES = {
  'competitor': 'competitor_name',
  'name': 'competitor_name',
  'company': 'competitor_name',
  'homepage': 'website',
  'url': 'website',
  'price_tiers': 'pricing_tiers',
  'pricing': 'pricing_model',
};

/**
 * Read a canonical field from a frontmatter object, accepting known aliases.
 *
 * The canonical key wins when it holds a truthy value; otherwise the first alias
 * (in FIELD_ALIASES order) with a truthy value is used.
 *
 * @param {object} fm - parsed frontmatter of one partial
 * @param {string} key - canonical field name
 * @returns {*} the value found, or undefined when neither key nor aliases are set
 */
function canonicalValue(fm, key) {
  if (fm[key]) return fm[key];
  for (const [alias, canonical] of Object.entries(FIELD_ALIASES)) {
    if (canonical === key && fm[alias]) return fm[alias];
  }
  return undefined;
}

// Merge each competitor's partials into {dir}/{slug}.md. Progress goes to stderr;
// stdout carries only the final JSON summary.
let merged = 0;
for (const [slug, lanes] of bySlug.entries()) {
  // The marketing partial supplies the canonical frontmatter; without it the slug is skipped.
  const marketing = lanes.marketing;
  if (!marketing || !marketing.fm) {
    console.error(`[skip] ${slug}: no marketing partial — cannot form canonical frontmatter`);
    continue;
  }

  // Union body sections: section name -> list of section texts, one per lane that has
  // the section, in LANES order (so marketing's copy comes first when it exists).
  // Assumes parseSections returns an object keyed by section heading — confirm in md_utils.mjs.
  const allSections = {};
  for (const lane of LANES) {
    if (!lanes[lane]) continue;
    const secs = parseSections(lanes[lane].body);
    for (const [k, v] of Object.entries(secs)) {
      if (!allSections[k]) allSections[k] = [];
      allSections[k].push(v);
    }
  }

  // Normalize → dedup Mentions by URL, sort by date desc
  const rawBullets = (allSections['Mentions'] || []).flatMap(s => extractBullets(s));
  const mentionBullets = rawBullets.map(normalizeMentionBullet);
  const seenUrls = new Set();
  const dedupedMentions = [];
  for (const b of mentionBullets) {
    const u = urlOf(b);
    const key = u || b; // fallback to bullet text if no URL
    if (seenUrls.has(key)) continue;
    seenUrls.add(key);
    dedupedMentions.push(b);
  }
  // Dated bullets first (newest to oldest); undated bullets keep their relative order after them.
  dedupedMentions.sort((a, b) => {
    const da = dateOf(a), db = dateOf(b);
    if (da && db) return db.localeCompare(da);
    if (da) return -1;
    if (db) return 1;
    return 0;
  });

  // Dedup Benchmarks by URL
  const benchmarkBullets = (allSections['Benchmarks'] || []).flatMap(s => extractBullets(s));
  const seenBench = new Set();
  const dedupedBench = [];
  for (const b of benchmarkBullets) {
    const m = b.match(/https?:\/\/\S+/);
    // `\S+` also swallows punctuation that follows the URL (`)`, `,`, `.`, `]`), so the
    // same link written as `(https://x)` and `https://x,` used to produce two keys.
    // Strip it the same way normalizeMentionBullet cleans its URL.
    const key = m ? m[0].replace(/[).,\]\s]+$/, '') : b;
    if (seenBench.has(key)) continue;
    seenBench.add(key);
    dedupedBench.push(b);
  }

  // Dedup Findings loosely (by exact text)
  const findingBullets = (allSections['Research Findings'] || []).flatMap(s => extractBullets(s));
  const dedupedFindings = [...new Set(findingBullets)];

  // Merge/prefer marketing for Product/Pricing/Features/Positioning/Comparison:
  // take the first lane's copy of a section, or '' when no lane has it.
  const first = (key) => {
    const arr = allSections[key] || [];
    return arr.length ? arr[0] : '';
  };

  // Rebuild frontmatter from the marketing partial, canonical fields only.
  const mergedFm = {};
  for (const k of CANONICAL_FIELDS) {
    const v = canonicalValue(marketing.fm, k);
    if (v) mergedFm[k] = v;
  }
  // Other lanes may fill in canonical gaps (e.g. funding_info from news, strategic_diff from technical).
  for (const lane of LANES) {
    if (lane === 'marketing' || !lanes[lane] || !lanes[lane].fm) continue;
    for (const k of CANONICAL_FIELDS) {
      if (!mergedFm[k]) {
        const v = canonicalValue(lanes[lane].fm, k);
        if (v) mergedFm[k] = v;
      }
    }
  }
  // Values are interpolated as-is; this assumes parseFrontmatter yields values that
  // stringify to valid frontmatter text — TODO confirm against md_utils.mjs.
  const fmLines = Object.entries(mergedFm).map(([k, v]) => `${k}: ${v}`).join('\n');

  // Comparison heading may be "Comparison vs Exa" etc — find any key starting with "Comparison"
  const comparisonKey = Object.keys(allSections).find(k => k.startsWith('Comparison'));

  // Battle lane is format-drifty: subagents emit `## Battle Card`, `# Battle Card: X vs Y`
  // (h1 — not picked up by parseSections), or skip the wrapper and lead with `## Landmines`.
  // Treat the ENTIRE battle partial body as the Battle Card section regardless of heading style,
  // so sales enablement content always lands in the merged file.
  let battleCardBody = '';
  if (lanes.battle && lanes.battle.body) {
    const body = lanes.battle.body.trim();
    // Strip a heading line that mentions "Battle Card" — handles h1/h2/h3 and any
    // suffix (e.g. `## Battle Card — Serper`, `# Battle Card: Tavily`). Otherwise the
    // canonical `## Battle Card` wrapper added below produces duplicate headings.
    // With the `m` flag `^` matches at any line start, so the first such heading is
    // removed wherever it appears in the body, and only that one (no `g` flag).
    battleCardBody = body.replace(/^#{1,3}\s+Battle\s*Card\b[^\n]*\n+/m, '').trim();
  }

  // Assemble the file. Empty sections collapse to '' and are removed by filter(Boolean).
  // NOTE(review): filter(Boolean) also removes the '' spacer after the closing `---`
  // (and an empty fmLines), so no blank line follows the frontmatter. Left as-is to keep
  // the output byte-compatible; confirm whether the blank line was intended.
  const out = [
    '---',
    fmLines,
    '---',
    '',
    first('Product') ? `## Product\n${first('Product')}\n` : '',
    first('Pricing') ? `## Pricing\n${first('Pricing')}\n` : '',
    first('Features') ? `## Features\n${first('Features')}\n` : '',
    first('Positioning') ? `## Positioning\n${first('Positioning')}\n` : '',
    comparisonKey && allSections[comparisonKey].length ? `## ${comparisonKey}\n${allSections[comparisonKey][0]}\n` : '',
    battleCardBody ? `## Battle Card\n${battleCardBody}\n` : '',
    dedupedMentions.length ? `## Mentions\n${dedupedMentions.join('\n')}\n` : '',
    dedupedBench.length ? `## Benchmarks\n${dedupedBench.join('\n')}\n` : '',
    dedupedFindings.length ? `## Research Findings\n${dedupedFindings.join('\n')}\n` : '',
  ].filter(Boolean).join('\n');

  writeFileSync(join(dir, `${slug}.md`), out);
  merged += 1;
  console.error(`[ok] ${slug}: ${dedupedMentions.length} mentions, ${dedupedBench.length} benchmarks, ${dedupedFindings.length} findings`);
}

// Machine-readable summary: files written vs. slugs discovered (skipped slugs are counted
// in `competitors` but not in `merged`).
console.log(JSON.stringify({ merged, competitors: bySlug.size }));