141 lines
5.4 KiB
JavaScript
141 lines
5.4 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
// Parses "X vs Y" patterns from `browse cloud search` result titles across discovery batch files.
|
|
// Produces a ranked list of candidate competitor names, with an example title each,
|
|
// and attempts to resolve each name to a domain from the result URL pool.
|
|
//
|
|
// Usage: node extract_vs_names.mjs <directory> [--prefix competitor] [--seed "Exa,Tavily,SerpAPI"]
|
|
//
|
|
// Output: newline-delimited JSON to stdout, one object per candidate:
|
|
//   { "name": "serper", "hits": 3, "example": "Tavily vs Serper...", "domain": "serper.dev" }
// "domain" is null when the name cannot be resolved from the result URL pool.
|
|
|
|
import { readdirSync, readFileSync } from 'fs';
|
|
import { join } from 'path';
|
|
|
|
// Command-line arguments with the node binary and script path stripped off.
const args = process.argv.slice(2);

// Usage is printed when help is requested explicitly or when no arguments were given.
// An explicit help request is a success (exit 0); a bare invocation is an error (exit 1).
const helpRequested = ['--help', '-h'].some(flag => args.includes(flag));

if (helpRequested || args.length === 0) {
  const usage = `Usage: node extract_vs_names.mjs <directory> [--prefix <prefix>] [--seed "<csv>"]

Reads all <prefix>_discovery_batch_*.json files, parses "X vs Y" patterns from result
titles, and outputs a ranked list of candidate competitor names as newline-delimited JSON.

Options:
--prefix <prefix> Batch file prefix (default: "competitor")
--seed "<csv>" Comma-separated list of seed names to exclude from output
(you already know these; want the OTHER side of the comparison)
--help, -h Show this help message`;
  // Usage goes to stderr so stdout stays reserved for the NDJSON payload.
  console.error(usage);
  process.exit(helpRequested ? 0 : 1);
}
|
|
|
|
// The directory to scan is always the first command-line argument.
const [dir] = args;

// --prefix <value>: falls back to "competitor" when the flag is absent or has no value after it.
const prefixIdx = args.indexOf('--prefix');
const prefix = (prefixIdx !== -1 && args[prefixIdx + 1]) || 'competitor';

// --seed "<csv>": names the user already knows. They are trimmed and lower-cased so they
// compare directly against the lower-cased tokens extracted from titles; empty entries
// (e.g. from a trailing comma) are dropped.
const seedIdx = args.indexOf('--seed');
const seedCsv = (seedIdx !== -1 && args[seedIdx + 1]) || '';
const seeds = seedCsv
  .split(',')
  .map(entry => entry.trim().toLowerCase())
  .filter(entry => entry.length > 0);
const seedSet = new Set(seeds);
|
|
|
|
// The prefix comes from the command line, so every regex metacharacter in it is
// backslash-escaped first: a value such as "comp.+" must match that literal text
// in a filename rather than act as a pattern.
const escapedPrefix = prefix.replace(/[.*+?^${}()|[\]\\]/g, ch => `\\${ch}`);

// Matches "<prefix>_discovery_batch_<anything>.json", anchored at both ends.
const pattern = new RegExp(['^', escapedPrefix, '_discovery_batch_.*\\.json$'].join(''));
|
|
|
|
// Collect the batch files to process, sorted by name so the processing order is stable.
// Both an unreadable directory and an empty match set are fatal (exit code 1).
let files;
try {
  const entries = readdirSync(dir);
  files = entries.filter(entry => pattern.test(entry));
  files.sort();
} catch (err) {
  console.error(`Error reading directory ${dir}: ${err.message}`);
  process.exit(1);
}

const foundAnyBatchFile = files.length > 0;
if (!foundAnyBatchFile) {
  console.error(`No ${prefix}_discovery_batch_*.json files found in ${dir}`);
  process.exit(1);
}
|
|
|
|
// Load every batch file and pool its search results into one flat list.
// A batch document is either a bare JSON array of results or an object carrying a
// `results` array. Loading is best-effort: a file that cannot be read or parsed is
// skipped, but a warning is written to stderr so the missing data is not silent.
const allResults = [];
for (const f of files) {
  try {
    const d = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
    const rs = Array.isArray(d) ? d : d?.results;

    // A document with no `results` key (or a JSON null) simply contributes nothing.
    if (rs == null) continue;

    if (!Array.isArray(rs)) {
      console.error(`Skipping ${f}: "results" is not an array`);
      continue;
    }

    // Append element by element. push(...rs) passes every element as a call argument,
    // which can exceed the engine's argument limit and throw a RangeError on very
    // large batches — dropping the entire file.
    for (const r of rs) {
      allResults.push(r);
    }
  } catch (err) {
    console.error(`Skipping ${f}: ${err.message}`);
  }
}
|
|
|
|
// Build a lookup of root domain -> first hostname seen under that root, from all result URLs.
// Used later to try to resolve "serper" -> "serper.dev".
// The root is the last two hostname labels with a leading "www." stripped. This is a
// heuristic: a host under a multi-part suffix such as "example.co.uk" yields root "co.uk".
// Exclude any host whose root-base equals a seed name — otherwise a short extracted token
// like "exa" can match the user's own domain (exa.ai).
const hostMap = new Map();
for (const r of allResults) {
  // Result entries come from external JSON; tolerate null / non-object entries
  // instead of crashing on the property access.
  const url = r?.url;
  if (!url) continue;

  let h;
  try {
    h = new URL(url).hostname.replace(/^www\./, '');
  } catch {
    // Not a parseable absolute URL. Skipped silently on purpose: search results
    // routinely contain such values and they carry no domain information.
    continue;
  }

  const root = h.split('.').slice(-2).join('.');
  const rootBase = root.split('.')[0];
  if (seedSet.has(rootBase)) continue;

  // Keep only the first hostname seen for each root.
  if (!hostMap.has(root)) hostMap.set(root, h);
}
|
|
|
|
// Extract names from "X vs Y" patterns ("vs", "vs." and "versus" are all accepted).
// Titles are matched lower-cased. Each side of a comparison counts as one hit for that
// name, and the first title a name appears in is kept (original casing) as its example.

// Tokens that often sit next to "vs" but are obviously not product names.
const NON_PRODUCT_TOKENS = new Set(['the', 'and', 'for', 'with', 'best', 'top', 'better', 'using', 'choosing']);

// A name starts with a letter and continues with word characters, dots or hyphens
// (so "next.js" and "scrape-it" survive). Safe to share: matchAll works on a clone.
const VS_PATTERN = /\b([a-z][\w.\-]{2,})\s+(?:vs\.?|versus)\s+([a-z][\w.\-]{2,})/g;

const counts = new Map();
for (const r of allResults) {
  // Skip entries without a usable title (null entries, missing or non-string titles).
  if (typeof r?.title !== 'string') continue;

  const title = r.title.toLowerCase();
  for (const m of title.matchAll(VS_PATTERN)) {
    for (const raw of [m[1], m[2]]) {
      const name = raw
        .replace(/[^a-z0-9.\-]/g, '')
        // The capture is greedy over "." and "-", so sentence punctuation sticks to the
        // token ("exa vs tavily." -> "tavily."). Strip it so the name matches seeds and
        // merges with its other occurrences.
        .replace(/[.\-]+$/, '');

      if (name.length < 3) continue;
      if (seedSet.has(name)) continue;
      // Reject obvious non-product tokens
      if (NON_PRODUCT_TOKENS.has(name)) continue;

      const entry = counts.get(name);
      if (entry) {
        entry.hits += 1;
      } else {
        counts.set(name, { name, hits: 1, example: r.title });
      }
    }
  }
}
|
|
|
|
// Try to resolve each name to a domain.
// Strategy:
//   0. A name that is itself written as a domain ("serper.dev") and is a known root
//      resolves directly to the host recorded for that root.
//   1. Exact match on rootBase wins outright.
//   2. Otherwise allow rootBase.startsWith(needle) ONLY when the suffix is a known
//      branding token (e.g. "serp" → "serpapi.com"). Bidirectional startsWith
//      was too loose: "serp" matched serpstack.com, "exa" matched example.com.
//   3. Among multiple suffix matches, prefer the shortest suffix (most specific —
//      "serp" should match "serpapi" before "serpapilabs"). Ties keep the host that
//      was inserted into hostMap first, so the result is deterministic.
const BRAND_SUFFIXES = ['api','search','app','ai','io','hq','co','dev','tech','cloud','agent','agents','labs','lab'];

/**
 * Resolve an extracted candidate name to a hostname from the result URL pool.
 *
 * Reads the module-level `hostMap` (root domain -> hostname).
 *
 * @param {string} name - Lower-cased candidate name; may contain dots or hyphens.
 * @returns {string|null} The matching hostname, or null when nothing matches.
 */
function resolveDomain(name) {
  // Dotted names may already be a root domain; stripping the dots below would turn
  // "serper.dev" into "serperdev" and miss the exact root sitting in the pool.
  if (name.includes('.') && hostMap.has(name)) return hostMap.get(name);

  // Dots are dropped so that e.g. "next.js" can match a "nextjs" root-base.
  const needle = name.replace(/\./g, '');
  // An empty needle would prefix-match every host; treat it as unresolvable.
  if (!needle) return null;

  let bestSuffix = null; // { host, suffixLen }
  for (const [root, host] of hostMap) {
    const rootBase = root.split('.')[0];
    if (rootBase === needle) return host;

    if (rootBase.length > needle.length && rootBase.startsWith(needle)) {
      // Allow one separator between the name and the branding token ("tavily-ai").
      const suffix = rootBase.slice(needle.length).replace(/^[\-_]/, '');
      const isBrandSuffix = BRAND_SUFFIXES.includes(suffix);
      if (isBrandSuffix && (!bestSuffix || suffix.length < bestSuffix.suffixLen)) {
        bestSuffix = { host, suffixLen: suffix.length };
      }
    }
  }

  return bestSuffix ? bestSuffix.host : null;
}
|
|
|
|
// Attach a resolved domain (null when unresolved) to every candidate, then order the
// list by hit count, highest first. The sort is stable, so candidates with equal hits
// stay in the order they were first extracted.
const ranked = Array.from(counts.values(), candidate => {
  const domain = resolveDomain(candidate.name);
  return { ...candidate, domain };
});
ranked.sort((left, right) => right.hits - left.hits);

// Emit newline-delimited JSON on stdout, one candidate per line.
ranked.forEach(candidate => {
  console.log(JSON.stringify(candidate));
});

// The summary goes to stderr so it never mixes with the data stream.
console.error(`Extracted ${ranked.length} candidate names from ${files.length} batch files`);
|