playbook/antigravity-awesome-skills/skills/competitor-analysis/scripts/extract_vs_names.mjs

#!/usr/bin/env node

// Parses "X vs Y" patterns from `browse cloud search` result titles across discovery batch files.
// Produces a ranked list of candidate competitor names, with an example title each,
// and attempts to resolve each name to a domain from the result URL pool.
//
// Usage: node extract_vs_names.mjs <directory> [--prefix competitor] [--seed "Exa,Tavily,SerpAPI"]
//
// Output: newline-delimited JSON to stdout, one object per candidate:
//   { "name": "serper", "hits": 3, "domain": "serper.dev", "example": "Tavily vs Serper..." }

import { readdirSync, readFileSync } from 'fs';
import { join } from 'path';

const args = process.argv.slice(2);

if (args.includes('--help') || args.includes('-h') || args.length === 0) {
  console.error(`Usage: node extract_vs_names.mjs <directory> [--prefix <prefix>] [--seed "<csv>"]

Reads all <prefix>_discovery_batch_*.json files, parses "X vs Y" patterns from result
titles, and outputs a ranked list of candidate competitor names as newline-delimited JSON.

Options:
  --prefix <prefix>   Batch file prefix (default: "competitor")
  --seed "<csv>"      Comma-separated list of seed names to exclude from output
                      (you already know these; want the OTHER side of the comparison)
  --help, -h          Show this help message`);
  process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1);
}

const dir = args[0];
const prefixIdx = args.indexOf('--prefix');
const prefix = prefixIdx !== -1 && args[prefixIdx + 1] ? args[prefixIdx + 1] : 'competitor';
const seedIdx = args.indexOf('--seed');
const seeds = seedIdx !== -1 && args[seedIdx + 1]
  ? args[seedIdx + 1].split(',').map(s => s.trim().toLowerCase()).filter(Boolean)
  : [];
const seedSet = new Set(seeds);

// Escape regex metacharacters in the user-supplied prefix so a value like
// "comp.+" matches the literal filename, not as a regex pattern.
const escapedPrefix = prefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const pattern = new RegExp(`^${escapedPrefix}_discovery_batch_.*\\.json$`);

let files;
try {
  files = readdirSync(dir).filter(f => pattern.test(f)).sort();
} catch (err) {
  console.error(`Error reading directory ${dir}: ${err.message}`);
  process.exit(1);
}

if (files.length === 0) {
  console.error(`No ${prefix}_discovery_batch_*.json files found in ${dir}`);
  process.exit(1);
}

const allResults = [];
for (const f of files) {
  try {
    const d = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
    const rs = Array.isArray(d) ? d : d.results || [];
    allResults.push(...rs);
  } catch {}
}

// Build a lookup of hostname -> candidate root domain from all result URLs.
// Used later to try to resolve "serper" -> "serper.dev".
// Exclude any host whose root-base equals a seed name — otherwise a short extracted token
// like "exa" can match the user's own domain (exa.ai).
const hostMap = new Map();
for (const r of allResults) {
  if (!r.url) continue;
  try {
    const h = new URL(r.url).hostname.replace(/^www\./, '');
    const root = h.split('.').slice(-2).join('.');
    const rootBase = root.split('.')[0];
    if (seedSet.has(rootBase)) continue;
    if (!hostMap.has(root)) hostMap.set(root, h);
  } catch {}
}

// Extract names from "X vs Y" patterns.
const counts = new Map();
for (const r of allResults) {
  const title = (r.title || '').toLowerCase();
  const ms = [...title.matchAll(/\b([a-z][\w.\-]{2,})\s+(?:vs\.?|versus)\s+([a-z][\w.\-]{2,})/g)];
  for (const m of ms) {
    for (const raw of [m[1], m[2]]) {
      const name = raw.replace(/[^a-z0-9.\-]/g, '').trim();
      if (!name || name.length < 3) continue;
      if (seedSet.has(name)) continue;
      // Reject obvious non-product tokens
      if (['the', 'and', 'for', 'with', 'best', 'top', 'better', 'using', 'choosing'].includes(name)) continue;
      if (!counts.has(name)) counts.set(name, { name, hits: 0, example: r.title });
      counts.get(name).hits += 1;
    }
  }
}

// Try to resolve each name to a domain.
// Strategy:
//   1. Exact match on rootBase wins outright.
//   2. Otherwise allow rootBase.startsWith(needle) ONLY when the suffix is a known
//      branding token (e.g. "serp" → "serpapi.com"). Bidirectional startsWith
//      was too loose: "serp" matched serpstack.com, "exa" matched example.com.
//   3. Among multiple suffix matches, prefer the shortest suffix (most specific —
//      "serp" should match "serpapi" before "serpapilabs"). Deterministic.
const BRAND_SUFFIXES = ['api','search','app','ai','io','hq','co','dev','tech','cloud','agent','agents','labs','lab'];

function resolveDomain(name) {
  const needle = name.replace(/\./g, '');
  let exact = null;
  let bestSuffix = null; // { host, suffixLen }
  for (const [root, host] of hostMap.entries()) {
    const rootBase = root.split('.')[0];
    if (rootBase === needle) { exact = host; break; }
    if (rootBase.length > needle.length && rootBase.startsWith(needle)) {
      const suffix = rootBase.slice(needle.length).replace(/^[\-_]/, '');
      if (BRAND_SUFFIXES.includes(suffix)) {
        if (!bestSuffix || suffix.length < bestSuffix.suffixLen) {
          bestSuffix = { host, suffixLen: suffix.length };
        }
      }
    }
  }
  if (exact) return exact;
  if (bestSuffix) return bestSuffix.host;
  return null;
}

const ranked = [...counts.values()]
  .map(c => ({ ...c, domain: resolveDomain(c.name) }))
  .sort((a, b) => b.hits - a.hits);

for (const c of ranked) {
  console.log(JSON.stringify(c));
}

console.error(`Extracted ${ranked.length} candidate names from ${files.length} batch files`);