225 lines
9.7 KiB
JavaScript
225 lines
9.7 KiB
JavaScript
#!/usr/bin/env node
|
||
|
||
// Category-fit gate. For each candidate URL, fetch the homepage hero via `browse cloud fetch`,
|
||
// extract visible text, and decide whether the candidate is in the same category as
|
||
// the user's company based on include/exclude keyword rules.
|
||
//
|
||
// Usage:
|
||
// cat urls.txt | node gate_candidates.mjs \
|
||
// --include "web search api,neural search,retrieval api,semantic search,search for agents" \
|
||
// --exclude "vector database,observability,analytics,enterprise search appliance,site search widget" \
|
||
// --concurrency 6
|
||
//
|
||
// Output: newline-delimited JSON to stdout with one object per URL:
|
||
// { "url": "https://foo.com", "status": "PASS" | "REJECT" | "UNKNOWN",
|
||
// "matched_includes": [...], "matched_excludes": [...], "title": "...", "hero": "..." }
|
||
|
||
import { execFile } from 'child_process';
|
||
import { promisify } from 'util';
|
||
import { readFileSync } from 'fs';
|
||
|
||
// Async execFile so the worker pool actually parallelizes. spawnSync blocks the entire
|
||
// event loop, which silently turns --concurrency N into N=1 — every URL fetched serially
|
||
// regardless of the flag. With promisified execFile, N workers can wait on N pending
|
||
// `browse cloud fetch` processes concurrently.
|
||
const execFileAsync = promisify(execFile);
|
||
|
||
const args = process.argv.slice(2);
|
||
|
||
if (args.includes('--help') || args.includes('-h')) {
|
||
console.error(`Usage: cat urls.txt | node gate_candidates.mjs [options]
|
||
|
||
Reads URLs from stdin (one per line) OR from --input <file>. For each URL, fetches
|
||
the homepage via \`browse cloud fetch --allow-redirects\`, extracts the first N chars of visible
|
||
text (the hero / tagline area), and classifies against include/exclude keyword rules.
|
||
|
||
Options:
|
||
--include "<csv>" Required. Comma-separated keywords; candidate PASSES if any match.
|
||
--exclude "<csv>" Comma-separated keywords; candidate REJECTS if any match.
|
||
--input <file> Read URLs from file instead of stdin.
|
||
--concurrency <n> Max parallel fetches (default: 6).
|
||
--hero-chars <n> Chars of visible text to examine (default: 800).
|
||
--help, -h Show this help message.`);
|
||
process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1);
|
||
}
|
||
|
||
function flag(name) {
|
||
const i = args.indexOf(name);
|
||
return i !== -1 ? args[i + 1] : null;
|
||
}
|
||
|
||
const includes = (flag('--include') || '').split(',').map(s => s.trim().toLowerCase()).filter(Boolean);
|
||
const excludes = (flag('--exclude') || '').split(',').map(s => s.trim().toLowerCase()).filter(Boolean);
|
||
// Floor at 1: `--concurrency 0` or a non-numeric value makes parseInt yield 0/NaN, which would
|
||
// spawn zero workers — the script would exit "successfully" having gated nothing, making
|
||
// discovery look empty with no error. Always run at least one worker.
|
||
const concurrency = Math.max(1, parseInt(flag('--concurrency') || '6', 10) || 0);
|
||
const heroChars = parseInt(flag('--hero-chars') || '800', 10);
|
||
const inputFile = flag('--input');
|
||
|
||
function stripHtml(html) {
|
||
const withoutActiveContent = removeElementContent(removeElementContent(html, 'script'), 'style');
|
||
return withoutActiveContent
|
||
.replace(/<[^>]*>/g, ' ')
|
||
.replace(/</g, '<')
|
||
.replace(/>/g, '>')
|
||
.replace(/"/g, '"')
|
||
.replace(/'/g, "'")
|
||
.replace(/ /g, ' ')
|
||
.replace(/&/g, '&')
|
||
.replace(/\s+/g, ' ')
|
||
.trim();
|
||
}
|
||
|
||
function removeElementContent(html, tagName) {
|
||
let out = '';
|
||
let cursor = 0;
|
||
const lower = html.toLowerCase();
|
||
const openNeedle = `<${tagName}`;
|
||
const closeNeedle = `</${tagName}`;
|
||
while (cursor < html.length) {
|
||
const start = lower.indexOf(openNeedle, cursor);
|
||
if (start === -1) {
|
||
out += html.slice(cursor);
|
||
break;
|
||
}
|
||
out += html.slice(cursor, start);
|
||
const close = lower.indexOf(closeNeedle, start + openNeedle.length);
|
||
if (close === -1) {
|
||
cursor = html.length;
|
||
break;
|
||
}
|
||
const closeEnd = html.indexOf('>', close + closeNeedle.length);
|
||
cursor = closeEnd === -1 ? html.length : closeEnd + 1;
|
||
out += ' ';
|
||
}
|
||
return out;
|
||
}
|
||
|
||
if (args.includes('--self-test')) {
|
||
console.assert(stripHtml('<p>A&lt;B</p>') === 'A<B');
|
||
console.assert(stripHtml('<script>alert(1)</script ignored><p>ok</p>') === 'ok');
|
||
process.exit(0);
|
||
}
|
||
|
||
if (includes.length === 0) {
|
||
console.error('Error: --include is required');
|
||
process.exit(1);
|
||
}
|
||
|
||
let urls;
|
||
if (inputFile) {
|
||
urls = readFileSync(inputFile, 'utf-8').split('\n').map(l => l.trim()).filter(Boolean);
|
||
} else {
|
||
const stdin = readFileSync(0, 'utf-8');
|
||
urls = stdin.split('\n').map(l => l.trim()).filter(Boolean);
|
||
}
|
||
|
||
if (urls.length === 0) {
|
||
console.error('Error: no URLs provided (pipe via stdin or use --input)');
|
||
process.exit(1);
|
||
}
|
||
|
||
// Position-aware classification:
|
||
// 1. Exclude term in <title> → REJECT (their primary identity is the excluded category)
|
||
// 2. Include term in <title> → PASS (their primary identity matches)
|
||
// 3. Include in early hero (200ch) → PASS iff no exclude in early hero
|
||
// 4. Otherwise → REJECT (default conservative)
|
||
// Rationale: <title> is the single strongest signal of what a company sells.
|
||
// Mid/late hero mentions (e.g. "we also support web scraping use cases") shouldn't
|
||
// disqualify a real competitor that self-identifies in its title as a cloud browser.
|
||
function classify(title, heroFull, includes, excludes) {
|
||
const titleLower = (title || '').toLowerCase();
|
||
const heroLower = heroFull.toLowerCase();
|
||
const heroEarly = heroLower.slice(0, 200);
|
||
|
||
const incTitle = includes.filter(k => titleLower.includes(k));
|
||
const excTitle = excludes.filter(k => titleLower.includes(k));
|
||
const incEarly = includes.filter(k => heroEarly.includes(k));
|
||
const excEarly = excludes.filter(k => heroEarly.includes(k));
|
||
const incHero = includes.filter(k => heroLower.includes(k));
|
||
const excHero = excludes.filter(k => heroLower.includes(k));
|
||
|
||
let status, reason;
|
||
if (incTitle.length > 0 && excTitle.length > 0) {
|
||
// Hybrid-identity title (e.g. "Browser Automation & Web Scraping API").
|
||
// Break the tie by the early hero — whichever category has more mentions wins.
|
||
if (incEarly.length > excEarly.length) { status = 'PASS'; reason = `title-hybrid→hero200 leans include(${incEarly[0] || incTitle[0]})`; }
|
||
else if (excEarly.length > incEarly.length) { status = 'REJECT'; reason = `title-hybrid→hero200 leans exclude(${excEarly[0] || excTitle[0]})`; }
|
||
else { status = 'PASS'; reason = `title-hybrid→tie, defaulting include(${incTitle[0]})`; }
|
||
}
|
||
else if (excTitle.length > 0) { status = 'REJECT'; reason = `title→exclude(${excTitle[0]})`; }
|
||
else if (incTitle.length > 0) { status = 'PASS'; reason = `title→include(${incTitle[0]})`; }
|
||
else if (incEarly.length > 0 && excEarly.length === 0) { status = 'PASS'; reason = `hero200→include(${incEarly[0]})`; }
|
||
else if (excEarly.length > 0) { status = 'REJECT'; reason = `hero200→exclude(${excEarly[0]})`; }
|
||
else if (incHero.length > 0 && excHero.length === 0) { status = 'PASS'; reason = `hero→include(${incHero[0]})`; }
|
||
// Late-hero conflict: both include AND exclude appear in chars 200–800 (nothing in
|
||
// title or early hero). This is genuine ambiguous signal, not absence — return UNKNOWN
|
||
// so the candidate surfaces in the user-confirmation bucket at Step 4.5 instead of
|
||
// being silently dropped as REJECT.
|
||
else if (incHero.length > 0 && excHero.length > 0) { status = 'UNKNOWN'; reason = `hero→conflict(include:${incHero[0]}, exclude:${excHero[0]})`; }
|
||
else { status = 'REJECT'; reason = 'no category signal'; }
|
||
|
||
return {
|
||
status, reason,
|
||
matched_includes: [...new Set([...incTitle, ...incEarly, ...incHero])],
|
||
matched_excludes: [...new Set([...excTitle, ...excEarly, ...excHero])],
|
||
};
|
||
}
|
||
|
||
async function gateOne(url) {
|
||
let stdout;
|
||
try {
|
||
// --format raw returns the JSON envelope with raw HTML in `.content` (the default
|
||
// is markdown, which has no <title> tag for the position-aware classifier to read).
|
||
const r = await execFileAsync('browse', ['cloud', 'fetch', '--allow-redirects', '--format', 'raw', url], {
|
||
maxBuffer: 4 * 1024 * 1024,
|
||
timeout: 20000,
|
||
});
|
||
stdout = r.stdout;
|
||
} catch (err) {
|
||
// Non-zero exit, timeout, or spawn failure all surface here.
|
||
return { url, status: 'UNKNOWN', reason: `browse cloud fetch failed: ${err.message}`, matched_includes: [], matched_excludes: [], title: '', hero: '' };
|
||
}
|
||
let resp;
|
||
try { resp = JSON.parse(stdout); } catch {
|
||
return { url, status: 'UNKNOWN', reason: 'non-JSON response', matched_includes: [], matched_excludes: [], title: '', hero: '' };
|
||
}
|
||
const html = resp.content || '';
|
||
const titleM = html.match(/<title[^>]*>([^<]*)<\/title>/i);
|
||
const title = titleM ? titleM[1].trim() : '';
|
||
const heroFull = stripHtml(html).slice(0, heroChars);
|
||
const c = classify(title, heroFull, includes, excludes);
|
||
return {
|
||
url,
|
||
status: c.status,
|
||
reason: c.reason,
|
||
matched_includes: c.matched_includes,
|
||
matched_excludes: c.matched_excludes,
|
||
title,
|
||
hero: heroFull.slice(0, 240),
|
||
};
|
||
}
|
||
|
||
// Run with bounded concurrency
|
||
const results = [];
|
||
async function runAll() {
|
||
const queue = [...urls];
|
||
const workers = Array(Math.min(concurrency, queue.length)).fill(0).map(async () => {
|
||
while (queue.length > 0) {
|
||
const u = queue.shift();
|
||
const r = await gateOne(u);
|
||
results.push(r);
|
||
console.log(JSON.stringify(r));
|
||
}
|
||
});
|
||
await Promise.all(workers);
|
||
}
|
||
|
||
await runAll();
|
||
|
||
const pass = results.filter(r => r.status === 'PASS').length;
|
||
const reject = results.filter(r => r.status === 'REJECT').length;
|
||
const unknown = results.filter(r => r.status === 'UNKNOWN').length;
|
||
console.error(`\nGate: ${pass} PASS / ${reject} REJECT / ${unknown} UNKNOWN (of ${results.length})`);
|