#!/usr/bin/env node

// Deduplicates discovery URLs from `browse cloud search` JSON output files.
// Usage: node list_urls.mjs /tmp [--prefix competitor]
// Reads all {prefix}_discovery_batch_*.json files, deduplicates by domain,
// outputs one URL per line to stdout, stats to stderr.
import { readdirSync, readFileSync } from 'fs';
|
|
import { join } from 'path';
|
|
|
|
// ---------------------------------------------------------------------------
// Command-line handling.
//
//   <directory>        required positional: where the batch files live
//   --prefix <prefix>  optional: batch file prefix (default "competitor")
//   --help, -h         print usage and exit 0
//
// Usage goes to stderr because stdout is reserved for the URL list.
// ---------------------------------------------------------------------------
const args = process.argv.slice(2);

const USAGE = `Usage: node list_urls.mjs <directory> [--prefix <prefix>]

Reads all <prefix>_discovery_batch_*.json files from <directory>,
deduplicates URLs by domain, and outputs one URL per line to stdout.

Options:
  --prefix <prefix>   Batch file prefix (default: "competitor")
  --help, -h          Show this help message

Examples:
  node list_urls.mjs /tmp
  node list_urls.mjs /tmp --prefix competitor`;

const KNOWN_FLAGS = ['--prefix', '--help', '-h'];
const wantsHelp = args.includes('--help') || args.includes('-h');

const prefixIdx = args.indexOf('--prefix');
// Index of the argument consumed as the --prefix value, or -1 when the flag is absent.
const prefixValueIdx = prefixIdx === -1 ? -1 : prefixIdx + 1;
// A missing or empty value after --prefix falls back to the default.
const prefix = prefixValueIdx !== -1 && args[prefixValueIdx] ? args[prefixValueIdx] : 'competitor';

// The directory is the first argument that is neither a known flag nor the
// value belonging to --prefix, so both `/tmp --prefix foo` and
// `--prefix foo /tmp` resolve to `/tmp` instead of treating a flag as a path.
const dir = args.find((arg, i) => i !== prefixValueIdx && !KNOWN_FLAGS.includes(arg));

if (wantsHelp || dir === undefined) {
  console.error(USAGE);
  // Explicit help is a success; a missing directory is a usage error.
  process.exit(wantsHelp ? 0 : 1);
}
// The prefix arrives straight from the command line, so every regex
// metacharacter in it is backslash-escaped before it is spliced into the
// filename matcher: a prefix such as "comp.+" must match that literal text only.
const REGEX_METACHARACTERS = /[.*+?^${}()|[\]\\]/g;
const escapedPrefix = prefix.replace(REGEX_METACHARACTERS, '\\$&');

// Matches whole filenames of the form <prefix>_discovery_batch_<anything>.json.
const pattern = new RegExp(`^${escapedPrefix}_discovery_batch_.*\\.json$`);
// Collect the batch files to process: every directory entry whose name matches
// `pattern`, in sorted order. An unreadable directory or an empty match list
// is fatal (exit code 1, message on stderr).
let files;
try {
  const entries = readdirSync(dir);
  const batchFiles = entries.filter((name) => pattern.test(name));
  batchFiles.sort();
  files = batchFiles;
} catch (err) {
  console.error(`Error reading directory ${dir}: ${err.message}`);
  process.exit(1);
}

if (!files.length) {
  console.error(`No ${prefix}_discovery_batch_*.json files found in ${dir}`);
  process.exit(1);
}
// Dedup by hostname, but prefer the site root over a deep link. The first search hit for a
// domain is often a blog/doc/comparison path; gating + enrichment want the homepage, so when
// multiple URLs share a host we keep the shallowest path (fewest segments). First-seen host
// order is preserved (Map.set on an existing key keeps its position).
const byDomain = new Map(); // hostname (leading "www." stripped) -> { url, depth }
let totalResults = 0;

for (const file of files) {
  // Only reading and JSON-decoding are guarded here, so the warning below is
  // reserved for files that genuinely cannot be read or parsed.
  let data;
  try {
    data = JSON.parse(readFileSync(join(dir, file), 'utf-8'));
  } catch (err) {
    console.error(`Warning: Failed to parse ${file}: ${err.message}`);
    continue;
  }

  // A batch is either a bare array of results or an object carrying the array
  // under `results`. Anything else (null, a scalar, a non-array `results`)
  // contributes no results rather than throwing or corrupting the count.
  const results = Array.isArray(data) ? data : data?.results;
  if (!Array.isArray(results)) continue;
  totalResults += results.length;

  for (const result of results) {
    // Tolerate null / non-object entries: one malformed element must not
    // discard the remaining results of the same file.
    const url = result?.url;
    if (typeof url !== 'string' || url === '') continue;

    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      // Skip invalid URLs
      continue;
    }

    const hostname = parsed.hostname.replace(/^www\./, '');
    // Number of non-empty path segments; "/" and "" both count as depth 0.
    const depth = parsed.pathname.split('/').filter(Boolean).length;
    const existing = byDomain.get(hostname);
    // Strict "<" keeps the first-seen URL when two candidates tie on depth.
    if (!existing || depth < existing.depth) {
      byDomain.set(hostname, { url, depth });
    }
  }
}
// Emit the surviving URL for each hostname, one per line on stdout, in the
// Map's insertion order; the summary line goes to stderr so it never mixes
// with the URL list when stdout is piped.
const urls = Array.from(byDomain.values(), (entry) => entry.url);
urls.forEach((uniqueUrl) => {
  console.log(uniqueUrl);
});

console.error(`\n${files.length} files, ${totalResults} total results, ${urls.length} unique domains`);