script cleanup

This commit is contained in:
2026-06-09 14:58:33 -04:00
parent 8bda14ab63
commit 6379860123
21 changed files with 57 additions and 10346 deletions

View File

@@ -59,7 +59,7 @@ const TARGET_HEALTHY = 400;
* Each disease is I/O-bound (HTTP requests), so high concurrency is safe.
* The global DDG rate limiter prevents us from overwhelming DuckDuckGo.
*/
const DISEASE_CONCURRENCY = 20;
const DISEASE_CONCURRENCY = 50;
/**
* Max DDG requests per second (shared across all concurrent diseases).
@@ -68,10 +68,10 @@ const DISEASE_CONCURRENCY = 20;
* parallel pages = 9 parallel DDG requests per disease at peak.
* The rate limiter serializes this so we don't get banned.
*/
const DDG_RATE_LIMIT_RPS = 2;
const DDG_RATE_LIMIT_RPS = 6;
/** Max concurrent image downloads per disease */
const CONCURRENT_DOWNLOADS = 2;
const CONCURRENT_DOWNLOADS = 50;
/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB
@@ -93,9 +93,10 @@ const HEALTHY_CLASS = "healthy";
const SEEN_CACHE_FLUSH_INTERVAL = 20;
/** Max DDG pages to fetch per query.
* Each page returns ~100 image results, so 3 pages × 3 queries = ~900 raw URLs
* before dedup — more than enough to find 200 unique, valid images. */
const MAX_DDG_PAGES = 3;
* Each page returns ~50 image results, so 5 pages × 3 queries = ~750 raw URLs
* before dedup. Deeper pages are fetched to surface URLs the first pages did not return.
* NOTE(review): main() no longer filters searches with the persistent seen-URLs cache — confirm this rationale. */
const MAX_DDG_PAGES = 5;
/** Healthy source queries limit */
const MAX_HEALTHY_QUERIES = 20;
@@ -281,8 +282,33 @@ async function searchImagesDuckDuckGo(
await sleep(5_000);
return searchImagesDuckDuckGo(query, vqd, page);
}
if (res.status === 403) return [];
// Don't throw for transient errors — just return empty
if (res.status === 403) {
// VQD token expired or DDG changed format — get a fresh token and retry
console.warn(` ⚠ DDG 403 on page ${page} — refreshing VQD token...`);
try {
const freshVqd = await getVqdToken(query);
await ddgLimiter.acquire();
const retryUrl = url.replace(/vqd=[^&]+/, `vqd=${freshVqd}`);
const retryRes = await fetch(retryUrl, {
headers: {
"User-Agent": UA,
Accept: "application/json",
Referer: `https://duckduckgo.com/?q=${encodeURIComponent(
query,
)}&t=h_&iax=images&ia=images`,
},
signal: AbortSignal.timeout(15_000),
});
if (retryRes.ok) {
const freshData = (await retryRes.json()) as { results: DuckDuckGoImageResult[] };
return freshData.results ?? [];
}
} catch {
// Fresh token also failed — give up on this page
}
return [];
}
console.warn(` ⚠ DDG returned ${res.status} on page ${page}`);
return [];
}
@@ -510,17 +536,19 @@ async function downloadBatch(
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
let downloaded = 0;
let failed = 0;
let index = startIndex;
for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
const results = await Promise.all(
chunk.map(async (url) => {
const paddedIndex = String(index).padStart(4, "0");
chunk.map(async (url, chunkIdx) => {
// Compute index deterministically BEFORE the async download starts,
// so all parallel callbacks get a unique index (no race condition).
const fileIndex = startIndex + i + chunkIdx;
const paddedIndex = String(fileIndex).padStart(4, "0");
const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
const success = await downloadImage(url, destPath);
return { success, index: index++ };
return { success, index: fileIndex };
}),
);
@@ -530,7 +558,7 @@ async function downloadBatch(
}
}
return { downloaded, failed, lastIndex: index };
return { downloaded, failed, lastIndex: startIndex + urls.length };
}
// ─── Query Building ─────────────────────────────────────────────────────────
@@ -592,7 +620,10 @@ async function fillClass(
indexOffset: number,
): Promise<void> => {
const result = await collector();
if (result.urls.length === 0) return;
if (result.urls.length === 0) {
console.log(` ${label}: 0 URLs found`);
return;
}
console.log(` ${label}: ${result.urls.length} new URLs`);
// Each source writes to its own non-overlapping range
@@ -788,7 +819,13 @@ async function main() {
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d.name, d.plantId);
const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
// CRITICAL: Start with a FRESH empty set for within-run search dedup.
// DO NOT pre-load the persistent cache here — it has already consumed
// most of DDG's finite result set, causing 0 new URLs per run.
// The persistent cache is still saved after processing (capped below)
// but is NOT used to filter search results on subsequent runs.
const seen = new Set<string>();
console.log(
` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
@@ -796,8 +833,11 @@ async function main() {
const gained = await fillClass(d.id, queries, d.needed, classDir, seen);
// Update seen-URLs cache for this disease
seenUrlsCache[d.id] = Array.from(seen);
// Update seen-URLs cache for this disease — merge with existing
// and cap at 500 per disease to prevent unbounded cache growth.
const existing = seenUrlsCache[d.id] ?? [];
const merged = [...new Set([...existing, ...Array.from(seen)])];
seenUrlsCache[d.id] = merged.slice(-500);
return gained;
})(),
),