script cleanup
This commit is contained in:
@@ -59,7 +59,7 @@ const TARGET_HEALTHY = 400;
|
||||
* Each disease is I/O-bound (HTTP requests), so high concurrency is safe.
|
||||
* The global DDG rate limiter prevents us from overwhelming DuckDuckGo.
|
||||
*/
|
||||
const DISEASE_CONCURRENCY = 20;
|
||||
const DISEASE_CONCURRENCY = 50;
|
||||
|
||||
/**
|
||||
* Max DDG requests per second (shared across all concurrent diseases).
|
||||
@@ -68,10 +68,10 @@ const DISEASE_CONCURRENCY = 20;
|
||||
* parallel pages = 9 parallel DDG requests per disease at peak.
|
||||
* The rate limiter serializes this so we don't get banned.
|
||||
*/
|
||||
const DDG_RATE_LIMIT_RPS = 2;
|
||||
const DDG_RATE_LIMIT_RPS = 6;
|
||||
|
||||
/** Max concurrent image downloads per disease */
|
||||
const CONCURRENT_DOWNLOADS = 2;
|
||||
const CONCURRENT_DOWNLOADS = 50;
|
||||
|
||||
/** Minimum image size in bytes to accept */
|
||||
const MIN_IMAGE_SIZE = 10_000; // 10KB
|
||||
@@ -93,9 +93,10 @@ const HEALTHY_CLASS = "healthy";
|
||||
const SEEN_CACHE_FLUSH_INTERVAL = 20;
|
||||
|
||||
/** Max DDG pages to fetch per query.
|
||||
* Each page returns ~100 image results, so 3 pages × 3 queries = ~900 raw URLs
|
||||
* before dedup — more than enough to find 200 unique, valid images. */
|
||||
const MAX_DDG_PAGES = 3;
|
||||
* Each page returns ~50 image results, so 5 pages × 3 queries = ~750 raw URLs
|
||||
* before dedup. Pages beyond 3 yield progressively more novel URLs since
|
||||
* the seen-URLs cache accumulates across runs. */
|
||||
const MAX_DDG_PAGES = 5;
|
||||
|
||||
/** Healthy source queries limit */
|
||||
const MAX_HEALTHY_QUERIES = 20;
|
||||
@@ -281,8 +282,33 @@ async function searchImagesDuckDuckGo(
|
||||
await sleep(5_000);
|
||||
return searchImagesDuckDuckGo(query, vqd, page);
|
||||
}
|
||||
if (res.status === 403) return [];
|
||||
// Don't throw for transient errors — just return empty
|
||||
if (res.status === 403) {
|
||||
// VQD token expired or DDG changed format — get a fresh token and retry
|
||||
console.warn(` ⚠ DDG 403 on page ${page} — refreshing VQD token...`);
|
||||
try {
|
||||
const freshVqd = await getVqdToken(query);
|
||||
await ddgLimiter.acquire();
|
||||
const retryUrl = url.replace(/vqd=[^&]+/, `vqd=${freshVqd}`);
|
||||
const retryRes = await fetch(retryUrl, {
|
||||
headers: {
|
||||
"User-Agent": UA,
|
||||
Accept: "application/json",
|
||||
Referer: `https://duckduckgo.com/?q=${encodeURIComponent(
|
||||
query,
|
||||
)}&t=h_&iax=images&ia=images`,
|
||||
},
|
||||
signal: AbortSignal.timeout(15_000),
|
||||
});
|
||||
if (retryRes.ok) {
|
||||
const freshData = (await retryRes.json()) as { results: DuckDuckGoImageResult[] };
|
||||
return freshData.results ?? [];
|
||||
}
|
||||
} catch {
|
||||
// Fresh token also failed — give up on this page
|
||||
}
|
||||
return [];
|
||||
}
|
||||
console.warn(` ⚠ DDG returned ${res.status} on page ${page}`);
|
||||
return [];
|
||||
}
|
||||
|
||||
@@ -510,17 +536,19 @@ async function downloadBatch(
|
||||
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
|
||||
let downloaded = 0;
|
||||
let failed = 0;
|
||||
let index = startIndex;
|
||||
|
||||
for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
|
||||
const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
|
||||
|
||||
const results = await Promise.all(
|
||||
chunk.map(async (url) => {
|
||||
const paddedIndex = String(index).padStart(4, "0");
|
||||
chunk.map(async (url, chunkIdx) => {
|
||||
// Compute index deterministically BEFORE the async download starts,
|
||||
// so all parallel callbacks get a unique index (no race condition).
|
||||
const fileIndex = startIndex + i + chunkIdx;
|
||||
const paddedIndex = String(fileIndex).padStart(4, "0");
|
||||
const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
|
||||
const success = await downloadImage(url, destPath);
|
||||
return { success, index: index++ };
|
||||
return { success, index: fileIndex };
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -530,7 +558,7 @@ async function downloadBatch(
|
||||
}
|
||||
}
|
||||
|
||||
return { downloaded, failed, lastIndex: index };
|
||||
return { downloaded, failed, lastIndex: startIndex + urls.length };
|
||||
}
|
||||
|
||||
// ─── Query Building ─────────────────────────────────────────────────────────
|
||||
@@ -592,7 +620,10 @@ async function fillClass(
|
||||
indexOffset: number,
|
||||
): Promise<void> => {
|
||||
const result = await collector();
|
||||
if (result.urls.length === 0) return;
|
||||
if (result.urls.length === 0) {
|
||||
console.log(` ${label}: 0 URLs found`);
|
||||
return;
|
||||
}
|
||||
console.log(` ${label}: ${result.urls.length} new URLs`);
|
||||
|
||||
// Each source writes to its own non-overlapping range
|
||||
@@ -788,7 +819,13 @@ async function main() {
|
||||
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d.name, d.plantId);
|
||||
const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
|
||||
|
||||
// CRITICAL: Start with a FRESH empty set for within-run search dedup.
|
||||
// DO NOT pre-load the persistent cache here — it already contains
|
||||
// most of DDG's finite result set, causing 0 new URLs per run.
|
||||
// The persistent cache is still saved after processing (capped below)
|
||||
// but is NOT used to filter search results on subsequent runs.
|
||||
const seen = new Set<string>();
|
||||
|
||||
console.log(
|
||||
` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
|
||||
@@ -796,8 +833,11 @@ async function main() {
|
||||
|
||||
const gained = await fillClass(d.id, queries, d.needed, classDir, seen);
|
||||
|
||||
// Update seen-URLs cache for this disease
|
||||
seenUrlsCache[d.id] = Array.from(seen);
|
||||
// Update seen-URLs cache for this disease — merge with existing
|
||||
// and cap at 500 per disease to prevent unbounded cache growth.
|
||||
const existing = seenUrlsCache[d.id] ?? [];
|
||||
const merged = [...new Set([...existing, ...Array.from(seen)])];
|
||||
seenUrlsCache[d.id] = merged.slice(-500);
|
||||
return gained;
|
||||
})(),
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user