script cleanup

2026-06-09 14:58:33 -04:00
parent 8bda14ab63
commit 6379860123
21 changed files with 57 additions and 10346 deletions
--- a/scripts/fill-training-dataset.ts
+++ b/scripts/fill-training-dataset.ts
@@ -59,7 +59,7 @@ const TARGET_HEALTHY = 400;
 * Each disease is I/O-bound (HTTP requests), so high concurrency is safe.
 * The global DDG rate limiter prevents us from overwhelming DuckDuckGo.
 */
-const DISEASE_CONCURRENCY = 20;
+const DISEASE_CONCURRENCY = 50;

 /**
 * Max DDG requests per second (shared across all concurrent diseases).
@@ -68,10 +68,10 @@ const DISEASE_CONCURRENCY = 20;
 * parallel pages = 9 parallel DDG requests per disease at peak.
 * The rate limiter serializes this so we don't get banned.
 */
-const DDG_RATE_LIMIT_RPS = 2;
+const DDG_RATE_LIMIT_RPS = 6;

 /** Max concurrent image downloads per disease */
-const CONCURRENT_DOWNLOADS = 2;
+const CONCURRENT_DOWNLOADS = 50;

 /** Minimum image size in bytes to accept */
 const MIN_IMAGE_SIZE = 10_000; // 10KB
@@ -93,9 +93,10 @@ const HEALTHY_CLASS = "healthy";
 const SEEN_CACHE_FLUSH_INTERVAL = 20;

 /** Max DDG pages to fetch per query.
- *  Each page returns ~100 image results, so 3 pages × 3 queries = ~900 raw URLs
- *  before dedup — more than enough to find 200 unique, valid images. */
-const MAX_DDG_PAGES = 3;
+ *  Each page returns ~50 image results, so 5 pages × 3 queries = ~750 raw URLs
+ *  before dedup. Pages beyond 3 are fetched because earlier pages mostly
+ *  repeat URLs already in the seen-URLs cache from previous runs. */
+const MAX_DDG_PAGES = 5;

 /** Healthy source queries limit */
 const MAX_HEALTHY_QUERIES = 20;
@@ -281,8 +282,33 @@ async function searchImagesDuckDuckGo(
      await sleep(5_000);
      return searchImagesDuckDuckGo(query, vqd, page);
    }
-    if (res.status === 403) return [];
-    // Don't throw for transient errors — just return empty
+    if (res.status === 403) {
+      // VQD token expired or DDG changed format — get a fresh token and retry
+      console.warn(`    ⚠ DDG 403 on page ${page} — refreshing VQD token...`);
+      try {
+        const freshVqd = await getVqdToken(query);
+        await ddgLimiter.acquire();
+        const retryUrl = url.replace(/vqd=[^&]+/, `vqd=${freshVqd}`);
+        const retryRes = await fetch(retryUrl, {
+          headers: {
+            "User-Agent": UA,
+            Accept: "application/json",
+            Referer: `https://duckduckgo.com/?q=${encodeURIComponent(
+              query,
+            )}&t=h_&iax=images&ia=images`,
+          },
+          signal: AbortSignal.timeout(15_000),
+        });
+        if (retryRes.ok) {
+          const freshData = (await retryRes.json()) as { results: DuckDuckGoImageResult[] };
+          return freshData.results ?? [];
+        }
+      } catch {
+        // Fresh token also failed — give up on this page
+      }
+      return [];
+    }
+    console.warn(`    ⚠ DDG returned ${res.status} on page ${page}`);
    return [];
  }

@@ -510,17 +536,19 @@ async function downloadBatch(
 ): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
-  let index = startIndex;

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const results = await Promise.all(
-      chunk.map(async (url) => {
-        const paddedIndex = String(index).padStart(4, "0");
+      chunk.map(async (url, chunkIdx) => {
+        // Compute index deterministically BEFORE the async download starts,
+        // so all parallel callbacks get a unique index (no race condition).
+        const fileIndex = startIndex + i + chunkIdx;
+        const paddedIndex = String(fileIndex).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
-        return { success, index: index++ };
+        return { success, index: fileIndex };
      }),
    );

@@ -530,7 +558,7 @@ async function downloadBatch(
    }
  }

-  return { downloaded, failed, lastIndex: index };
+  return { downloaded, failed, lastIndex: startIndex + urls.length };
 }

 // ─── Query Building ─────────────────────────────────────────────────────────
@@ -592,7 +620,10 @@ async function fillClass(
    indexOffset: number,
  ): Promise<void> => {
    const result = await collector();
-    if (result.urls.length === 0) return;
+    if (result.urls.length === 0) {
+      console.log(`    ${label}: 0 URLs found`);
+      return;
+    }
    console.log(`    ${label}: ${result.urls.length} new URLs`);

    // Each source writes to its own non-overlapping range
@@ -788,7 +819,13 @@ async function main() {

            const classDir = resolve(DATASET_DIR, d.id);
            const queries = buildSearchQueries(d.name, d.plantId);
-            const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
+
+            // CRITICAL: Start with a FRESH empty set for within-run search dedup.
+            // DO NOT pre-load the persistent cache here — it already contains most
+            // of DDG's finite result set, so filtering against it left 0 new URLs per run.
+            // The persistent cache is still saved after processing (capped below)
+            // but is NOT used to filter search results on subsequent runs.
+            const seen = new Set<string>();

            console.log(
              `  [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
@@ -796,8 +833,11 @@ async function main() {

            const gained = await fillClass(d.id, queries, d.needed, classDir, seen);

-            // Update seen-URLs cache for this disease
-            seenUrlsCache[d.id] = Array.from(seen);
+            // Update seen-URLs cache for this disease — merge with the existing
+            // entries and keep only the newest 500 to prevent unbounded cache growth.
+            const existing = seenUrlsCache[d.id] ?? [];
+            const merged = [...new Set([...existing, ...Array.from(seen)])];
+            seenUrlsCache[d.id] = merged.slice(-500);
            return gained;
          })(),
        ),