flag impl fin

2026-06-06 17:22:31 -04:00
parent db4c656730
commit 96de91e86c
12 changed files with 1025 additions and 65 deletions
--- a/apps/web/scripts/scrape-training-dataset.ts
+++ b/apps/web/scripts/scrape-training-dataset.ts
@@ -20,7 +20,7 @@
 */

 import "dotenv/config";
-import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
+import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "fs";
 import { resolve, extname } from "path";

 // Load .env.development for DB creds
@@ -137,17 +137,32 @@ interface DuckDuckGoImageResult {
  width: number;
 }

+interface SourceState {
+  exhausted: boolean;
+}
+
 interface ClassProgress {
  count: number;
  downloaded: number;
  failed: number;
  seenUrls: string[];
  exhausted: boolean;
+  /** Per-source exhaustion tracking — prevents re-scraping exhausted sources on resume */
+  sources: {
+    db: SourceState;
+    duckduckgo: SourceState;
+    inaturalist: SourceState;
+    wikimedia: SourceState;
+  };
 }

 interface Progress {
  lastUpdated: string;
  classes: Record<string, ClassProgress>;
+  /** Phase checkpoint: 0=core, 1=full, 2=healthy, 3=all phases complete. On resume, skip to this phase. */
+  phase: number;
+  /** Index within the current phase's disease array. On resume, skip to this index. */
+  phaseIndex: number;
 }

 // ─── DB Loading ──────────────────────────────────────────────────────────────
@@ -358,7 +373,7 @@ async function searchImagesCommons(
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
-      origin: "*", // server-side API call
+      // No origin needed — server-side fetch, Wikimedia ignores CORS headers on API calls
    });

    const url = `https://commons.wikimedia.org/w/api.php?${params}`;
@@ -472,9 +487,40 @@ async function downloadBatch(

 function loadProgress(): Progress {
  if (!existsSync(PROGRESS_FILE)) {
-    return { lastUpdated: new Date().toISOString(), classes: {} };
+    return {
+      lastUpdated: new Date().toISOString(),
+      classes: {},
+      phase: 0,
+      phaseIndex: 0,
+    };
+  }
+  try {
+    const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
+    // Backward compat: ensure new fields exist
+    raw.phase ??= 0;
+    raw.phaseIndex ??= 0;
+    raw.classes ??= {};
+    // Ensure each class has the sources field
+    for (const key of Object.keys(raw.classes)) {
+      const cp = raw.classes[key] as Partial<ClassProgress>;
+      cp.sources ??= {
+        db: { exhausted: false },
+        duckduckgo: { exhausted: false },
+        inaturalist: { exhausted: false },
+        wikimedia: { exhausted: false },
+      };
+      cp.seenUrls ??= [];
+    }
+    return raw as Progress;
+  } catch {
+    console.warn("  ⚠ Corrupt progress file, starting fresh");
+    return {
+      lastUpdated: new Date().toISOString(),
+      classes: {},
+      phase: 0,
+      phaseIndex: 0,
+    };
  }
-  return JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Progress;
 }

 function saveProgress(progress: Progress): void {
@@ -490,6 +536,12 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
      failed: 0,
      seenUrls: [],
      exhausted: false,
+      sources: {
+        db: { exhausted: false },
+        duckduckgo: { exhausted: false },
+        inaturalist: { exhausted: false },
+        wikimedia: { exhausted: false },
+      },
    };
  }
  return progress.classes[classId];
@@ -514,6 +566,37 @@ function buildHealthyQueries(plant: string): string[] {
  ];
 }

+// ─── File Reconciliation ───────────────────────────────────────────────────
+
+/**
+ * Count actual image files in a class directory.
+ * Returns the count of files whose name starts with "img_", or 0 if the dir is missing or unreadable.
+ */
+function countImagesInDir(classDir: string): number {
+  if (!existsSync(classDir)) return 0;
+  try {
+    const files = readdirSync(classDir);
+    return files.filter((f) => f.startsWith("img_")).length;
+  } catch {
+    return 0;
+  }
+}
+
+/**
+ * Reconcile a class's progress count with actual files on disk.
+ * If files were deleted after the progress file was saved, this
+ * adjusts the count downward so we re-download the missing ones.
+ * Returns the reconciled count.
+ */
+function reconcileClassCount(classDir: string, progressCount: number): number {
+  const fileCount = countImagesInDir(classDir);
+  if (fileCount < progressCount) {
+    console.log(`    ↻ File count (${fileCount}) < progress count (${progressCount}) — reconciling`);
+    return fileCount;
+  }
+  return progressCount;
+}
+
 // ─── Dataset Collection ─────────────────────────────────────────────────────

 async function collectClassImages(
@@ -526,14 +609,32 @@ async function collectClassImages(
  fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
 ): Promise<void> {
  const cp = getClassProgress(progress, classId);
+
+  // ── Reconcile with actual files on disk ─────────────────────────────────
+  const actualCount = reconcileClassCount(classDir, cp.count);
+  if (actualCount !== cp.count) {
+    cp.count = actualCount;
+    saveProgress(progress);
+  }
+
  const seenUrls = new Set(cp.seenUrls);
+  const sources = cp.sources;

  if (cp.count >= target) {
    console.log(`  ✓ Already have ${cp.count}/${target}`);
    return;
  }

-  if (cp.exhausted) {
+  // Check if ALL sources are exhausted
+  const allExhausted =
+    sources.db.exhausted &&
+    sources.duckduckgo.exhausted &&
+    sources.inaturalist.exhausted &&
+    sources.wikimedia.exhausted;
+
+  if (allExhausted) {
+    cp.exhausted = true;
+    saveProgress(progress);
    console.log(`  ✓ Exhausted (${cp.count}/${target})`);
    return;
  }
@@ -541,73 +642,111 @@ async function collectClassImages(
  mkdirSync(classDir, { recursive: true });

  const allUrls: string[] = [];
-  let exhausted = false;
+  let anyNewResults = false;
+  const needed = target - cp.count;

  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
-  const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
-  if (freshDbUrls.length > 0) {
-    console.log(`  DB: ${freshDbUrls.length} existing URLs`);
-    for (const url of freshDbUrls) {
-      if (allUrls.length >= target) break;
-      seenUrls.add(url);
-      allUrls.push(url);
+  if (!sources.db.exhausted) {
+    const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
+    if (freshDbUrls.length > 0) {
+      console.log(`  DB: ${freshDbUrls.length} existing URLs`);
+      for (const url of freshDbUrls) {
+        if (allUrls.length >= needed) break;
+        seenUrls.add(url);
+        allUrls.push(url);
+      }
+      if (freshDbUrls.length > 0) anyNewResults = true;
    }
+    // DB source is always "exhausted" after processing its initial URLs
+    sources.db.exhausted = true;
  }

  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
-  // Skip DDG in fast mode (full set — DDG is slowest source)
-  if (!fastMode && allUrls.length < target) {
+  if (!fastMode && !sources.duckduckgo.exhausted && allUrls.length < needed) {
    for (const query of queries) {
-      if (allUrls.length >= target) break;
+      if (allUrls.length >= needed) break;
      process.stdout.write(`  DDG: "${query.substring(0, 40)}"... `);
-      const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
+      const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
      allUrls.push(...result.urls);
-      if (result.exhausted) exhausted = true;
+      if (result.exhausted) {
+        sources.duckduckgo.exhausted = true;
+      }
+      if (result.urls.length > 0) anyNewResults = true;
      console.log(`${result.urls.length} new`);
-      if (allUrls.length >= target) break;
+      if (allUrls.length >= needed) break;
+    }
+    // No-op placeholder: DDG is only marked exhausted inside the loop, when a query reports it
+    if (!anyNewResults && sources.duckduckgo.exhausted) {
+      /* already marked */
    }
  }

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
-  if (allUrls.length < target) {
+  if (!sources.inaturalist.exhausted && allUrls.length < needed) {
    const primaryQuery = queries[0];
    console.log(`  iNat: Searching...`);
-    const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
+    const result = await searchImagesInaturalist(primaryQuery, needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
-    if (result.exhausted) exhausted = true;
+    if (result.exhausted) sources.inaturalist.exhausted = true;
+    if (result.urls.length > 0) anyNewResults = true;
    console.log(`  iNat: ${result.urls.length} images`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
-  if (allUrls.length < target) {
+  if (!sources.wikimedia.exhausted && allUrls.length < needed) {
    const primaryQuery = queries[0];
    console.log(`  Commons: Searching...`);
-    const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
+    const result = await searchImagesCommons(primaryQuery, needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
-    if (result.exhausted) exhausted = true;
+    if (result.exhausted) sources.wikimedia.exhausted = true;
+    if (result.urls.length > 0) anyNewResults = true;
    console.log(`  Commons: ${result.urls.length} images`);
  }

  if (allUrls.length === 0) {
-    cp.exhausted = exhausted;
+    cp.exhausted = true;
    saveProgress(progress);
-    console.log(`  ✗ No images found`);
+    console.log(`  ✗ No images found — exhausted`);
    return;
  }

+  if (!anyNewResults && allUrls.length > 0) {
+    // Only DB URLs survived — nothing more will come from searches
+    cp.exhausted = true;
+    saveProgress(progress);
+  }
+
  // Save progress with seen URLs BEFORE downloading
  cp.seenUrls = Array.from(seenUrls);
-  cp.exhausted = exhausted;
  saveProgress(progress);

  console.log(`  Downloading ${allUrls.length} images...`);

-  const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);
+  // Use actual file count as start index so filenames don't have gaps
+  const startIndex = countImagesInDir(classDir);
+  const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);

-  cp.count += downloaded;
+  // Re-count actual files on disk after download (more reliable than tracking)
+  const newTotal = countImagesInDir(classDir);
+  cp.count = newTotal;
  cp.downloaded += downloaded;
  cp.failed += failed;

+  // Check if all sources exhausted
+  if (
+    sources.db.exhausted &&
+    sources.duckduckgo.exhausted &&
+    sources.inaturalist.exhausted &&
+    sources.wikimedia.exhausted
+  ) {
+    cp.exhausted = true;
+  }
+
+  // Reaching the target also counts as exhausted — nothing more to fetch for this class
+  if (cp.count >= target) {
+    cp.exhausted = true;
+  }
+
  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
@@ -645,7 +784,12 @@ async function main() {
  console.log("PHASE 1: Core Diseases (100 images each)");
  console.log("─".repeat(60));

-  for (let i = 0; i < coreDiseases.length; i++) {
+  const coreStart = progress.phase === 0 ? progress.phaseIndex : 0;
+  if (coreStart > 0) {
+    console.log(`  Resuming from disease #${coreStart + 1} (${((coreStart / coreDiseases.length) * 100).toFixed(0)}% done)`);
+  }
+
+  for (let i = coreStart; i < coreDiseases.length; i++) {
    const d = coreDiseases[i];
    const classDir = resolve(DATASET_DIR, d.id);
    const queries = buildSearchQueries(d);
@@ -655,6 +799,11 @@ async function main() {
    console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);

    await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
+
+    // Save checkpoint: phase 0, resume at index i + 1 (entry i is done)
+    progress.phase = 0;
+    progress.phaseIndex = i + 1;
+    saveProgress(progress);
  }

  // ── Phase 2: Full set ──────────────────────────────────────────────────
@@ -663,7 +812,12 @@ async function main() {
  console.log("PHASE 2: Full Disease Set (10 images each)");
  console.log("─".repeat(60));

-  for (let i = 0; i < fullDiseases.length; i++) {
+  const fullStart = progress.phase === 1 ? progress.phaseIndex : 0;
+  if (fullStart > 0) {
+    console.log(`  Resuming from disease #${fullStart + 1} (${((fullStart / fullDiseases.length) * 100).toFixed(0)}% done)`);
+  }
+
+  for (let i = fullStart; i < fullDiseases.length; i++) {
    const d = fullDiseases[i];
    const classDir = resolve(DATASET_DIR, d.id);
    const queries = buildSearchQueries(d);
@@ -673,6 +827,11 @@ async function main() {
    console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);

    await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
+
+    // Save checkpoint: phase 1, resume at index i + 1 (entry i is done)
+    progress.phase = 1;
+    progress.phaseIndex = i + 1;
+    saveProgress(progress);
  }

  // ── Phase 3: Healthy class ──────────────────────────────────────────────
@@ -683,6 +842,14 @@ async function main() {

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const healthyCp = getClassProgress(progress, HEALTHY_CLASS);
+
+  // Reconcile healthy class with files on disk
+  const healthyActualCount = reconcileClassCount(healthyDir, healthyCp.count);
+  if (healthyActualCount !== healthyCp.count) {
+    healthyCp.count = healthyActualCount;
+    saveProgress(progress);
+  }
+
  const healthySeen = new Set(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
@@ -730,16 +897,23 @@ async function main() {
      saveProgress(progress);

      console.log(`\n  Downloading ${totalHealthyUrls.length} healthy images...`);
+      const healthyStartIndex = countImagesInDir(healthyDir);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
-        healthyCp.count,
+        healthyStartIndex,
      );

-      healthyCp.count += downloaded;
+      // Re-count actual files on disk
+      const newHealthyTotal = countImagesInDir(healthyDir);
+      healthyCp.count = newHealthyTotal;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;

+      if (healthyCp.count >= TARGET_HEALTHY) {
+        healthyCp.exhausted = true;
+      }
+
      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
        `  Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
@@ -753,6 +927,11 @@ async function main() {

  // ── Summary ────────────────────────────────────────────────────────────────

+  // Mark all phases complete
+  progress.phase = 3;
+  progress.phaseIndex = 0;
+  saveProgress(progress);
+
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);
@@ -765,7 +944,7 @@ async function main() {
  }

  console.log("\n" + "=".repeat(60));
-  console.log("COMPLETE");
+  console.log("  ✅ ALL PHASES COMPLETE");
  console.log("=".repeat(60));
  console.log(`  Time:       ${hrs}h ${mins % 60}m`);
  console.log(`  Downloaded: ${totalDownloaded} images`);