flagging

2026-06-06 17:02:45 -04:00
parent 47609e5e42
commit db4c656730
22 changed files with 6195 additions and 326 deletions
--- a/apps/web/scripts/scrape-training-dataset.ts
+++ b/apps/web/scripts/scrape-training-dataset.ts
@@ -2,59 +2,113 @@
 /**
 * scrape-training-dataset.ts
 *
- * Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
+ * Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
 *
- * Targets:
- *   - 200 images per disease class (93 diseases)
- *   - 400 images for the "healthy" class
- *   - Full resolution images stored in data/dataset/{class_id}/
+ * Targets (tiered by plant type):
+ *   - Core plants (houseplants + common garden): 100 images per disease
+ *   - Full set (all 11,498 DB diseases): 10 images per disease
+ *   - Healthy: 400 images
 *
- * DuckDuckGo approach (no API key needed):
- *   1. Fetch the main search page to extract a vqd (query) token
- *   2. Use the vqd token to paginate through image results
- *   3. Download each image to the dataset directory
+ * Sources (all free, no API keys):
+ *   1. DB image_url — existing images already found
+ *   2. DuckDuckGo  — general web image search
+ *   3. iNaturalist — real-world plant observation photos
+ *   4. Wikimedia Commons — curated scientific/educational images
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
- *
- * Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
+ * Progress: data/dataset/.progress.json — interrupt and resume safely.
 */

 import "dotenv/config";
-import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
-import { resolve, extname, join } from "path";
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
+import { resolve, extname } from "path";
+
+// Load .env.development for DB creds. Variables that are already set in
+// process.env (from the shell, or from "dotenv/config" above) are never overwritten.
+// NOTE(review): static imports are hoisted under ESM semantics, so modules imported
+// below may be evaluated before this block runs — confirm the db module reads its
+// credentials lazily (inside getDb()) rather than at import time.
+const envPath = resolve(__dirname, "../.env.development");
+try {
+  const envText = readFileSync(envPath, "utf-8");
+  for (const rawLine of envText.split(/\r?\n/)) {
+    const line = rawLine.trim();
+    // Skip blank lines and comments.
+    if (!line || line.startsWith("#")) continue;
+
+    const eqIdx = line.indexOf("=");
+    // eqIdx === -1: not a KEY=VALUE line; eqIdx === 0: empty key.
+    if (eqIdx <= 0) continue;
+
+    const key = line.slice(0, eqIdx).trim();
+    let val = line.slice(eqIdx + 1).trim();
+
+    // Strip one pair of matching surrounding quotes (KEY="value" / KEY='value');
+    // otherwise the quote characters become part of the credential.
+    const quote = val.charAt(0);
+    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
+      val = val.slice(1, -1);
+    }
+
+    if (!process.env[key]) process.env[key] = val;
+  }
+} catch (err) {
+  // The file is optional, so a missing file is silently ignored; any other
+  // failure (permissions, I/O) is reported instead of being swallowed.
+  if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
+    console.warn(
+      `⚠ Could not read ${envPath}: ${err instanceof Error ? err.message : "unknown"}`,
+    );
+  }
+}
+
+import { getDb, closeDb } from "@/lib/db/index";
+import { diseases } from "@/lib/db/schema";

 // ─── Config ─────────────────────────────────────────────────────────────────

-const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
-const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");
-
 const DATASET_DIR = resolve(__dirname, "../data/dataset");
 const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

-/** Target images per disease class */
-const TARGET_PER_DISEASE = 200;
+/** Target images per disease for CORE plants */
+const TARGET_CORE = 100;

-/** Target images for the "healthy" class (2× normal) */
+/** Target images per disease for the FULL set */
+const TARGET_FULL = 10;
+
+/** Target images for the "healthy" class */
 const TARGET_HEALTHY = 400;

+/**
+ * Plant IDs treated as "core". Diseases whose `plantId` is in this set are
+ * collected with the higher TARGET_CORE image target; all other diseases use
+ * TARGET_FULL. Membership is an exact string match against `plantId`.
+ */
+const CORE_PLANTS = new Set([
+  // Houseplants
+  "monstera",
+  "pothos",
+  "snake-plant",
+  "peace-lily",
+  "orchid",
+  "succulent",
+  "fiddle-leaf-fig",
+  "aloe-vera",
+  "cactus",
+  "fern",
+  // Garden plants
+  "tomato",
+  "basil",
+  "rose",
+  "pepper",
+  "strawberry",
+  "cucumber",
+  "squash",
+  "lettuce",
+  "spinach",
+  "cabbage",
+  "lavender",
+  "mint",
+  "jasmine",
+  "sunflower",
+  "daisy",
+  "zucchini",
+  "bean",
+  "eggplant",
+  "chili",
+  // General disease patterns
+  "general",
+]);
+
 /** Delay between DuckDuckGo search API calls (ms) */
 const SEARCH_DELAY = 1500;

 /** Delay between image downloads (ms) */
-const DOWNLOAD_DELAY = 300;
+const DOWNLOAD_DELAY = 100;

 /** Max concurrent downloads */
-const CONCURRENT_DOWNLOADS = 5;
+const CONCURRENT_DOWNLOADS = 10;

-/** Minimum image size in bytes to accept (reject tiny placeholders) */
+/** Minimum image size in bytes to accept */
 const MIN_IMAGE_SIZE = 10_000; // 10KB

 /** Maximum image size in bytes */
 const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

-/** Allowed image content types */
-const ALLOWED_CONTENT_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/webp", "image/gif"];
-
 /** Allowed file extensions */
 const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

@@ -62,22 +116,16 @@ const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
 const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

+/** Class ID for healthy plants */
+const HEALTHY_CLASS = "healthy";
+
 // ─── Types ──────────────────────────────────────────────────────────────────

-interface DiseaseSeed {
+/** Shape of one disease row as selected from the database by this script. */
+interface DbDisease {
  id: string;
  plantId: string;
  name: string;
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  [key: string]: any;
-}
-
-interface PlantSeed {
-  id: string;
-  commonName: string;
-  scientificName: string;
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  [key: string]: any;
+  /** Image URL already stored for this disease, or null when there is none. */
+  imageUrl: string | null;
 }

 interface DuckDuckGoImageResult {
@@ -93,10 +141,7 @@ interface ClassProgress {
  count: number;
  downloaded: number;
  failed: number;
-  skipped: number;
-  /** URLs we've already seen (to avoid duplicates) */
  seenUrls: string[];
-  /** Whether we've exhausted search results */
  exhausted: boolean;
 }

@@ -105,15 +150,27 @@ interface Progress {
  classes: Record<string, ClassProgress>;
 }

-/** Class ID for healthy plants */
-const HEALTHY_CLASS = "healthy";
+// ─── DB Loading ──────────────────────────────────────────────────────────────
+
+/**
+ * Fetch every disease from the database, ordered by id.
+ *
+ * Only the columns this script needs are selected: id, plantId, name and the
+ * existing imageUrl.
+ *
+ * @returns One DbDisease per selected row.
+ */
+async function loadDiseasesFromDb(): Promise<DbDisease[]> {
+  const columns = {
+    id: diseases.id,
+    plantId: diseases.plantId,
+    name: diseases.name,
+    imageUrl: diseases.imageUrl,
+  };
+
+  return getDb().select(columns).from(diseases).orderBy(diseases.id);
+}

 // ─── DuckDuckGo API ─────────────────────────────────────────────────────────

-/**
- * Extract the vqd token from DuckDuckGo's search page.
- * Required for paginating image results.
- */
 async function getVqdToken(query: string): Promise<string> {
  const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

@@ -122,25 +179,15 @@ async function getVqdToken(query: string): Promise<string> {
    signal: AbortSignal.timeout(15_000),
  });

-  if (!res.ok) {
-    throw new Error(`Failed to get vqd token: ${res.status}`);
-  }
+  if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);

  const html = await res.text();
-
-  // Extract vqd token from the HTML
-  // Format: vqd='<token>' or vqd="<token>"
  const match = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/);
-  if (!match) {
-    throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
-  }
+  if (!match) throw new Error(`Could not extract vqd token for "${query}"`);

  return match[1];
 }

-/**
- * Fetch a page of DuckDuckGo image results.
- */
 async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
@@ -161,12 +208,9 @@ async function searchImagesDuckDuckGo(
    if (res.status === 429) {
      console.warn("  ⚠ Rate limited (429). Waiting 10s...");
      await sleep(10_000);
-      return searchImagesDuckDuckGo(query, vqd, page); // Retry
-    }
-    if (res.status === 403) {
-      console.warn("  ⚠ Forbidden (403). Token may have expired.");
-      return []; // Token expired — no more pages
+      return searchImagesDuckDuckGo(query, vqd, page);
    }
+    if (res.status === 403) return [];
    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }

@@ -174,11 +218,7 @@ async function searchImagesDuckDuckGo(
  return data.results ?? [];
 }

-/**
- * Search DuckDuckGo images, automatically paginating to collect up to `target` results.
- * Returns unique image URLs.
- */
-async function collectImages(
+async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
@@ -188,27 +228,29 @@ async function collectImages(
  let exhausted = false;
  let consecutiveEmpty = 0;

-  // Get vqd token
  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
-    console.warn(`  ⚠ Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
+    console.warn(`  ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

-  while (results.length < target) {
+  const MAX_PAGES = 5;
+  let lowNoveltyCount = 0;
+
+  while (results.length < target && page <= MAX_PAGES) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
-      console.warn(`  ⚠ Search error: ${err instanceof Error ? err.message : "unknown"}`);
+      console.warn(`  ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

-    if (pageResults.length === 0) {
+    if (!pageResults || pageResults.length === 0) {
      consecutiveEmpty++;
      if (consecutiveEmpty >= 3) {
        exhausted = true;
@@ -223,78 +265,160 @@ async function collectImages(

    for (const r of pageResults) {
      if (results.length >= target) break;
-
      const imgUrl = r.image || r.url;
-
-      // Skip if we've already seen this URL
+      if (!imgUrl || typeof imgUrl !== "string") continue;
      if (seenUrls.has(imgUrl)) continue;
-
-      // Validate URL looks like an image
-      const ext = extname(new URL(imgUrl).pathname).toLowerCase();
-      if (!ALLOWED_EXTENSIONS.includes(ext) && !ext) {
-        // No extension - still try, could be a CDN URL
+      try {
+        new URL(imgUrl);
+      } catch {
+        continue;
      }
-
      seenUrls.add(imgUrl);
      results.push(imgUrl);
      newCount++;
    }

-    if (newCount === 0 && pageResults.every((r) => seenUrls.has(r.image || r.url))) {
-      // All results on this page were already seen
-      page++;
-      continue;
+    const newRatio = newCount / pageResults.length;
+    if (newRatio < 0.05) {
+      lowNoveltyCount++;
+      if (lowNoveltyCount >= 2) break;
+    } else {
+      lowNoveltyCount = 0;
    }

-    if (results.length < target) {
-      page++;
-    }
+    if (results.length < target) page++;
  }

  return { urls: results.slice(0, target), exhausted };
 }

+// ─── iNaturalist API ─────────────────────────────────────────────────────────
+
+/**
+ * Search iNaturalist research-grade observations and collect photo URLs.
+ *
+ * Issues a single request (no pagination) for up to min(target, 200)
+ * observations, newest first, and rewrites every photo URL to its "original"
+ * size variant before de-duplicating it.
+ *
+ * @param query    Free-text search, sent as the `q` parameter.
+ * @param target   Maximum number of URLs to return; values <= 0 return immediately.
+ * @param seenUrls URLs collected so far. Mutated: every returned URL is added.
+ * @returns The newly found URLs. `exhausted` is true when the request succeeded
+ *          but produced fewer than `target` new URLs; HTTP and network failures
+ *          report `exhausted: false`.
+ */
+async function searchImagesInaturalist(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+
+  // Nothing requested: avoid sending a request with per_page=0.
+  if (target <= 0) return { urls: results, exhausted: false };
+
+  // Photo URLs end in "/<size>.<ext>". The API usually hands back the small
+  // `square` thumbnail, so every known size variant is upgraded, not only `medium`.
+  const sizeVariant = /\/(?:square|thumb|small|medium|large)\./;
+
+  const perPage = Math.min(target, 200);
+
+  const apiUrl =
+    `https://api.inaturalist.org/v1/observations` +
+    `?q=${encodeURIComponent(query)}` +
+    `&photos_only=true` +
+    `&quality_grade=research` +
+    `&per_page=${perPage}` +
+    `&order_by=observed_on&order=desc`;
+
+  try {
+    const res = await fetch(apiUrl, {
+      headers: { "User-Agent": UA, Accept: "application/json" },
+      signal: AbortSignal.timeout(15_000),
+    });
+    if (!res.ok) return { urls: [], exhausted: false };
+
+    const data = (await res.json()) as {
+      results?: Array<{ photos?: Array<{ url?: string }> }>;
+    };
+
+    for (const obs of data.results ?? []) {
+      if (results.length >= target) break;
+      for (const photo of obs.photos ?? []) {
+        if (results.length >= target) break;
+        if (!photo.url) continue;
+
+        // De-duplicate on the rewritten URL, because that is what seenUrls stores.
+        const fullUrl = photo.url.replace(sizeVariant, "/original.");
+        if (seenUrls.has(fullUrl)) continue;
+        seenUrls.add(fullUrl);
+        results.push(fullUrl);
+      }
+    }
+
+    return { urls: results, exhausted: results.length < target };
+  } catch (err) {
+    // Timeout, network failure or malformed JSON: keep what was gathered and
+    // leave the source marked as not exhausted.
+    console.warn(`  ⚠ iNat error: ${err instanceof Error ? err.message : "unknown"}`);
+    return { urls: results, exhausted: false };
+  }
+}
+
+// ─── Wikimedia Commons API ──────────────────────────────────────────────────
+
+/**
+ * Search Wikimedia Commons file pages and collect direct image URLs.
+ *
+ * Pages through the MediaWiki search API 50 hits at a time until `target`
+ * URLs are collected, the API reports no further pages, or a request fails.
+ * Each hit is turned into a `Special:FilePath/<filename>` URL.
+ *
+ * @param query    Search string, sent as `srsearch`.
+ * @param target   Maximum number of URLs to return.
+ * @param seenUrls URLs collected so far. Mutated: every returned URL is added.
+ * @returns The newly found URLs and `exhausted: true` when fewer than `target`
+ *          were found.
+ */
+async function searchImagesCommons(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+  let sroffset = 0;
+
+  while (results.length < target) {
+    const params = new URLSearchParams({
+      action: "query",
+      list: "search",
+      srsearch: query,
+      srnamespace: "6", // 6 = the File: namespace
+      srlimit: "50",
+      sroffset: String(sroffset),
+      format: "json",
+      origin: "*", // anonymous-CORS flag; not needed for a server-side call, but harmless
+    });
+
+    const url = `https://commons.wikimedia.org/w/api.php?${params.toString()}`;
+
+    try {
+      const res = await fetch(url, {
+        headers: { "User-Agent": UA },
+        signal: AbortSignal.timeout(10_000),
+      });
+      if (!res.ok) break;
+
+      const data = (await res.json()) as {
+        query?: { search?: Array<{ title: string }> };
+        continue?: { sroffset?: number };
+      };
+
+      const hits = data.query?.search ?? [];
+      if (hits.length === 0) break;
+
+      for (const hit of hits) {
+        if (results.length >= target) break;
+        const filename = hit.title.replace(/^File:/, "");
+
+        // The File: namespace also holds PDFs, SVGs, audio and video. Skip
+        // anything that is not one of the raster formats this dataset accepts.
+        if (!ALLOWED_EXTENSIONS.includes(extname(filename).toLowerCase())) continue;
+
+        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
+        if (seenUrls.has(imgUrl)) continue;
+        seenUrls.add(imgUrl);
+        results.push(imgUrl);
+      }
+
+      // The API omits `continue` on the last page; stop instead of issuing a
+      // request that can only come back empty.
+      const nextOffset = data.continue?.sroffset;
+      if (nextOffset === undefined) break;
+      sroffset = nextOffset;
+    } catch (err) {
+      console.warn(`  ⚠ Commons error: ${err instanceof Error ? err.message : "unknown"}`);
+      break;
+    }
+  }
+
+  return { urls: results, exhausted: results.length < target };
+}
+
 // ─── Image Download ─────────────────────────────────────────────────────────

-/**
- * Download a single image from a URL to the target path.
- * Returns true if successful, false otherwise.
- */
 async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
-      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
+      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
-
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
-    const contentLength = parseInt(res.headers.get("content-length") || "0", 10);
-
-    // Validate content type
-    if (!ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t))) {
-      return false;
-    }
-
-    // Validate size
-    if (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) return false;
-    if (contentLength > MAX_IMAGE_SIZE) return false;
+    if (contentType.includes("text/html")) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
-
-    // Double-check actual buffer size
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

-    // Determine correct extension from content type or URL
    let ext = extname(new URL(url).pathname).toLowerCase();
    if (!ALLOWED_EXTENSIONS.includes(ext)) {
-      // Map from content type
      if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
      else if (contentType.includes("png")) ext = ".png";
      else if (contentType.includes("webp")) ext = ".webp";
-      else ext = ".jpg"; // Default
+      else ext = ".jpg";
    }

    const filePath = destPath.replace(/\.\w+$/, ext);
@@ -305,9 +429,6 @@ async function downloadImage(url: string, destPath: string): Promise<boolean> {
  }
 }

-/**
- * Download multiple images concurrently, respecting a per-download delay.
- */
 async function downloadBatch(
  urls: string[],
  classDir: string,
@@ -317,7 +438,6 @@ async function downloadBatch(
  let failed = 0;
  let index = startIndex;

-  // Process in chunks to control concurrency
  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

@@ -325,16 +445,23 @@ async function downloadBatch(
      chunk.map(async (url) => {
        const paddedIndex = String(index).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
-
        const success = await downloadImage(url, destPath);
        await sleep(DOWNLOAD_DELAY);
-        return { success, index: index++ };
+        return { success, index: index++, url: url.substring(0, 50) };
      }),
    );

    for (const r of results) {
      if (r.success) downloaded++;
-      else failed++;
+      else {
+        failed++;
+        if (failed % 20 === 1) console.log(`    ⚠ Failed: ${r.url}...`);
+      }
+    }
+
+    const total = downloaded + failed;
+    if (total % 30 === 0 || total === urls.length) {
+      console.log(`    Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }

@@ -361,7 +488,6 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
      count: 0,
      downloaded: 0,
      failed: 0,
-      skipped: 0,
      seenUrls: [],
      exhausted: false,
    };
@@ -369,26 +495,22 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
  return progress.classes[classId];
 }

-// ─── Search Query Building ──────────────────────────────────────────────────
+// ─── Query Building ─────────────────────────────────────────────────────────

-function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
-  const name = disease.name;
-  const plantName = plant?.commonName || disease.plantId;
-
-  return [
-    `${name} ${plantName} leaf disease`,
-    `${plantName} ${name} symptoms`,
-    `${name} plant disease`,
-    `${plantName} diseased leaf`,
-  ];
+/**
+ * Build the ordered image-search queries for one disease.
+ *
+ * The disease label is `disease.name`, falling back to the id with hyphens
+ * turned into spaces when the name is empty. The plant label is the plantId
+ * with hyphens turned into spaces.
+ *
+ * @returns Three queries, most specific first.
+ */
+function buildSearchQueries(disease: DbDisease): string[] {
+  const diseaseLabel = disease.name || disease.id.replace(/-/g, " ");
+  const plantLabel = disease.plantId.replace(/-/g, " ");
+
+  // Every query keeps the disease NAME to avoid noisy labels
+  const leafQuery = `${diseaseLabel} ${plantLabel} leaf disease`;
+  const symptomQuery = `${plantLabel} ${diseaseLabel} symptoms`;
+  const bareQuery = `${diseaseLabel} ${plantLabel}`;
+
+  return [leafQuery, symptomQuery, bareQuery];
 }

-function buildHealthyQueries(plant: PlantSeed): string[] {
+/**
+ * Build the image-search queries for healthy specimens of one plant.
+ *
+ * @param plant Plant identifier; hyphens are replaced with spaces before it
+ *              is embedded in the queries.
+ * @returns Four query strings for that plant.
+ */
+function buildHealthyQueries(plant: string): string[] {
+  const name = plant.replace(/-/g, " ");
  return [
-    `healthy ${plant.commonName} leaf`,
-    `${plant.commonName} leaf closeup`,
-    `healthy ${plant.commonName} plant`,
-    `${plant.commonName} foliage`,
+    `healthy ${name} leaf`,
+    `${name} leaf closeup`,
+    `healthy ${name} plant`,
+    `${name} foliage`,
  ];
 }

@@ -400,64 +522,97 @@ async function collectClassImages(
  target: number,
  progress: Progress,
  classDir: string,
+  existingUrls: string[] = [],
+  fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
 ): Promise<void> {
  const cp = getClassProgress(progress, classId);
  const seenUrls = new Set(cp.seenUrls);

  if (cp.count >= target) {
-    console.log(`  ✓ Already have ${cp.count}/${target} images`);
+    console.log(`  ✓ Already have ${cp.count}/${target}`);
    return;
  }

  if (cp.exhausted) {
-    console.log(`  ✓ Already exhausted search results (${cp.count}/${target} images)`);
+    console.log(`  ✓ Exhausted (${cp.count}/${target})`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

-  const totalUrls: string[] = [];
+  const allUrls: string[] = [];
  let exhausted = false;

-  // Search with each query until we hit the target
-  for (const query of queries) {
-    if (totalUrls.length >= target) break;
-
-    console.log(`  Searching: "${query}"...`);
-    const result = await collectImages(query, target - totalUrls.length, seenUrls);
-
-    totalUrls.push(...result.urls);
-    cp.seenUrls = Array.from(seenUrls);
-
-    if (result.exhausted) {
-      exhausted = true;
+  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
+  const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
+  if (freshDbUrls.length > 0) {
+    console.log(`  DB: ${freshDbUrls.length} existing URLs`);
+    for (const url of freshDbUrls) {
+      if (allUrls.length >= target) break;
+      seenUrls.add(url);
+      allUrls.push(url);
    }
-
-    if (totalUrls.length >= target) break;
  }

-  if (totalUrls.length === 0) {
+  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
+  // Skip DDG in fast mode (full set — DDG is slowest source)
+  if (!fastMode && allUrls.length < target) {
+    for (const query of queries) {
+      if (allUrls.length >= target) break;
+      process.stdout.write(`  DDG: "${query.substring(0, 40)}"... `);
+      const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
+      allUrls.push(...result.urls);
+      if (result.exhausted) exhausted = true;
+      console.log(`${result.urls.length} new`);
+      if (allUrls.length >= target) break;
+    }
+  }
+
+  // ── Source 2: iNaturalist ──────────────────────────────────────────────
+  if (allUrls.length < target) {
+    const primaryQuery = queries[0];
+    console.log(`  iNat: Searching...`);
+    const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    if (result.exhausted) exhausted = true;
+    console.log(`  iNat: ${result.urls.length} images`);
+  }
+
+  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
+  if (allUrls.length < target) {
+    const primaryQuery = queries[0];
+    console.log(`  Commons: Searching...`);
+    const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    if (result.exhausted) exhausted = true;
+    console.log(`  Commons: ${result.urls.length} images`);
+  }
+
+  if (allUrls.length === 0) {
    cp.exhausted = exhausted;
    saveProgress(progress);
-    console.log(`  ✗ No images found for "${classId}"`);
+    console.log(`  ✗ No images found`);
    return;
  }

-  console.log(`  Found ${totalUrls.length} unique image URLs. Downloading...`);
+  // Save progress with seen URLs BEFORE downloading
+  cp.seenUrls = Array.from(seenUrls);
+  cp.exhausted = exhausted;
+  saveProgress(progress);

-  // Download the images
-  const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);
+  console.log(`  Downloading ${allUrls.length} images...`);
+
+  const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);

  cp.count += downloaded;
  cp.downloaded += downloaded;
  cp.failed += failed;
-  cp.exhausted = exhausted;

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
-    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
+    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
 }

@@ -465,25 +620,18 @@ async function collectClassImages(

 async function main() {
  console.log("=".repeat(60));
-  console.log("PLANT DISEASE DATASET COLLECTOR");
+  console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
  console.log("=".repeat(60));

-  // Load knowledge base
-  const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
-  const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];
+  // Load diseases from DB
+  console.log("\nLoading diseases from database...");
+  const dbDiseases = await loadDiseasesFromDb();
+  console.log(`  ${dbDiseases.length} diseases loaded`);

-  const plantMap = new Map<string, PlantSeed>();
-  for (const p of plants) {
-    plantMap.set(p.id, p);
-  }
-
-  console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
-  console.log(
-    `Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
-  );
-  console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
-  console.log(`Output: ${DATASET_DIR}/`);
-  console.log("");
+  const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
+  const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
+  console.log(`  Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
+  console.log(`  Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);

  // Load progress
  mkdirSync(DATASET_DIR, { recursive: true });
@@ -491,28 +639,46 @@ async function main() {

  const startTime = Date.now();

-  // ── Phase 1: Disease classes ──────────────────────────────────────────────
-
-  console.log("─".repeat(60));
-  console.log("PHASE 1: Disease Images");
-  console.log("─".repeat(60));
-
-  for (let i = 0; i < diseases.length; i++) {
-    const disease = diseases[i];
-    const plant = plantMap.get(disease.plantId) ?? null;
-    const classDir = resolve(DATASET_DIR, disease.id);
-    const queries = buildSearchQueries(disease, plant);
-
-    const pct = Math.round((i / diseases.length) * 100);
-    console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);
-
-    await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
-  }
-
-  // ── Phase 2: Healthy class ────────────────────────────────────────────────
+  // ── Phase 1: Core set ──────────────────────────────────────────────────

  console.log("\n" + "─".repeat(60));
-  console.log("PHASE 2: Healthy Plant Images");
+  console.log("PHASE 1: Core Diseases (100 images each)");
+  console.log("─".repeat(60));
+
+  for (let i = 0; i < coreDiseases.length; i++) {
+    const d = coreDiseases[i];
+    const classDir = resolve(DATASET_DIR, d.id);
+    const queries = buildSearchQueries(d);
+    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
+
+    const pct = Math.round((i / coreDiseases.length) * 100);
+    console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
+
+    await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
+  }
+
+  // ── Phase 2: Full set ──────────────────────────────────────────────────
+
+  console.log("\n" + "─".repeat(60));
+  console.log("PHASE 2: Full Disease Set (10 images each)");
+  console.log("─".repeat(60));
+
+  for (let i = 0; i < fullDiseases.length; i++) {
+    const d = fullDiseases[i];
+    const classDir = resolve(DATASET_DIR, d.id);
+    const queries = buildSearchQueries(d);
+    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
+
+    const pct = Math.round((i / fullDiseases.length) * 100);
+    console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
+
+    await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
+  }
+
+  // ── Phase 3: Healthy class ──────────────────────────────────────────────
+
+  console.log("\n" + "─".repeat(60));
+  console.log("PHASE 3: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
@@ -520,39 +686,50 @@ async function main() {
  const healthySeen = new Set(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
-    console.log(`\n  ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY} healthy images`);
+    console.log(`\n  ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
  } else {
-    // Build a pool of healthy plant queries
+    // Collect all unique plants
+    const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
-    for (const plant of plants) {
+    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

+    const healthySources = [
+      { name: "DDG", collector: collectImagesDuckDuckGo },
+      { name: "iNat", collector: searchImagesInaturalist },
+      { name: "Commons", collector: searchImagesCommons },
+    ] as const;
+
    const totalHealthyUrls: string[] = [];
-    let healthyExhausted = false;
+    let anyRemaining = false;

-    for (const query of allHealthyQueries) {
+    for (const source of healthySources) {
      if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
-      if (healthyExhausted) break;
+      console.log(`\n  Source: ${source.name}`);

-      console.log(`\n  Searching: "${query}"...`);
-      const result = await collectImages(
-        query,
-        TARGET_HEALTHY - totalHealthyUrls.length,
-        healthySeen,
-      );
+      for (const query of allHealthyQueries.slice(0, 20)) {
+        if (totalHealthyUrls.length >= TARGET_HEALTHY) break;

-      totalHealthyUrls.push(...result.urls);
-
-      if (result.exhausted) {
-        healthyExhausted = true;
+        process.stdout.write(`    "${query}"... `);
+        const result = await source.collector(
+          query,
+          TARGET_HEALTHY - totalHealthyUrls.length,
+          healthySeen,
+        );
+        totalHealthyUrls.push(...result.urls);
+        if (!result.exhausted) anyRemaining = true;
+        console.log(`${result.urls.length} new`);
      }
    }

    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
-      console.log(`\n  Found ${totalHealthyUrls.length} healthy image URLs. Downloading...`);
+      healthyCp.exhausted = !anyRemaining;
+      saveProgress(progress);
+
+      console.log(`\n  Downloading ${totalHealthyUrls.length} healthy images...`);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
@@ -562,14 +739,12 @@ async function main() {
      healthyCp.count += downloaded;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;
-      healthyCp.exhausted = healthyExhausted;

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
-        `  Got ${downloaded} images (${failed} failed). Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
+        `  Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
-      healthyCp.exhausted = true;
      console.log(`  ✗ No healthy images found`);
    }

@@ -580,76 +755,27 @@ async function main() {

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
-  const secs = elapsed % 60;
+  const hrs = Math.floor(mins / 60);

  let totalDownloaded = 0;
  let totalFailed = 0;
-  let totalTarget = 0;
-
-  for (const [classId, cp] of Object.entries(progress.classes)) {
+  for (const [, cp] of Object.entries(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
-    totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
  }

-  const totalSize = await getDatasetSize();
-  const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);
-
  console.log("\n" + "=".repeat(60));
  console.log("COMPLETE");
  console.log("=".repeat(60));
-  console.log(`  Time:       ${mins}m ${secs}s`);
+  console.log(`  Time:       ${hrs}h ${mins % 60}m`);
  console.log(`  Downloaded: ${totalDownloaded} images`);
  console.log(`  Failed:     ${totalFailed} images`);
-  console.log(`  Target:     ${totalTarget} images`);
-  console.log(`  Dataset size: ${sizeGb} GB`);
-  console.log(`  Dataset location: ${DATASET_DIR}/`);
-  console.log("");
-  console.log("Next steps:");
-  console.log("  1. Run the fine-tuning script to train on this dataset");
-  console.log("  2. The fine-tuning script will resize to 160×160 and augment");
+  console.log(`  Dataset:    ${DATASET_DIR}/`);
+
+  await closeDb();
  console.log("=".repeat(60));
 }

-/**
- * Calculate total size of the dataset directory.
- */
-async function getDatasetSize(): Promise<number> {
-  let total = 0;
-  if (!existsSync(DATASET_DIR)) return 0;
-
-  const entries = readdirSync(DATASET_DIR, { withFileTypes: true });
-
-  for (const entry of entries) {
-    if (!entry.name.startsWith(".")) {
-      const fullPath = resolve(DATASET_DIR, entry.name);
-      if (entry.isDirectory()) {
-        total += dirSize(fullPath);
-      }
-    }
-  }
-
-  return total;
-}
-
-function dirSize(dirPath: string): number {
-  let total = 0;
-  try {
-    const entries = readdirSync(dirPath, { withFileTypes: true });
-    for (const entry of entries) {
-      const fullPath = join(dirPath, entry.name);
-      if (entry.isFile()) {
-        total += statSync(fullPath).size;
-      } else if (entry.isDirectory()) {
-        total += dirSize(fullPath);
-      }
-    }
-  } catch {
-    // skip errors
-  }
-  return total;
-}
-
+/**
+ * Resolve after `ms` milliseconds; used to pace search and download requests.
+ * The executor parameter named `resolve` shadows the `resolve` imported from
+ * "path", but only inside this arrow function.
+ */
 function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
 }