flagging

2026-06-06 17:02:45 -04:00
parent 47609e5e42
commit db4c656730
22 changed files with 6195 additions and 326 deletions
--- a/apps/web/scripts/.brave-progress.json
+++ b/apps/web/scripts/.brave-progress.json
--- a/apps/web/scripts/.ddg-progress.json
+++ b/apps/web/scripts/.ddg-progress.json
--- a/apps/web/scripts/.flagged-content-review-needed.md
+++ b/apps/web/scripts/.flagged-content-review-needed.md
@@ -0,0 +1,11 @@
+# 🚩 Flagged Content Review — Nothing to Review
+
+Generated: 2026-06-06T21:02:03.301Z
+
+**No content has been flagged for review yet.**
+
+Flagged items will appear here once users flag content for manual review.
+
+---
+
+_Report generated with min-flags=1_
--- a/apps/web/scripts/apply-flag-migration.ts
+++ b/apps/web/scripts/apply-flag-migration.ts
@@ -0,0 +1,53 @@
+/**
+ * apply-flag-migration.ts
+ *
+ * Applies the flagged_content table migration to Turso.
+ * Run with: npx tsx scripts/apply-flag-migration.ts
+ */
+
+import dotenv from "dotenv";
+import path from "node:path";
+
+const envFile =
+  process.env.NODE_ENV === "production" ? "../.env.production" : "../.env.development";
+dotenv.config({ path: path.resolve(__dirname, envFile) });
+
+import { createClient } from "@libsql/client";
+
+/**
+ * Creates the `flagged_content` table and its two lookup indexes.
+ *
+ * Every statement uses `IF NOT EXISTS`, so running the script again against a
+ * database that already has the table changes nothing.
+ *
+ * @throws Error when DATABASE_URL is not set, or when any statement fails.
+ */
+async function main() {
+  const url = process.env.DATABASE_URL;
+  if (!url) {
+    // Fail with a readable message instead of handing `undefined` to the client.
+    throw new Error("DATABASE_URL is not set; cannot apply the flagged_content migration.");
+  }
+
+  const db = createClient({
+    url,
+    authToken: process.env.DATABASE_TOKEN,
+  });
+
+  try {
+    console.log("Applying migration: create flagged_content table...");
+
+    await db.execute(`
+    CREATE TABLE IF NOT EXISTS flagged_content (
+      id text PRIMARY KEY NOT NULL,
+      content_type text NOT NULL,
+      content_id text NOT NULL,
+      field_name text NOT NULL,
+      notes text DEFAULT '',
+      flag_count integer DEFAULT 1 NOT NULL,
+      created_at text DEFAULT (datetime('now')) NOT NULL,
+      updated_at text DEFAULT (datetime('now')) NOT NULL
+    )
+  `);
+
+    await db.execute(`
+    CREATE INDEX IF NOT EXISTS idx_flagged_content_type ON flagged_content (content_type)
+  `);
+
+    await db.execute(`
+    CREATE INDEX IF NOT EXISTS idx_flagged_content_id ON flagged_content (content_id)
+  `);
+
+    console.log("Migration applied successfully.");
+  } finally {
+    // Release the connection even when one of the statements throws.
+    db.close();
+  }
+}
+
+main().catch((err) => {
+  console.error("Migration failed:", err);
+  process.exit(1);
+});
--- a/apps/web/scripts/check-progress.mjs
+++ b/apps/web/scripts/check-progress.mjs
@@ -0,0 +1,19 @@
+import { createClient } from "@libsql/client";
+const c = createClient({
+  url: process.env.DATABASE_URL,
+  authToken: process.env.DATABASE_TOKEN,
+});
+const r = await c.execute("SELECT COUNT(*) as cnt FROM diseases");
+const r2 = await c.execute(
+  `SELECT SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has, SUM(CASE WHEN image_url IS NULL OR image_url = '' THEN 1 ELSE 0 END) as miss FROM diseases`,
+);
+const r3 = await c.execute(
+  `SELECT severity, COUNT(*) as total, SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has FROM diseases GROUP BY severity ORDER BY severity`,
+);
+console.log(
+  `Total: ${r.rows[0].cnt} | With images: ${r2.rows[0].has} | Missing: ${r2.rows[0].miss}`,
+);
+for (const row of r3.rows) {
+  console.log(`  ${row.severity?.padEnd(10)}: ${row.has}/${row.total}`);
+}
+c.close();
--- a/apps/web/scripts/generate-flagged-report.ts
+++ b/apps/web/scripts/generate-flagged-report.ts
@@ -0,0 +1,379 @@
+/**
+ * generate-flagged-report.ts
+ *
+ * Reads all flagged content from the database and generates a pretty
+ * markdown report organized by content type. The report includes:
+ *  - Summary table with counts per content type
+ *  - Plant images flagged for review
+ *  - Disease images flagged for review
+ *  - Disease symptoms flagged for review
+ *  - Disease causes flagged for review
+ *  - Disease treatment steps flagged for review
+ *  - Disease prevention tips flagged for review
+ *
+ * Usage:
+ *   npx tsx scripts/generate-flagged-report.ts [--min-flags=N] [--output=path/to/report.md]
+ *
+ * Options (values are passed with "=", e.g. --min-flags=2):
+ *   --min-flags=N     Minimum flag count to include (default: 1)
+ *   --output=PATH     Output path (default: scripts/.flagged-content-review-needed.md)
+ */
+
+import dotenv from "dotenv";
+import path from "node:path";
+
+// Load DB config from .env.development (or .env.production if NODE_ENV=production)
+const envFile =
+  process.env.NODE_ENV === "production" ? "../.env.production" : "../.env.development";
+dotenv.config({ path: path.resolve(__dirname, envFile) });
+import { createClient } from "@libsql/client";
+import fs from "node:fs";
+
+// ─── Config ─────────────────────────────────────────────────────────────────
+
+/**
+ * Returns the value of a `--name=value` or `--name value` command-line
+ * option, or undefined when the option is absent or has no value.
+ */
+function readCliOption(name: string): string | undefined {
+  const flag = `--${name}`;
+  const args = process.argv.slice(2);
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+    // Slice instead of split so values that contain "=" (e.g. paths) survive intact.
+    if (arg.startsWith(`${flag}=`)) return arg.slice(flag.length + 1) || undefined;
+    if (arg === flag) {
+      const next = args[i + 1];
+      return next !== undefined && !next.startsWith("--") ? next : undefined;
+    }
+  }
+  return undefined;
+}
+
+/** Minimum flag count an item needs to be included in the report (default: 1). */
+const MIN_FLAGS = (() => {
+  const raw = readCliOption("min-flags");
+  if (raw === undefined) return 1;
+  // Reject anything that is not a plain non-negative integer; parseInt alone
+  // would let "abc" through as NaN, which then ends up bound into the SQL query.
+  if (!/^\d+$/.test(raw)) {
+    throw new Error(`--min-flags must be a non-negative integer, got "${raw}"`);
+  }
+  return Number.parseInt(raw, 10);
+})();
+
+/** Where the markdown report is written (default: next to this script). */
+const OUTPUT_PATH =
+  readCliOption("output") ?? path.join(__dirname, ".flagged-content-review-needed.md");
+
+// ─── DB Connection ──────────────────────────────────────────────────────────
+
+const db = createClient({
+  url: process.env.DATABASE_URL!,
+  authToken: process.env.DATABASE_TOKEN!,
+});
+
+// ─── Types ──────────────────────────────────────────────────────────────────
+
+/** One row of the `flagged_content` table, as returned by `SELECT *`. */
+interface FlaggedRow {
+  id: string;
+  /** Kind of content that was flagged, e.g. "plant_image" or "disease_symptoms". */
+  content_type: string;
+  /** ID of the flagged plant or disease, depending on `content_type`. */
+  content_id: string;
+  field_name: string;
+  /** Free-text notes; the column has a default of '' but no NOT NULL constraint. */
+  notes: string | null;
+  flag_count: number;
+  created_at: string;
+  updated_at: string;
+}
+
+/**
+ * Columns selected from the `plants` table when resolving flagged items.
+ * NOTE(review): column nullability is not visible from this script — confirm
+ * against the schema before relying on these being non-null.
+ */
+interface PlantRow {
+  id: string;
+  common_name: string;
+  scientific_name: string;
+  family: string;
+  image_url: string;
+}
+
+/** Columns selected from the `diseases` table when resolving flagged items. */
+interface DiseaseRow {
+  id: string;
+  name: string;
+  scientific_name: string;
+  /** ID of the plant this disease entry belongs to. */
+  plant_id: string;
+  /** May be NULL or empty for diseases that have no image yet. */
+  image_url: string | null;
+}
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+
+/**
+ * Display metadata for each known `content_type` value: the emoji and title
+ * used for the section heading, and the blurb printed under it. Looked up by
+ * content type when the report is rendered.
+ */
+const CONTENT_TYPE_LABELS: Record<string, { emoji: string; title: string; description: string }> = {
+  plant_image: {
+    emoji: "🪴",
+    title: "Plant Images Flagged for Review",
+    description: "Plant images that users have flagged as potentially incorrect or low quality.",
+  },
+  disease_image: {
+    emoji: "📸",
+    title: "Disease Images Flagged for Review",
+    description:
+      "Disease symptom images that users have flagged as potentially incorrect or misleading.",
+  },
+  disease_symptoms: {
+    emoji: "⚠️",
+    title: "Disease Symptoms Flagged for Review",
+    description: "Symptom descriptions that users have flagged as potentially inaccurate.",
+  },
+  disease_causes: {
+    emoji: "🔍",
+    title: "Disease Causes Flagged for Review",
+    description:
+      "Causes and contributing factors that users have flagged as potentially incorrect.",
+  },
+  disease_treatment: {
+    emoji: "💊",
+    title: "Disease Treatment Steps Flagged for Review",
+    description:
+      "Treatment instructions that users have flagged as potentially incorrect or harmful.",
+  },
+  disease_prevention: {
+    emoji: "🛡️",
+    title: "Disease Prevention Tips Flagged for Review",
+    description: "Prevention tips that users have flagged as potentially incorrect or misleading.",
+  },
+};
+
+/**
+ * Formats a stored timestamp as a short en-US date and time in the local
+ * time zone of the machine generating the report.
+ *
+ * Timestamps without a zone designator ("YYYY-MM-DD HH:MM:SS", the shape
+ * SQLite's datetime('now') produces) are interpreted as UTC. Passing them to
+ * `new Date()` unchanged would read them as local time and shift every value
+ * by the local UTC offset.
+ *
+ * @param iso Timestamp text from the database.
+ * @returns The formatted date, or `iso` unchanged when it cannot be parsed.
+ */
+function formatDate(iso: string): string {
+  const trimmed = iso.trim();
+  // Date + time with no trailing "Z" or numeric offset.
+  const isZoneless = /^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?$/.test(trimmed);
+  const d = new Date(isZoneless ? `${trimmed.replace(" ", "T")}Z` : trimmed);
+  // Show the raw value rather than the literal text "Invalid Date".
+  if (Number.isNaN(d.getTime())) return iso;
+  return d.toLocaleDateString("en-US", {
+    year: "numeric",
+    month: "short",
+    day: "numeric",
+    hour: "2-digit",
+    minute: "2-digit",
+  });
+}
+
+// ─── Main ───────────────────────────────────────────────────────────────────
+
+/**
+ * Builds the flagged-content markdown report and writes it to OUTPUT_PATH.
+ *
+ * Steps: load `flagged_content` rows with at least MIN_FLAGS flags, resolve
+ * the plants and diseases they reference, then render a summary table, one
+ * detail section per content type, and a "how this works" footer. When no row
+ * qualifies, a short "nothing to review" report is written instead.
+ *
+ * The database client is closed on every exit path, including failures.
+ *
+ * @throws Propagates database and file-system errors to the caller.
+ */
+async function main() {
+  console.log(`📋 Generating flagged content report (min flags: ${MIN_FLAGS})...`);
+
+  try {
+    // Fetch flagged content
+    const flaggedRs = await db.execute({
+      sql: "SELECT * FROM flagged_content WHERE flag_count >= ? ORDER BY content_type, flag_count DESC, updated_at DESC",
+      args: [MIN_FLAGS],
+    });
+    const flaggedRows = flaggedRs.rows as unknown as FlaggedRow[];
+
+    if (flaggedRows.length === 0) {
+      const report = [
+        "# 🚩 Flagged Content Review — Nothing to Review",
+        "",
+        `Generated: ${new Date().toISOString()}`,
+        "",
+        "**No content has been flagged for review yet.**",
+        "",
+        "Flagged items will appear here once users flag content for manual review.",
+        "",
+        "---",
+        "",
+        `_Report generated with min-flags=${MIN_FLAGS}_`,
+        "",
+      ].join("\n");
+
+      fs.writeFileSync(OUTPUT_PATH, report, "utf-8");
+      console.log(`✅ Report written to ${OUTPUT_PATH} (no flagged items)`);
+      return;
+    }
+
+    // Collect all unique plant and disease IDs. Anything that is not a plant
+    // image is treated as referring to a disease.
+    const plantIds = new Set<string>();
+    const diseaseIds = new Set<string>();
+    for (const row of flaggedRows) {
+      if (row.content_type === "plant_image") {
+        plantIds.add(row.content_id);
+      } else {
+        diseaseIds.add(row.content_id);
+      }
+    }
+
+    // Loads the given plants into plantMap; a no-op for an empty list so we
+    // never emit an `IN ()` clause.
+    const plantMap = new Map<string, PlantRow>();
+    const loadPlants = async (ids: string[]): Promise<void> => {
+      if (ids.length === 0) return;
+      const plantRs = await db.execute({
+        sql: `SELECT id, common_name, scientific_name, family, image_url FROM plants WHERE id IN (${ids.map(() => "?").join(",")})`,
+        args: ids,
+      });
+      for (const row of plantRs.rows as unknown as PlantRow[]) {
+        plantMap.set(row.id, row);
+      }
+    };
+
+    // Fetch plant names for directly flagged plants
+    await loadPlants([...plantIds]);
+
+    // Fetch disease names + the plants they reference
+    const diseaseMap = new Map<string, DiseaseRow>();
+    if (diseaseIds.size > 0) {
+      const ids = [...diseaseIds];
+      const diseaseRs = await db.execute({
+        sql: `SELECT id, name, scientific_name, plant_id, image_url FROM diseases WHERE id IN (${ids.map(() => "?").join(",")})`,
+        args: ids,
+      });
+      const referencedPlantIds = new Set<string>();
+      for (const row of diseaseRs.rows as unknown as DiseaseRow[]) {
+        diseaseMap.set(row.id, row);
+        // Skip empty plant references instead of querying for them.
+        if (row.plant_id && !plantMap.has(row.plant_id)) {
+          referencedPlantIds.add(row.plant_id);
+        }
+      }
+      await loadPlants([...referencedPlantIds]);
+    }
+
+    // Group by content type
+    const groups = new Map<string, FlaggedRow[]>();
+    for (const row of flaggedRows) {
+      const bucket = groups.get(row.content_type);
+      if (bucket) bucket.push(row);
+      else groups.set(row.content_type, [row]);
+    }
+
+    // ─── Build Report ──────────────────────────────────────────────────────
+
+    const lines: string[] = [];
+    const totalFlags = flaggedRows.reduce((sum, r) => sum + r.flag_count, 0);
+
+    lines.push("# 🚩 Flagged Content — Manual Review Needed");
+    lines.push("");
+    lines.push(`Generated: ${new Date().toISOString()}`);
+    lines.push("");
+    lines.push(
+      flaggedRows.length === 1
+        ? `**${flaggedRows.length} item** flagged for review (${totalFlags} total flags).`
+        : `**${flaggedRows.length} items** flagged for review (${totalFlags} total flags).`,
+    );
+    lines.push("");
+    lines.push("Most data in this knowledge base is not reviewed by humans. ");
+    lines.push("Items listed below have been flagged by users for manual review. ");
+    lines.push("Please review each item and take appropriate action.");
+    lines.push("");
+
+    // Known types come first, in a fixed order. Any other type found in the
+    // data is appended so it still appears in the report: it is already
+    // counted in the totals, and dropping it would make the table not add up.
+    const knownTypes = [
+      "plant_image",
+      "disease_image",
+      "disease_symptoms",
+      "disease_causes",
+      "disease_treatment",
+      "disease_prevention",
+    ];
+    const orderedTypes = [
+      ...knownTypes,
+      ...[...groups.keys()].filter((t) => !knownTypes.includes(t)),
+    ];
+
+    // Summary table
+    lines.push("## 📊 Summary");
+    lines.push("");
+    lines.push("| Content Type | Count | Total Flags |");
+    lines.push("|---|---|---|");
+    for (const type of orderedTypes) {
+      const items = groups.get(type);
+      if (!items) continue;
+      const label = CONTENT_TYPE_LABELS[type]?.title ?? type;
+      const count = items.length;
+      const sumFlags = items.reduce((s, r) => s + r.flag_count, 0);
+      lines.push(`| ${label} | ${count} | ${sumFlags} |`);
+    }
+    lines.push(`| **Total** | **${flaggedRows.length}** | **${totalFlags}** |`);
+    lines.push("");
+    lines.push("---");
+    lines.push("");
+
+    // Detail sections per content type
+    for (const type of orderedTypes) {
+      const items = groups.get(type);
+      if (!items) continue;
+
+      const config = CONTENT_TYPE_LABELS[type];
+      lines.push(`## ${config?.emoji ?? "📋"} ${config?.title ?? type}`);
+      lines.push("");
+      lines.push(config?.description ?? "");
+      lines.push("");
+      lines.push(`**${items.length} item${items.length === 1 ? "" : "s"} flagged**`);
+      lines.push("");
+
+      for (const item of items) {
+        // Build label; falls back to the raw content ID when the referenced
+        // plant/disease no longer exists.
+        let label = item.content_id;
+        let plantLabel = "";
+        const plant = type === "plant_image" ? plantMap.get(item.content_id) : undefined;
+        const disease = type === "plant_image" ? undefined : diseaseMap.get(item.content_id);
+
+        if (plant) {
+          label = `${plant.common_name} (_${plant.scientific_name}_)`;
+          plantLabel = `${plant.family} family`;
+        } else if (disease) {
+          const host = plantMap.get(disease.plant_id);
+          const plantName = host?.common_name ?? disease.plant_id;
+          label = `${disease.name} (_${disease.scientific_name}_) on **${plantName}**`;
+          plantLabel = `Affects: ${plantName}`;
+        }
+
+        const flagWord = item.flag_count === 1 ? "flag" : "flags";
+        const firstFlagged = formatDate(item.created_at);
+        const lastFlagged = formatDate(item.updated_at);
+
+        lines.push(`### ${label}`);
+        lines.push("");
+        // The footer's SQL snippets need these IDs, so print them per item.
+        lines.push(`- **Flag ID:** \`${item.id}\``);
+        lines.push(`- **Content ID:** \`${item.content_id}\``);
+        lines.push(`- **Field:** \`${item.field_name}\``);
+        lines.push(`- **Flags:** ${item.flag_count} ${flagWord}`);
+        lines.push(`- **First flagged:** ${firstFlagged}`);
+        lines.push(`- **Last flagged:** ${lastFlagged}`);
+        if (plantLabel) {
+          lines.push(`- **${plantLabel}**`);
+        }
+        if (item.notes) {
+          // Collapse line breaks so multi-line notes stay inside one list item.
+          lines.push(`- **User notes:** ${item.notes.replace(/\s*\r?\n\s*/g, " ")}`);
+        }
+
+        // Embed the flagged image when there is one to show
+        if (plant?.image_url) {
+          lines.push("");
+          lines.push(`  ![${plant.common_name}](${plant.image_url})`);
+        } else if (type === "disease_image" && disease?.image_url) {
+          lines.push("");
+          lines.push(`  ![${disease.name}](${disease.image_url})`);
+        }
+
+        lines.push("");
+      }
+
+      lines.push("---");
+      lines.push("");
+    }
+
+    // Footer
+    lines.push("## ℹ️ How This Works");
+    lines.push("");
+    lines.push("1. **Users** click the 🚩 Flag button on any content they believe needs review.");
+    lines.push("2. **The system** stores the flag in the database with a counter.");
+    lines.push(
+      "3. **This report** is generated by querying the database and formatting the results.",
+    );
+    lines.push("4. **Reviewers** go through each item and take action (fix, update, or dismiss).");
+    lines.push("");
+    lines.push("### Taking Action");
+    lines.push("");
+    lines.push("After reviewing an item, you can clear its flags by running:");
+    lines.push("");
+    lines.push("```sql");
+    lines.push("DELETE FROM flagged_content WHERE id = '<item-id>';");
+    lines.push("```");
+    lines.push("");
+    lines.push("Or clear all flags for a specific item by running:");
+    lines.push("");
+    lines.push("```sql");
+    lines.push(
+      "UPDATE flagged_content SET flag_count = 0 WHERE content_id = '<id>' AND field_name = '<field>';",
+    );
+    lines.push("```");
+    lines.push("");
+    lines.push("---");
+    lines.push("");
+    lines.push(`_Report generated with min-flags=${MIN_FLAGS}_`);
+
+    // Write report
+    fs.writeFileSync(OUTPUT_PATH, lines.join("\n"), "utf-8");
+    console.log(`✅ Report written to ${OUTPUT_PATH}`);
+    console.log(`   ${flaggedRows.length} items, ${totalFlags} total flags`);
+  } finally {
+    // Runs for the early return, the normal path, and any thrown error.
+    db.close();
+  }
+}
+
+main().catch((err) => {
+  console.error("❌ Failed to generate report:", err);
+  process.exit(1);
+});
--- a/apps/web/scripts/scrape-training-dataset.ts
+++ b/apps/web/scripts/scrape-training-dataset.ts
@@ -2,59 +2,113 @@
 /**
 * scrape-training-dataset.ts
 *
- * Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
+ * Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
 *
- * Targets:
- *   - 200 images per disease class (93 diseases)
- *   - 400 images for the "healthy" class
- *   - Full resolution images stored in data/dataset/{class_id}/
+ * Targets (tiered by plant type):
+ *   - Core plants (houseplants + common garden): 100 images per disease
+ *   - Full set (all 11,498 DB diseases): 10 images per disease
+ *   - Healthy: 400 images
 *
- * DuckDuckGo approach (no API key needed):
- *   1. Fetch the main search page to extract a vqd (query) token
- *   2. Use the vqd token to paginate through image results
- *   3. Download each image to the dataset directory
+ * Sources (all free, no API keys):
+ *   1. DB image_url — existing images already found
+ *   2. DuckDuckGo  — general web image search
+ *   3. iNaturalist — real-world plant observation photos
+ *   4. Wikimedia Commons — curated scientific/educational images
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
- *
- * Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
+ * Progress: data/dataset/.progress.json — interrupt and resume safely.
 */

 import "dotenv/config";
-import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
-import { resolve, extname, join } from "path";
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
+import { resolve, extname } from "path";
+
+// Load .env.development for DB creds
+const envPath = resolve(__dirname, "../.env.development");
+try {
+  const env = readFileSync(envPath, "utf-8");
+  for (const line of env.split("\n")) {
+    const trimmed = line.trim();
+    if (trimmed && !trimmed.startsWith("#")) {
+      const eqIdx = trimmed.indexOf("=");
+      if (eqIdx > 0) {
+        const key = trimmed.slice(0, eqIdx).trim();
+        const val = trimmed.slice(eqIdx + 1).trim();
+        if (!process.env[key]) process.env[key] = val;
+      }
+    }
+  }
+} catch {}
+
+import { getDb, closeDb } from "@/lib/db/index";
+import { diseases } from "@/lib/db/schema";

 // ─── Config ─────────────────────────────────────────────────────────────────

-const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
-const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");
-
 const DATASET_DIR = resolve(__dirname, "../data/dataset");
 const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

-/** Target images per disease class */
-const TARGET_PER_DISEASE = 200;
+/** Target images per disease for CORE plants */
+const TARGET_CORE = 100;

-/** Target images for the "healthy" class (2× normal) */
+/** Target images per disease for the FULL set */
+const TARGET_FULL = 10;
+
+/** Target images for the "healthy" class */
 const TARGET_HEALTHY = 400;

+/** Core plants that get higher image targets */
+const CORE_PLANTS = new Set([
+  // Houseplants
+  "monstera",
+  "pothos",
+  "snake-plant",
+  "peace-lily",
+  "orchid",
+  "succulent",
+  "fiddle-leaf-fig",
+  "aloe-vera",
+  "cactus",
+  "fern",
+  // Garden plants
+  "tomato",
+  "basil",
+  "rose",
+  "pepper",
+  "strawberry",
+  "cucumber",
+  "squash",
+  "lettuce",
+  "spinach",
+  "cabbage",
+  "lavender",
+  "mint",
+  "jasmine",
+  "sunflower",
+  "daisy",
+  "zucchini",
+  "bean",
+  "eggplant",
+  "chili",
+  // General disease patterns
+  "general",
+]);
+
 /** Delay between DuckDuckGo search API calls (ms) */
 const SEARCH_DELAY = 1500;

 /** Delay between image downloads (ms) */
-const DOWNLOAD_DELAY = 300;
+const DOWNLOAD_DELAY = 100;

 /** Max concurrent downloads */
-const CONCURRENT_DOWNLOADS = 5;
+const CONCURRENT_DOWNLOADS = 10;

-/** Minimum image size in bytes to accept (reject tiny placeholders) */
+/** Minimum image size in bytes to accept */
 const MIN_IMAGE_SIZE = 10_000; // 10KB

 /** Maximum image size in bytes */
 const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

-/** Allowed image content types */
-const ALLOWED_CONTENT_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/webp", "image/gif"];
-
 /** Allowed file extensions */
 const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

@@ -62,22 +116,16 @@ const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
 const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

+/** Class ID for healthy plants */
+const HEALTHY_CLASS = "healthy";
+
 // ─── Types ──────────────────────────────────────────────────────────────────

-interface DiseaseSeed {
+interface DbDisease {
  id: string;
  plantId: string;
  name: string;
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  [key: string]: any;
-}
-
-interface PlantSeed {
-  id: string;
-  commonName: string;
-  scientificName: string;
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  [key: string]: any;
+  imageUrl: string | null;
 }

 interface DuckDuckGoImageResult {
@@ -93,10 +141,7 @@ interface ClassProgress {
  count: number;
  downloaded: number;
  failed: number;
-  skipped: number;
-  /** URLs we've already seen (to avoid duplicates) */
  seenUrls: string[];
-  /** Whether we've exhausted search results */
  exhausted: boolean;
 }

@@ -105,15 +150,27 @@ interface Progress {
  classes: Record<string, ClassProgress>;
 }

-/** Class ID for healthy plants */
-const HEALTHY_CLASS = "healthy";
+// ─── DB Loading ──────────────────────────────────────────────────────────────
+
+/**
+ * Reads every disease row (id, plantId, name, imageUrl) through the shared
+ * DB handle, ordered by id.
+ */
+async function loadDiseasesFromDb(): Promise<DbDisease[]> {
+  const columns = {
+    id: diseases.id,
+    plantId: diseases.plantId,
+    name: diseases.name,
+    imageUrl: diseases.imageUrl,
+  };
+  return await getDb().select(columns).from(diseases).orderBy(diseases.id);
+}

 // ─── DuckDuckGo API ─────────────────────────────────────────────────────────

-/**
- * Extract the vqd token from DuckDuckGo's search page.
- * Required for paginating image results.
- */
 async function getVqdToken(query: string): Promise<string> {
  const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

@@ -122,25 +179,15 @@ async function getVqdToken(query: string): Promise<string> {
    signal: AbortSignal.timeout(15_000),
  });

-  if (!res.ok) {
-    throw new Error(`Failed to get vqd token: ${res.status}`);
-  }
+  if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);

  const html = await res.text();
-
-  // Extract vqd token from the HTML
-  // Format: vqd='<token>' or vqd="<token>"
  const match = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/);
-  if (!match) {
-    throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
-  }
+  if (!match) throw new Error(`Could not extract vqd token for "${query}"`);

  return match[1];
 }

-/**
- * Fetch a page of DuckDuckGo image results.
- */
 async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
@@ -161,12 +208,9 @@ async function searchImagesDuckDuckGo(
    if (res.status === 429) {
      console.warn("  ⚠ Rate limited (429). Waiting 10s...");
      await sleep(10_000);
-      return searchImagesDuckDuckGo(query, vqd, page); // Retry
-    }
-    if (res.status === 403) {
-      console.warn("  ⚠ Forbidden (403). Token may have expired.");
-      return []; // Token expired — no more pages
+      return searchImagesDuckDuckGo(query, vqd, page);
    }
+    if (res.status === 403) return [];
    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }

@@ -174,11 +218,7 @@ async function searchImagesDuckDuckGo(
  return data.results ?? [];
 }

-/**
- * Search DuckDuckGo images, automatically paginating to collect up to `target` results.
- * Returns unique image URLs.
- */
-async function collectImages(
+async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
@@ -188,27 +228,29 @@ async function collectImages(
  let exhausted = false;
  let consecutiveEmpty = 0;

-  // Get vqd token
  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
-    console.warn(`  ⚠ Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
+    console.warn(`  ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

-  while (results.length < target) {
+  const MAX_PAGES = 5;
+  let lowNoveltyCount = 0;
+
+  while (results.length < target && page <= MAX_PAGES) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
-      console.warn(`  ⚠ Search error: ${err instanceof Error ? err.message : "unknown"}`);
+      console.warn(`  ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

-    if (pageResults.length === 0) {
+    if (!pageResults || pageResults.length === 0) {
      consecutiveEmpty++;
      if (consecutiveEmpty >= 3) {
        exhausted = true;
@@ -223,78 +265,160 @@ async function collectImages(

    for (const r of pageResults) {
      if (results.length >= target) break;
-
      const imgUrl = r.image || r.url;
-
-      // Skip if we've already seen this URL
+      if (!imgUrl || typeof imgUrl !== "string") continue;
      if (seenUrls.has(imgUrl)) continue;
-
-      // Validate URL looks like an image
-      const ext = extname(new URL(imgUrl).pathname).toLowerCase();
-      if (!ALLOWED_EXTENSIONS.includes(ext) && !ext) {
-        // No extension - still try, could be a CDN URL
+      try {
+        new URL(imgUrl);
+      } catch {
+        continue;
      }
-
      seenUrls.add(imgUrl);
      results.push(imgUrl);
      newCount++;
    }

-    if (newCount === 0 && pageResults.every((r) => seenUrls.has(r.image || r.url))) {
-      // All results on this page were already seen
-      page++;
-      continue;
+    const newRatio = newCount / pageResults.length;
+    if (newRatio < 0.05) {
+      lowNoveltyCount++;
+      if (lowNoveltyCount >= 2) break;
+    } else {
+      lowNoveltyCount = 0;
    }

-    if (results.length < target) {
-      page++;
-    }
+    if (results.length < target) page++;
  }

  return { urls: results.slice(0, target), exhausted };
 }

+// ─── iNaturalist API ─────────────────────────────────────────────────────────
+
+/**
+ * Queries the iNaturalist observations API for research-grade observations
+ * matching `query` and collects photo URLs from the response.
+ *
+ * A single page is requested (per_page = min(target, 200)). Each photo URL is
+ * rewritten to its "original" size variant, skipped when already present in
+ * `seenUrls`, and otherwise recorded in `seenUrls` and returned.
+ *
+ * @param query    Free-text search string.
+ * @param target   Maximum number of URLs to return.
+ * @param seenUrls URLs collected so far; mutated with every URL returned.
+ * @returns `urls` plus `exhausted`, which is true when the page yielded fewer
+ *          than `target` new URLs and false when the request failed.
+ */
+async function searchImagesInaturalist(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+  // Nothing requested: skip the network round trip.
+  if (target <= 0) return { urls: results, exhausted: false };
+
+  const perPage = Math.min(target, 200);
+
+  const apiUrl =
+    `https://api.inaturalist.org/v1/observations` +
+    `?q=${encodeURIComponent(query)}` +
+    `&photos_only=true` +
+    `&quality_grade=research` +
+    `&per_page=${perPage}` +
+    `&order_by=observed_on&order=desc`;
+
+  try {
+    const res = await fetch(apiUrl, {
+      headers: { "User-Agent": UA, Accept: "application/json" },
+      signal: AbortSignal.timeout(15_000),
+    });
+    if (!res.ok) return { urls: [], exhausted: false };
+
+    const data = (await res.json()) as {
+      results: Array<{ photos: Array<{ url: string }> }>;
+    };
+
+    for (const obs of data.results ?? []) {
+      if (results.length >= target) break;
+      for (const photo of obs.photos ?? []) {
+        if (results.length >= target) break;
+        if (!photo.url) continue;
+        // The API returns a sized variant (typically ".../square.jpg"); swap
+        // whichever size token is present for the full-resolution upload.
+        const fullUrl = photo.url.replace(
+          /\/(?:square|thumb|small|medium|large)\./,
+          "/original.",
+        );
+        // Dedupe on the rewritten URL, because that is the form stored in seenUrls.
+        if (seenUrls.has(fullUrl)) continue;
+        seenUrls.add(fullUrl);
+        results.push(fullUrl);
+      }
+    }
+
+    return { urls: results, exhausted: results.length < target };
+  } catch (err) {
+    // Best effort: report the failure and return whatever was collected.
+    console.warn(`  ⚠ iNaturalist error: ${err instanceof Error ? err.message : "unknown"}`);
+    return { urls: results, exhausted: false };
+  }
+}
+
+// ─── Wikimedia Commons API ──────────────────────────────────────────────────
+
+/**
+ * Searches the File namespace of Wikimedia Commons for `query` and returns
+ * download URLs (via Special:FilePath) for matching image files.
+ *
+ * Pages of 50 hits are fetched until `target` URLs are collected, the API
+ * reports no further results, or a request fails. Files whose extension is
+ * not in ALLOWED_EXTENSIONS (PDF, SVG, audio, video, ...) are skipped, since
+ * they are not usable as dataset images.
+ *
+ * @param query    Free-text search string.
+ * @param target   Maximum number of URLs to return.
+ * @param seenUrls URLs collected so far; mutated with every URL returned.
+ * @returns `urls` plus `exhausted`, true when fewer than `target` were found.
+ */
+async function searchImagesCommons(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+  let sroffset = 0;
+
+  while (results.length < target) {
+    const params = new URLSearchParams({
+      action: "query",
+      list: "search",
+      srsearch: query,
+      srnamespace: "6",
+      srlimit: "50",
+      sroffset: String(sroffset),
+      format: "json",
+      origin: "*", // server-side API call
+    });
+
+    const url = `https://commons.wikimedia.org/w/api.php?${params}`;
+
+    try {
+      const res = await fetch(url, {
+        headers: { "User-Agent": UA },
+        signal: AbortSignal.timeout(10_000),
+      });
+      if (!res.ok) break;
+
+      const data = (await res.json()) as {
+        query?: { search?: Array<{ title: string }> };
+        continue?: { sroffset?: number };
+      };
+
+      const hits = data.query?.search ?? [];
+      if (hits.length === 0) break;
+
+      for (const hit of hits) {
+        if (results.length >= target) break;
+        const filename = hit.title.replace(/^File:/, "");
+        // Skip files that are not one of the accepted image formats.
+        if (!ALLOWED_EXTENSIONS.includes(extname(filename).toLowerCase())) continue;
+        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
+        if (seenUrls.has(imgUrl)) continue;
+        seenUrls.add(imgUrl);
+        results.push(imgUrl);
+      }
+
+      // No continuation token means this was the last page; stop here rather
+      // than issuing one more request just to receive an empty result.
+      const nextOffset = data.continue?.sroffset;
+      if (nextOffset === undefined) break;
+      sroffset = nextOffset;
+    } catch (err) {
+      // Best effort: report the failure and return whatever was collected.
+      console.warn(`  ⚠ Commons error: ${err instanceof Error ? err.message : "unknown"}`);
+      break;
+    }
+  }
+
+  return { urls: results, exhausted: results.length < target };
+}
+
 // ─── Image Download ─────────────────────────────────────────────────────────

-/**
- * Download a single image from a URL to the target path.
- * Returns true if successful, false otherwise.
- */
 async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
-      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
+      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
-
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
-    const contentLength = parseInt(res.headers.get("content-length") || "0", 10);
-
-    // Validate content type
-    if (!ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t))) {
-      return false;
-    }
-
-    // Validate size
-    if (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) return false;
-    if (contentLength > MAX_IMAGE_SIZE) return false;
+    if (contentType.includes("text/html")) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
-
-    // Double-check actual buffer size
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

-    // Determine correct extension from content type or URL
    let ext = extname(new URL(url).pathname).toLowerCase();
    if (!ALLOWED_EXTENSIONS.includes(ext)) {
-      // Map from content type
      if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
      else if (contentType.includes("png")) ext = ".png";
      else if (contentType.includes("webp")) ext = ".webp";
-      else ext = ".jpg"; // Default
+      else ext = ".jpg";
    }

    const filePath = destPath.replace(/\.\w+$/, ext);
@@ -305,9 +429,6 @@ async function downloadImage(url: string, destPath: string): Promise<boolean> {
  }
 }

-/**
- * Download multiple images concurrently, respecting a per-download delay.
- */
 async function downloadBatch(
  urls: string[],
  classDir: string,
@@ -317,7 +438,6 @@ async function downloadBatch(
  let failed = 0;
  let index = startIndex;

-  // Process in chunks to control concurrency
  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

@@ -325,16 +445,23 @@ async function downloadBatch(
      chunk.map(async (url) => {
-        const paddedIndex = String(index).padStart(4, "0");
-        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
-
+        const slot = index++; // claimed before the first await so concurrent downloads in a chunk never share a file name
+        const destPath = resolve(classDir, `img_${String(slot).padStart(4, "0")}.jpg`);
        const success = await downloadImage(url, destPath);
        await sleep(DOWNLOAD_DELAY);
-        return { success, index: index++ };
+        return { success, index: slot, url: url.substring(0, 50) }; // url is truncated; it is only used in the failure log
      }),
    );

    for (const r of results) {
      if (r.success) downloaded++;
-      else failed++;
+      else {
+        failed++;
+        if (failed % 20 === 1) console.log(`    ⚠ Failed: ${r.url}...`);
+      }
+    }
+
+    const total = downloaded + failed;
+    if (total % 30 === 0 || total === urls.length) {
+      console.log(`    Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }

@@ -361,7 +488,6 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
      count: 0,
      downloaded: 0,
      failed: 0,
-      skipped: 0,
      seenUrls: [],
      exhausted: false,
    };
@@ -369,26 +495,22 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
  return progress.classes[classId];
 }

-// ─── Search Query Building ──────────────────────────────────────────────────
+// ─── Query Building ─────────────────────────────────────────────────────────

-function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
-  const name = disease.name;
-  const plantName = plant?.commonName || disease.plantId;
-
-  return [
-    `${name} ${plantName} leaf disease`,
-    `${plantName} ${name} symptoms`,
-    `${name} plant disease`,
-    `${plantName} diseased leaf`,
-  ];
+function buildSearchQueries(disease: DbDisease): string[] {
+  const name = disease.name || disease.id.replace(/-/g, " "); // empty or missing name: fall back to the id with hyphens as spaces
+  const plant = disease.plantId.replace(/-/g, " "); // plant id with hyphens turned into spaces
+  // Returns three search queries; each one contains both the disease name and the plant, to avoid noisy labels
+  return [`${name} ${plant} leaf disease`, `${plant} ${name} symptoms`, `${name} ${plant}`];
 }

-function buildHealthyQueries(plant: PlantSeed): string[] {
+function buildHealthyQueries(plant: string): string[] {
+  const name = plant.replace(/-/g, " "); // main() passes plantId values here; hyphens become spaces for the search text
  return [
-    `healthy ${plant.commonName} leaf`,
-    `${plant.commonName} leaf closeup`,
-    `healthy ${plant.commonName} plant`,
-    `${plant.commonName} foliage`,
+    `healthy ${name} leaf`,
+    `${name} leaf closeup`,
+    `healthy ${name} plant`,
+    `${name} foliage`,
  ];
 }

@@ -400,64 +522,97 @@ async function collectClassImages(
  target: number,
  progress: Progress,
  classDir: string,
+  existingUrls: string[] = [],
+  fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
 ): Promise<void> {
  const cp = getClassProgress(progress, classId);
  const seenUrls = new Set(cp.seenUrls);

  if (cp.count >= target) {
-    console.log(`  ✓ Already have ${cp.count}/${target} images`);
+    console.log(`  ✓ Already have ${cp.count}/${target}`);
    return;
  }

  if (cp.exhausted) {
-    console.log(`  ✓ Already exhausted search results (${cp.count}/${target} images)`);
+    console.log(`  ✓ Exhausted (${cp.count}/${target})`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

-  const totalUrls: string[] = [];
+  const allUrls: string[] = [];
  let exhausted = false;

-  // Search with each query until we hit the target
-  for (const query of queries) {
-    if (totalUrls.length >= target) break;
-
-    console.log(`  Searching: "${query}"...`);
-    const result = await collectImages(query, target - totalUrls.length, seenUrls);
-
-    totalUrls.push(...result.urls);
-    cp.seenUrls = Array.from(seenUrls);
-
-    if (result.exhausted) {
-      exhausted = true;
+  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
+  const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
+  if (freshDbUrls.length > 0) {
+    console.log(`  DB: ${freshDbUrls.length} existing URLs`);
+    for (const url of freshDbUrls) {
+      if (allUrls.length >= target) break;
+      seenUrls.add(url);
+      allUrls.push(url);
    }
-
-    if (totalUrls.length >= target) break;
  }

-  if (totalUrls.length === 0) {
+  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
+  // Skip DDG in fast mode (full set — DDG is slowest source)
+  if (!fastMode && allUrls.length < target) {
+    for (const query of queries) {
+      if (allUrls.length >= target) break;
+      process.stdout.write(`  DDG: "${query.substring(0, 40)}"... `);
+      const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
+      allUrls.push(...result.urls);
+      if (result.exhausted) exhausted = true;
+      console.log(`${result.urls.length} new`);
+      if (allUrls.length >= target) break;
+    }
+  }
+
+  // ── Source 2: iNaturalist ──────────────────────────────────────────────
+  if (allUrls.length < target) {
+    const primaryQuery = queries[0];
+    console.log(`  iNat: Searching...`);
+    const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    if (result.exhausted) exhausted = true;
+    console.log(`  iNat: ${result.urls.length} images`);
+  }
+
+  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
+  if (allUrls.length < target) {
+    const primaryQuery = queries[0];
+    console.log(`  Commons: Searching...`);
+    const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    if (result.exhausted) exhausted = true;
+    console.log(`  Commons: ${result.urls.length} images`);
+  }
+
+  if (allUrls.length === 0) {
    cp.exhausted = exhausted;
    saveProgress(progress);
-    console.log(`  ✗ No images found for "${classId}"`);
+    console.log(`  ✗ No images found`);
    return;
  }

-  console.log(`  Found ${totalUrls.length} unique image URLs. Downloading...`);
+  // Save progress with seen URLs BEFORE downloading
+  cp.seenUrls = Array.from(seenUrls);
+  cp.exhausted = exhausted;
+  saveProgress(progress);

-  // Download the images
-  const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);
+  console.log(`  Downloading ${allUrls.length} images...`);
+
+  const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);

  cp.count += downloaded;
  cp.downloaded += downloaded;
  cp.failed += failed;
-  cp.exhausted = exhausted;

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
-    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
+    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
 }

@@ -465,25 +620,18 @@ async function collectClassImages(

 async function main() {
  console.log("=".repeat(60));
-  console.log("PLANT DISEASE DATASET COLLECTOR");
+  console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
  console.log("=".repeat(60));

-  // Load knowledge base
-  const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
-  const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];
+  // Load diseases from DB
+  console.log("\nLoading diseases from database...");
+  const dbDiseases = await loadDiseasesFromDb();
+  console.log(`  ${dbDiseases.length} diseases loaded`);

-  const plantMap = new Map<string, PlantSeed>();
-  for (const p of plants) {
-    plantMap.set(p.id, p);
-  }
-
-  console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
-  console.log(
-    `Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
-  );
-  console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
-  console.log(`Output: ${DATASET_DIR}/`);
-  console.log("");
+  const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
+  const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
+  console.log(`  Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
+  console.log(`  Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);

  // Load progress
  mkdirSync(DATASET_DIR, { recursive: true });
@@ -491,28 +639,46 @@ async function main() {

  const startTime = Date.now();

-  // ── Phase 1: Disease classes ──────────────────────────────────────────────
-
-  console.log("─".repeat(60));
-  console.log("PHASE 1: Disease Images");
-  console.log("─".repeat(60));
-
-  for (let i = 0; i < diseases.length; i++) {
-    const disease = diseases[i];
-    const plant = plantMap.get(disease.plantId) ?? null;
-    const classDir = resolve(DATASET_DIR, disease.id);
-    const queries = buildSearchQueries(disease, plant);
-
-    const pct = Math.round((i / diseases.length) * 100);
-    console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);
-
-    await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
-  }
-
-  // ── Phase 2: Healthy class ────────────────────────────────────────────────
+  // ── Phase 1: Core set ──────────────────────────────────────────────────

  console.log("\n" + "─".repeat(60));
-  console.log("PHASE 2: Healthy Plant Images");
+  console.log(`PHASE 1: Core Diseases (${TARGET_CORE} images each)`); // banner uses the same target passed to collectClassImages
+  console.log("─".repeat(60));
+
+  for (let i = 0; i < coreDiseases.length; i++) {
+    const d = coreDiseases[i];
+    const classDir = resolve(DATASET_DIR, d.id);
+    const queries = buildSearchQueries(d);
+    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
+
+    const pct = Math.round((i / coreDiseases.length) * 100);
+    console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
+
+    await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
+  }
+
+  // ── Phase 2: Full set ──────────────────────────────────────────────────
+
+  console.log("\n" + "─".repeat(60));
+  console.log(`PHASE 2: Full Disease Set (${TARGET_FULL} images each)`); // banner uses the same target passed to collectClassImages
+  console.log("─".repeat(60));
+
+  for (let i = 0; i < fullDiseases.length; i++) {
+    const d = fullDiseases[i];
+    const classDir = resolve(DATASET_DIR, d.id);
+    const queries = buildSearchQueries(d);
+    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
+
+    const pct = Math.round((i / fullDiseases.length) * 100);
+    console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
+
+    await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
+  }
+
+  // ── Phase 3: Healthy class ──────────────────────────────────────────────
+
+  console.log("\n" + "─".repeat(60));
+  console.log("PHASE 3: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
@@ -520,39 +686,50 @@ async function main() {
  const healthySeen = new Set(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
-    console.log(`\n  ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY} healthy images`);
+    console.log(`\n  ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
  } else {
-    // Build a pool of healthy plant queries
+    // Collect all unique plants
+    const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
-    for (const plant of plants) {
+    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

+    const healthySources = [
+      { name: "DDG", collector: collectImagesDuckDuckGo },
+      { name: "iNat", collector: searchImagesInaturalist },
+      { name: "Commons", collector: searchImagesCommons },
+    ] as const;
+
    const totalHealthyUrls: string[] = [];
-    let healthyExhausted = false;
+    let anyRemaining = false;

-    for (const query of allHealthyQueries) {
+    for (const source of healthySources) {
      if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
-      if (healthyExhausted) break;
+      console.log(`\n  Source: ${source.name}`);

-      console.log(`\n  Searching: "${query}"...`);
-      const result = await collectImages(
-        query,
-        TARGET_HEALTHY - totalHealthyUrls.length,
-        healthySeen,
-      );
+      for (const query of allHealthyQueries.slice(0, 20)) {
+        if (totalHealthyUrls.length >= TARGET_HEALTHY) break;

-      totalHealthyUrls.push(...result.urls);
-
-      if (result.exhausted) {
-        healthyExhausted = true;
+        process.stdout.write(`    "${query}"... `);
+        const result = await source.collector(
+          query,
+          TARGET_HEALTHY - totalHealthyUrls.length,
+          healthySeen,
+        );
+        totalHealthyUrls.push(...result.urls);
+        if (!result.exhausted) anyRemaining = true;
+        console.log(`${result.urls.length} new`);
      }
    }

    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
-      console.log(`\n  Found ${totalHealthyUrls.length} healthy image URLs. Downloading...`);
+      healthyCp.exhausted = !anyRemaining;
+      saveProgress(progress);
+
+      console.log(`\n  Downloading ${totalHealthyUrls.length} healthy images...`);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
@@ -562,14 +739,12 @@ async function main() {
      healthyCp.count += downloaded;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;
-      healthyCp.exhausted = healthyExhausted;

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
-        `  Got ${downloaded} images (${failed} failed). Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
+        `  Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
-      healthyCp.exhausted = true;
      console.log(`  ✗ No healthy images found`);
    }

@@ -580,76 +755,27 @@ async function main() {

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
-  const secs = elapsed % 60;
+  const hrs = Math.floor(mins / 60);

  let totalDownloaded = 0;
  let totalFailed = 0;
-  let totalTarget = 0;
-
-  for (const [classId, cp] of Object.entries(progress.classes)) {
+  for (const [, cp] of Object.entries(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
-    totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
  }

-  const totalSize = await getDatasetSize();
-  const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);
-
  console.log("\n" + "=".repeat(60));
  console.log("COMPLETE");
  console.log("=".repeat(60));
-  console.log(`  Time:       ${mins}m ${secs}s`);
+  console.log(`  Time:       ${hrs}h ${mins % 60}m`);
  console.log(`  Downloaded: ${totalDownloaded} images`);
  console.log(`  Failed:     ${totalFailed} images`);
-  console.log(`  Target:     ${totalTarget} images`);
-  console.log(`  Dataset size: ${sizeGb} GB`);
-  console.log(`  Dataset location: ${DATASET_DIR}/`);
-  console.log("");
-  console.log("Next steps:");
-  console.log("  1. Run the fine-tuning script to train on this dataset");
-  console.log("  2. The fine-tuning script will resize to 160×160 and augment");
+  console.log(`  Dataset:    ${DATASET_DIR}/`);
+
+  await closeDb();
  console.log("=".repeat(60));
 }

-/**
- * Calculate total size of the dataset directory.
- */
-async function getDatasetSize(): Promise<number> {
-  let total = 0;
-  if (!existsSync(DATASET_DIR)) return 0;
-
-  const entries = readdirSync(DATASET_DIR, { withFileTypes: true });
-
-  for (const entry of entries) {
-    if (!entry.name.startsWith(".")) {
-      const fullPath = resolve(DATASET_DIR, entry.name);
-      if (entry.isDirectory()) {
-        total += dirSize(fullPath);
-      }
-    }
-  }
-
-  return total;
-}
-
-function dirSize(dirPath: string): number {
-  let total = 0;
-  try {
-    const entries = readdirSync(dirPath, { withFileTypes: true });
-    for (const entry of entries) {
-      const fullPath = join(dirPath, entry.name);
-      if (entry.isFile()) {
-        total += statSync(fullPath).size;
-      } else if (entry.isDirectory()) {
-        total += dirSize(fullPath);
-      }
-    }
-  } catch {
-    // skip errors
-  }
-  return total;
-}
-
 function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
 }