prevalence data added

2026-06-07 12:06:41 -04:00
parent cc7b2a593a
commit 876c26968b
11 changed files with 2305 additions and 148 deletions
--- a/apps/web/drizzle/0005_add-prevalence-score.sql
+++ b/apps/web/drizzle/0005_add-prevalence-score.sql
@@ -0,0 +1 @@
+ALTER TABLE `diseases` ADD COLUMN `prevalence_score` integer DEFAULT 0 NOT NULL;
--- a/apps/web/drizzle/meta/_journal.json
+++ b/apps/web/drizzle/meta/_journal.json
@@ -36,6 +36,13 @@
      "when": 1751846400000,
      "tag": "0004_add-flagged-content",
      "breakpoints": true
+    },
+    {
+      "idx": 5,
+      "version": "6",
+      "when": 1751846400000,
+      "tag": "0005_add-prevalence-score",
+      "breakpoints": true
    }
  ]
 }
--- a/apps/web/scripts/.ddg-progress.json
+++ b/apps/web/scripts/.ddg-progress.json
--- a/apps/web/scripts/fill-ddg-images.ts
+++ b/apps/web/scripts/fill-ddg-images.ts
@@ -51,7 +51,7 @@ interface DiseaseRow {

 // ─── Config ──────────────────────────────────────────────────────────────────

-const POLITE_DELAY = 1100; // ms between calls
+const POLITE_DELAY = 800; // ms between calls
 const DB_FLUSH_BATCH = 50;
 const STATE_FILE = resolve(__dirname, ".ddg-progress.json");

@@ -163,6 +163,8 @@ async function main() {
    const query1 = `${d.name} on ${plantName} plant disease`;
    const query2 = `${d.scientificName || d.name} on ${plantName} disease`;
    const query3 = `${d.name} plant disease ${plantName}`;
+    const query4 = `${d.name} plant`;
+    const query5 = `${d.name} symptom`;

    process.stdout.write(
      `  [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
@@ -170,7 +172,7 @@ async function main() {

    // Try queries in order until we get a result
    let url: string | null = null;
-    for (const q of [query1, query2, query3]) {
+    for (const q of [query1, query2, query3, query4, query5]) {
      url = await searchImage(q);
      if (url) break;
    }
--- a/apps/web/scripts/fill-training-dataset.ts
+++ b/apps/web/scripts/fill-training-dataset.ts
@@ -0,0 +1,768 @@
+#!/usr/bin/env node
+/**
+ * fill-training-dataset.ts
+ *
+ * Scans the existing dataset directory and downloads any missing images
+ * to reach the target counts (200 per disease, 400 for healthy).
+ *
+ * Does NOT re-run prevalence queries — just fills gaps from image sources.
+ * Each run scans the directory, reports deficits, then fills them.
+ * Interrupt-safe: re-run to pick up where you left off.
+ *
+ * Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts
+ */
+
+import "dotenv/config";
+import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs";
+import { resolve, extname } from "path";
+
+// Load .env.development for DB creds.
+// Best-effort: any error while reading the file is swallowed by the empty catch
+// below, leaving process.env as it was.
+const envPath = resolve(__dirname, "../.env.development");
+try {
+  const env = readFileSync(envPath, "utf-8");
+  for (const line of env.split("\n")) {
+    const trimmed = line.trim();
+    // Skip blank lines and `#` comment lines
+    if (trimmed && !trimmed.startsWith("#")) {
+      const eqIdx = trimmed.indexOf("=");
+      // Require a non-empty key before the first `=`
+      if (eqIdx > 0) {
+        const key = trimmed.slice(0, eqIdx).trim();
+        // Value is everything after the first `=`, trimmed; quotes are not stripped
+        const val = trimmed.slice(eqIdx + 1).trim();
+        // Variables that already have a (truthy) value are left untouched
+        if (!process.env[key]) process.env[key] = val;
+      }
+    }
+  }
+} catch {}
+
+import { getDb, closeDb } from "@/lib/db/index";
+import { diseases } from "@/lib/db/schema";
+import { sql } from "drizzle-orm";
+
+// ─── Config ─────────────────────────────────────────────────────────────────
+
+/** Root of the dataset; each class (disease id or "healthy") is a sub-directory */
+const DATASET_DIR = resolve(__dirname, "../data/dataset");
+/** JSON file mapping a class id to the search-result URLs already collected for it */
+const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json");
+
+/** Target images per disease */
+const TARGET_PER_DISEASE = 200;
+
+/** Target images for the "healthy" class */
+const TARGET_HEALTHY = 400;
+
+/** Delay before each DuckDuckGo result-page request (ms) */
+const SEARCH_DELAY = 1500;
+
+/** Max concurrent image downloads per disease (chunk size used by downloadBatch) */
+const CONCURRENT_DOWNLOADS = 30;
+
+/** Number of diseases to process in parallel */
+const DISEASE_CONCURRENCY = 5;
+
+/** Minimum image size in bytes to accept; smaller responses are discarded */
+const MIN_IMAGE_SIZE = 10_000; // 10KB
+
+/** Maximum image size in bytes; larger responses are discarded */
+const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB
+
+/** File extensions kept as-is from the URL path; anything else is derived from the content type */
+const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
+
+/** User agent sent with every outgoing request */
+const UA =
+  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
+
+/** Healthy class directory name */
+const HEALTHY_CLASS = "healthy";
+
+// ─── Types ──────────────────────────────────────────────────────────────────
+
+/**
+ * One entry of the `results` array in a DuckDuckGo `i.js` response, as this
+ * script types it. Only `image` and `url` are read here.
+ */
+interface DuckDuckGoImageResult {
+  /** Preferred image URL */
+  image: string;
+  title: string;
+  /** Fallback used when `image` is empty */
+  url: string;
+  thumbnail: string;
+  height: number;
+  width: number;
+}
+
+/** A disease whose class directory holds fewer images than the target. */
+interface DiseaseInfo {
+  /** Disease id; also the name of its class directory */
+  id: string;
+  name: string;
+  plantId: string;
+  /** Images currently on disk */
+  have: number;
+  /** Images still missing to reach the per-disease target */
+  needed: number;
+}
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+
+/** Promise that settles once `ms` milliseconds have elapsed. */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((done) => {
+    setTimeout(done, ms);
+  });
+}
+
+/**
+ * Number of entries in `dir` whose name begins with `img_`.
+ * Yields 0 when the directory is absent or cannot be listed.
+ */
+function countImagesInDir(dir: string): number {
+  if (!existsSync(dir)) return 0;
+
+  let total = 0;
+  try {
+    for (const entry of readdirSync(dir)) {
+      if (entry.startsWith("img_")) total += 1;
+    }
+  } catch {
+    return 0;
+  }
+  return total;
+}
+
+/** Render a byte count as a short label in B, KB or MB (one decimal for KB/MB). */
+function formatBytes(bytes: number): string {
+  const KIB = 1024;
+  const MIB = KIB * KIB;
+
+  if (bytes < KIB) return `${bytes} B`;
+
+  const belowMib = bytes < MIB;
+  const divisor = belowMib ? KIB : MIB;
+  const unit = belowMib ? "KB" : "MB";
+  return `${(bytes / divisor).toFixed(1)} ${unit}`;
+}
+
+// ─── Seen-URLs Cache ──────────────────────────────────────────────────────
+
+/**
+ * Read the seen-URLs cache (class id → list of URLs) from SEEN_CACHE_FILE.
+ * Keeping it across runs avoids collecting the same URLs again.
+ * A missing or unparsable file yields an empty cache.
+ */
+function loadSeenUrlsCache(): Record<string, string[]> {
+  if (!existsSync(SEEN_CACHE_FILE)) return {};
+
+  try {
+    const raw = readFileSync(SEEN_CACHE_FILE, "utf-8");
+    return JSON.parse(raw);
+  } catch {
+    return {};
+  }
+}
+
+/**
+ * Write the whole seen-URLs cache to SEEN_CACHE_FILE as 2-space indented JSON,
+ * replacing the previous contents.
+ */
+function saveSeenUrlsCache(cache: Record<string, string[]>): void {
+  const serialized = JSON.stringify(cache, null, 2);
+  writeFileSync(SEEN_CACHE_FILE, serialized);
+}
+
+// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
+
+/**
+ * Fetch the DuckDuckGo image-search HTML page for `query` and extract the
+ * `vqd` token embedded in it.
+ *
+ * @throws Error when the page request is not OK or no token is found in the HTML.
+ */
+async function getVqdToken(query: string): Promise<string> {
+  const pageUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
+
+  const response = await fetch(pageUrl, {
+    headers: { "User-Agent": UA, Accept: "text/html" },
+    signal: AbortSignal.timeout(15_000),
+  });
+  if (!response.ok) throw new Error(`Failed to get vqd token: ${response.status}`);
+
+  const body = await response.text();
+  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
+  if (found === null) throw new Error(`Could not extract vqd token for "${query}"`);
+
+  return found[1];
+}
+
+/**
+ * Request one page of DuckDuckGo image results (`i.js`) for `query`.
+ *
+ * @param query  Search text.
+ * @param vqd    Token obtained from the DuckDuckGo HTML page for the same query.
+ * @param page   Value sent as the `p` query parameter.
+ * @param maxRateLimitRetries  How many times a 429 response is retried (after a
+ *   10s pause each) before giving up. Bounded so a persistent rate limit cannot
+ *   keep the script waiting forever.
+ * @returns The `results` array of the response, or `[]` on HTTP 403 or when the
+ *   field is absent.
+ * @throws Error on any other non-OK status, including 429 once retries are used up.
+ */
+async function searchImagesDuckDuckGo(
+  query: string,
+  vqd: string,
+  page: number,
+  maxRateLimitRetries = 5,
+): Promise<DuckDuckGoImageResult[]> {
+  const encodedQuery = encodeURIComponent(query);
+  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
+
+  for (let attempt = 0; ; attempt++) {
+    const res = await fetch(url, {
+      headers: {
+        "User-Agent": UA,
+        Accept: "application/json",
+        Referer: `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`,
+      },
+      // A fresh timeout per attempt; a signal cannot be reused once it has fired.
+      signal: AbortSignal.timeout(15_000),
+    });
+
+    if (res.ok) {
+      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
+      return data.results ?? [];
+    }
+
+    if (res.status === 403) return [];
+
+    if (res.status === 429 && attempt < maxRateLimitRetries) {
+      console.warn("    ⚠ DDG rate limited (429). Waiting 10s...");
+      await sleep(10_000);
+      continue;
+    }
+
+    throw new Error(`DuckDuckGo search failed: ${res.status}`);
+  }
+}
+
+/**
+ * Collect up to `target` not-yet-seen image URLs for `query` from DuckDuckGo.
+ *
+ * @param query     Search text.
+ * @param target    Maximum number of URLs to return.
+ * @param seenUrls  URLs to skip; every URL returned is also added to this set
+ *                  (the caller's set is mutated).
+ * @returns `urls` (at most `target`) and `exhausted`, which is true when the
+ *   token could not be obtained or three result pages in a row were empty.
+ */
+async function collectImagesDuckDuckGo(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+  let page = 1;
+  let exhausted = false;
+  let consecutiveEmpty = 0;
+
+  // Without a token no result page can be requested, so report the source as exhausted.
+  let vqd: string;
+  try {
+    vqd = await getVqdToken(query);
+  } catch (err) {
+    console.warn(`    ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
+    return { urls: [], exhausted: true };
+  }
+
+  const MAX_PAGES = 5;
+  // Consecutive pages on which fewer than 5% of the results were new
+  let lowNoveltyCount = 0;
+
+  while (results.length < target && page <= MAX_PAGES) {
+    // Pause before every page request
+    await sleep(SEARCH_DELAY);
+
+    let pageResults: DuckDuckGoImageResult[];
+    try {
+      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
+    } catch (err) {
+      // Any search error ends paging for this query; URLs gathered so far are kept.
+      console.warn(`    ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
+      break;
+    }
+
+    if (!pageResults || pageResults.length === 0) {
+      consecutiveEmpty++;
+      if (consecutiveEmpty >= 3) {
+        exhausted = true;
+        break;
+      }
+      page++;
+      continue;
+    }
+
+    consecutiveEmpty = 0;
+    let newCount = 0;
+
+    for (const r of pageResults) {
+      if (results.length >= target) break;
+      const imgUrl = r.image || r.url;
+      if (!imgUrl || typeof imgUrl !== "string") continue;
+      if (seenUrls.has(imgUrl)) continue;
+      // Drop anything that does not parse as a URL
+      try {
+        new URL(imgUrl);
+      } catch {
+        continue;
+      }
+      seenUrls.add(imgUrl);
+      results.push(imgUrl);
+      newCount++;
+    }
+
+    // Stop after two pages in a row that contributed almost nothing new.
+    const newRatio = newCount / pageResults.length;
+    if (newRatio < 0.05) {
+      lowNoveltyCount++;
+      if (lowNoveltyCount >= 2) break;
+    } else {
+      lowNoveltyCount = 0;
+    }
+
+    if (results.length < target) page++;
+  }
+
+  return { urls: results.slice(0, target), exhausted };
+}
+
+// ─── iNaturalist API ───────────────────────────────────────────────────────
+
+/**
+ * Collect up to `target` photo URLs from research-grade iNaturalist
+ * observations matching `query` (single request, at most 200 observations).
+ *
+ * Each photo URL is rewritten to its `original` size variant, and that
+ * rewritten URL is the key checked against and stored in `seenUrls`. Checking
+ * the raw URL while storing the rewritten one would never detect a repeat.
+ *
+ * @param query     Search text.
+ * @param target    Maximum number of URLs to return.
+ * @param seenUrls  URLs to skip; every URL returned is added to it (mutated).
+ * @returns `urls` and `exhausted` (true when a successful request produced
+ *   fewer than `target` URLs; false on request failure).
+ */
+async function searchImagesInaturalist(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+  // Nothing requested: skip the network round-trip.
+  if (target <= 0) return { urls: results, exhausted: false };
+
+  const perPage = Math.min(target, 200);
+
+  const apiUrl =
+    `https://api.inaturalist.org/v1/observations` +
+    `?q=${encodeURIComponent(query)}` +
+    `&photos_only=true` +
+    `&quality_grade=research` +
+    `&per_page=${perPage}` +
+    `&order_by=observed_on&order=desc`;
+
+  try {
+    const res = await fetch(apiUrl, {
+      headers: { "User-Agent": UA, Accept: "application/json" },
+      signal: AbortSignal.timeout(15_000),
+    });
+    if (!res.ok) return { urls: [], exhausted: false };
+
+    const data = (await res.json()) as {
+      results?: Array<{ photos?: Array<{ url?: string }> }>;
+    };
+
+    for (const obs of data.results ?? []) {
+      if (results.length >= target) break;
+      for (const photo of obs.photos ?? []) {
+        if (results.length >= target) break;
+        if (!photo.url) continue;
+        // Swap whichever size variant the API returned (the original code only
+        // handled "medium") for the full-size file.
+        const fullUrl = photo.url.replace(/\/(?:square|small|medium|large)\./, "/original.");
+        if (seenUrls.has(fullUrl)) continue;
+        seenUrls.add(fullUrl);
+        results.push(fullUrl);
+      }
+    }
+
+    return { urls: results, exhausted: results.length < target };
+  } catch {
+    // Network/parse failure: keep whatever was gathered and let the caller retry later.
+    return { urls: results, exhausted: false };
+  }
+}
+
+// ─── Wikimedia Commons API ─────────────────────────────────────────────────
+
+/**
+ * Collect up to `target` file URLs from Wikimedia Commons search results for
+ * `query`, paging 50 hits at a time.
+ *
+ * @param query     Search text.
+ * @param target    Maximum number of URLs to return.
+ * @param seenUrls  URLs to skip; every URL returned is added to it (mutated).
+ * @returns `urls` and `exhausted` (true when fewer than `target` were found).
+ */
+async function searchImagesCommons(
+  query: string,
+  target: number,
+  seenUrls: Set<string>,
+): Promise<{ urls: string[]; exhausted: boolean }> {
+  const results: string[] = [];
+  let sroffset = 0;
+
+  while (results.length < target) {
+    const params = new URLSearchParams({
+      action: "query",
+      list: "search",
+      srsearch: query,
+      // Namespace 6 — presumably the File: namespace; confirm against the MediaWiki docs
+      srnamespace: "6",
+      srlimit: "50",
+      sroffset: String(sroffset),
+      format: "json",
+    });
+
+    const url = `https://commons.wikimedia.org/w/api.php?${params}`;
+
+    try {
+      const res = await fetch(url, {
+        headers: { "User-Agent": UA },
+        signal: AbortSignal.timeout(10_000),
+      });
+      // A non-OK response ends the search; URLs gathered so far are returned.
+      if (!res.ok) break;
+
+      const data = (await res.json()) as {
+        query?: { search?: Array<{ title: string }> };
+        continue?: { sroffset?: number };
+      };
+
+      const hits = data.query?.search ?? [];
+      if (hits.length === 0) break;
+
+      for (const hit of hits) {
+        if (results.length >= target) break;
+        // Build a Special:FilePath URL from the page title minus its "File:" prefix
+        const filename = hit.title.replace(/^File:/, "");
+        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
+          filename,
+        )}`;
+        if (seenUrls.has(imgUrl)) continue;
+        seenUrls.add(imgUrl);
+        results.push(imgUrl);
+      }
+
+      // Use the API's continuation offset when given, otherwise advance by the page size received.
+      sroffset = data.continue?.sroffset ?? sroffset + hits.length;
+    } catch {
+      // Network/parse failure ends the search without throwing.
+      break;
+    }
+  }
+
+  return { urls: results, exhausted: results.length < target };
+}
+
+// ─── Image Download ─────────────────────────────────────────────────────────
+
+/**
+ * Fetch `url` and store the response body next to `destPath`, with the
+ * extension of `destPath` swapped for one derived from the URL path or, failing
+ * that, from the response content type (defaulting to `.jpg`).
+ *
+ * @returns true when a file was written; false for a non-OK response, an HTML
+ *   response, a body outside the MIN/MAX size bounds, or any thrown error.
+ */
+async function downloadImage(url: string, destPath: string): Promise<boolean> {
+  try {
+    const response = await fetch(url, {
+      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
+      signal: AbortSignal.timeout(15_000),
+    });
+    if (!response.ok) return false;
+
+    const mime = response.headers.get("content-type") || "";
+    if (mime.includes("text/html")) return false;
+
+    const bytes = Buffer.from(await response.arrayBuffer());
+    const sizeAcceptable = bytes.length >= MIN_IMAGE_SIZE && bytes.length <= MAX_IMAGE_SIZE;
+    if (!sizeAcceptable) return false;
+
+    // Prefer the extension in the URL path; otherwise fall back to the content type.
+    let suffix = extname(new URL(url).pathname).toLowerCase();
+    if (!ALLOWED_EXTENSIONS.includes(suffix)) {
+      const looksJpeg = mime.includes("jpeg") || mime.includes("jpg");
+      if (!looksJpeg && mime.includes("png")) suffix = ".png";
+      else if (!looksJpeg && mime.includes("webp")) suffix = ".webp";
+      else suffix = ".jpg";
+    }
+
+    const target = destPath.replace(/\.\w+$/, suffix);
+    writeFileSync(target, bytes);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Download `urls` into `classDir` in chunks of CONCURRENT_DOWNLOADS, naming the
+ * files `img_NNNN.<ext>` with consecutive indices starting at `startIndex`.
+ *
+ * Every URL consumes one index whether or not its download succeeds, so failed
+ * downloads leave gaps in the numbering.
+ *
+ * @param urls        Image URLs to fetch.
+ * @param classDir    Destination directory (must already exist).
+ * @param startIndex  Index used for the first URL.
+ * @returns Counts of successful and failed downloads, plus `lastIndex`, the
+ *   first index not used by this batch.
+ */
+async function downloadBatch(
+  urls: string[],
+  classDir: string,
+  startIndex: number,
+): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
+  let downloaded = 0;
+  let failed = 0;
+  let index = startIndex;
+
+  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
+    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
+
+    const outcomes = await Promise.all(
+      chunk.map((url) => {
+        // Reserve the index synchronously, before any await. Incrementing only
+        // after the download finished gave every download in the chunk the same
+        // file name, so they overwrote each other.
+        const slot = index++;
+        const destPath = resolve(classDir, `img_${String(slot).padStart(4, "0")}.jpg`);
+        return downloadImage(url, destPath);
+      }),
+    );
+
+    for (const ok of outcomes) {
+      if (ok) downloaded++;
+      else failed++;
+    }
+
+    // One progress update per completed chunk
+    process.stdout.write(`\r    Progress: ${downloaded}/${urls.length} (${failed} failed)`);
+  }
+  console.log();
+
+  return { downloaded, failed, lastIndex: index };
+}
+
+// ─── Query Building ─────────────────────────────────────────────────────────
+
+/** Search phrases for a disease on a plant, in the order they should be tried. */
+function buildSearchQueries(name: string, plant: string): string[] {
+  const leafQuery = `${name} ${plant} leaf disease`;
+  const symptomQuery = `${plant} ${name} symptoms`;
+  const plainQuery = `${name} ${plant}`;
+  return [leafQuery, symptomQuery, plainQuery];
+}
+
+/**
+ * Search phrases for healthy specimens of `plant`. Hyphens in the plant id are
+ * turned into spaces first (e.g. "snake-plant" → "snake plant").
+ */
+function buildHealthyQueries(plant: string): string[] {
+  const label = plant.split("-").join(" ");
+
+  const queries: string[] = [];
+  queries.push(`healthy ${label} leaf`);
+  queries.push(`${label} leaf closeup`);
+  queries.push(`healthy ${label} plant`);
+  queries.push(`${label} foliage`);
+  return queries;
+}
+
+// ─── Fill Logic ─────────────────────────────────────────────────────────────
+
+/**
+ * Try to collect up to `needed` images for one class by querying DuckDuckGo,
+ * then iNaturalist, then Wikimedia Commons, and download whatever was found.
+ *
+ * @param diseaseId  Class id (currently not used inside this function).
+ * @param queries    Search phrases; all are tried on DuckDuckGo, only the first
+ *                   on iNaturalist and Commons.
+ * @param needed     Number of images still missing; `<= 0` is a no-op.
+ * @param classDir   Destination directory (created if absent).
+ * @param seenUrls   URLs to skip; collected URLs are added to it (mutated).
+ * @returns How many more `img_*` files the directory holds afterwards.
+ */
+async function fillClass(
+  diseaseId: string,
+  queries: string[],
+  needed: number,
+  classDir: string,
+  seenUrls: Set<string>,
+): Promise<number> {
+  if (needed <= 0) return 0;
+
+  mkdirSync(classDir, { recursive: true });
+
+  const allUrls: string[] = [];
+
+  // ── Source 1: DuckDuckGo ───────────────────────────────────────────────
+  for (const query of queries) {
+    if (allUrls.length >= needed) break;
+    process.stdout.write(`    DDG: "${query.substring(0, 40)}"... `);
+    const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    console.log(`${result.urls.length} new`);
+    if (result.exhausted) break;
+  }
+
+  // ── Source 2: iNaturalist ──────────────────────────────────────────────
+  if (allUrls.length < needed) {
+    process.stdout.write(`    iNat: Searching... `);
+    const result = await searchImagesInaturalist(queries[0], needed - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    console.log(`${result.urls.length} new`);
+  }
+
+  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
+  if (allUrls.length < needed) {
+    process.stdout.write(`    Commons: Searching... `);
+    const result = await searchImagesCommons(queries[0], needed - allUrls.length, seenUrls);
+    allUrls.push(...result.urls);
+    console.log(`${result.urls.length} new`);
+  }
+
+  if (allUrls.length === 0) {
+    console.log(`    ✗ No new images found from any source`);
+    return 0;
+  }
+
+  console.log(`    Downloading ${allUrls.length} images...`);
+  const haveBefore = countImagesInDir(classDir);
+
+  // Number new files after the highest existing img_NNNN index, not after the
+  // file count: failed downloads leave gaps, so the count can be lower than the
+  // highest index and starting there would overwrite files from earlier runs.
+  let startIndex = 0;
+  for (const file of readdirSync(classDir)) {
+    const m = /^img_(\d+)\./.exec(file);
+    if (m) startIndex = Math.max(startIndex, Number(m[1]) + 1);
+  }
+
+  const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);
+
+  const newTotal = countImagesInDir(classDir);
+  const gained = newTotal - haveBefore;
+  console.log(
+    `    ${downloaded > 0 ? "✓" : "✗"} Downloaded ${downloaded}/${allUrls.length}` +
+      ` (${failed} failed, ${gained} new files)`,
+  );
+
+  return gained;
+}
+
+// ─── Directory Scanner ─────────────────────────────────────────────────────
+
+/** Image counts found on disk by scanDataset(). */
+interface ScanResult {
+  /** Disease id → how many images currently on disk (only directories with at least one image) */
+  diseaseCounts: Map<string, number>;
+  /** How many healthy images on disk */
+  healthyCount: number;
+}
+
+/**
+ * Walk the immediate sub-directories of DATASET_DIR (ignoring dot-directories)
+ * and count the `img_*` files in each. The healthy class is reported
+ * separately; disease directories without images are left out of the map.
+ */
+function scanDataset(): ScanResult {
+  const diseaseCounts = new Map<string, number>();
+  let healthyCount = 0;
+
+  if (!existsSync(DATASET_DIR)) return { diseaseCounts, healthyCount };
+
+  const classDirs = readdirSync(DATASET_DIR, { withFileTypes: true }).filter(
+    (entry) => entry.isDirectory() && !entry.name.startsWith("."),
+  );
+
+  for (const { name } of classDirs) {
+    const count = countImagesInDir(resolve(DATASET_DIR, name));
+    if (name === HEALTHY_CLASS) healthyCount = count;
+    else if (count > 0) diseaseCounts.set(name, count);
+  }
+
+  return { diseaseCounts, healthyCount };
+}
+
+// ─── Main ───────────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point.
+ *
+ * 1. Scan the dataset directory for existing image counts.
+ * 2. Load every disease (id, plant id, name) from the database.
+ * 3. Build the list of diseases below TARGET_PER_DISEASE, largest deficit first.
+ * 4. Fill those deficits, DISEASE_CONCURRENCY diseases at a time, persisting the
+ *    seen-URLs cache as it goes.
+ * 5. Fill the healthy class up to TARGET_HEALTHY.
+ * 6. Re-scan and print a summary, then close the database.
+ */
+async function main() {
+  console.log("=".repeat(60));
+  console.log("TRAINING DATASET FILL — Gap-filling download");
+  console.log("=".repeat(60));
+
+  // Ensure dataset directory exists
+  mkdirSync(DATASET_DIR, { recursive: true });
+
+  // ── Step 1: Scan what we already have ────────────────────────────────────
+  console.log("\nScanning existing dataset...");
+  const { diseaseCounts, healthyCount } = scanDataset();
+  console.log(`  Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`);
+
+  // ── Step 2: Load disease info from DB ────────────────────────────────────
+  console.log("\nLoading disease info from database...");
+  const db = getDb();
+
+  const allDiseases = await db
+    .select({
+      id: diseases.id,
+      plantId: diseases.plantId,
+      name: diseases.name,
+    })
+    .from(diseases);
+
+  // Build a deduplicated map: disease id → first disease info found
+  const diseaseInfo = new Map<string, { name: string; plantId: string }>();
+  for (const d of allDiseases) {
+    if (!diseaseInfo.has(d.id)) {
+      diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId });
+    }
+  }
+  console.log(`  Loaded ${diseaseInfo.size} unique diseases from DB`);
+
+  // ── Step 3: Build deficit list ──────────────────────────────────────────
+  const deficits: DiseaseInfo[] = [];
+
+  for (const [id, info] of diseaseInfo) {
+    const have = diseaseCounts.get(id) ?? 0;
+    const needed = TARGET_PER_DISEASE - have;
+    if (needed > 0) {
+      deficits.push({ id, name: info.name, plantId: info.plantId, have, needed });
+    }
+  }
+
+  // Sort by deficit size (largest first) so we prioritize the neediest diseases
+  deficits.sort((a, b) => b.needed - a.needed);
+
+  const healthyDeficit = TARGET_HEALTHY - healthyCount;
+
+  console.log(`\n${"=".repeat(60)}`);
+  console.log("DEFICIT REPORT");
+  console.log(`${"=".repeat(60)}`);
+  console.log(`  Diseases needing images: ${deficits.length}/${diseaseInfo.size}`);
+  console.log(`  Total images missing:   ${deficits.reduce((s, d) => s + d.needed, 0)}`);
+  console.log(`  Healthy deficit:        ${Math.max(0, healthyDeficit)}`);
+  console.log(`${"=".repeat(60)}`);
+
+  if (deficits.length === 0 && healthyDeficit <= 0) {
+    console.log("\n  ✓ Nothing to do — all targets met!\n");
+    await closeDb();
+    return;
+  }
+
+  // ── Step 4: Load seen-URLs cache ────────────────────────────────────────
+  const seenUrlsCache = loadSeenUrlsCache();
+  let totalDownloaded = 0;
+  const startTime = Date.now();
+
+  // ── Step 5: Fill disease deficits ───────────────────────────────────────
+  if (deficits.length > 0) {
+    console.log("\n" + "─".repeat(60));
+    console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`);
+    console.log("─".repeat(60));
+
+    // Process in parallel batches
+    for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) {
+      const batch = deficits.slice(i, i + DISEASE_CONCURRENCY);
+      const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
+      const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY);
+
+      console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`);
+
+      await Promise.all(
+        batch.map(async (d) => {
+          const classDir = resolve(DATASET_DIR, d.id);
+          const queries = buildSearchQueries(d.name, d.plantId);
+          const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
+
+          console.log(
+            `  [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
+          );
+
+          const gained = await fillClass(d.id, queries, d.needed, classDir, seen);
+
+          // Update seen-URLs cache for this disease
+          seenUrlsCache[d.id] = Array.from(seen);
+          saveSeenUrlsCache(seenUrlsCache);
+
+          totalDownloaded += gained;
+        }),
+      );
+
+      // Save seen cache after every batch
+      saveSeenUrlsCache(seenUrlsCache);
+
+      const elapsed = Math.round((Date.now() - startTime) / 1000);
+      console.log(
+        `  [Batch ${batchNum}/${totalBatches}] checkpoint — ` +
+          `${totalDownloaded} downloaded so far (${elapsed}s elapsed)`,
+      );
+    }
+  }
+
+  // ── Step 6: Fill healthy deficit ────────────────────────────────────────
+  if (healthyDeficit > 0) {
+    console.log("\n" + "─".repeat(60));
+    console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`);
+    console.log("─".repeat(60));
+
+    const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
+    mkdirSync(healthyDir, { recursive: true });
+
+    // Collect all unique plants from the disease info. The Set must be built
+    // from the plant ids themselves: a Set of the info objects never removes
+    // anything, which repeated the same plant's queries once per disease.
+    const allPlants = [...new Set([...diseaseInfo.values()].map((d) => d.plantId))];
+    const allHealthyQueries: string[] = [];
+    for (const plant of allPlants) {
+      allHealthyQueries.push(...buildHealthyQueries(plant));
+    }
+
+    const healthySeen = new Set<string>(seenUrlsCache[HEALTHY_CLASS] ?? []);
+    const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir);
+    const allUrls: string[] = [];
+
+    // Try each source with up to 20 healthy queries
+    const sources = [
+      { name: "DDG", collector: collectImagesDuckDuckGo },
+      { name: "iNat", collector: searchImagesInaturalist },
+      { name: "Commons", collector: searchImagesCommons },
+    ] as const;
+
+    for (const source of sources) {
+      if (allUrls.length >= healthyNeeded) break;
+      console.log(`\n  Source: ${source.name}`);
+
+      for (const query of allHealthyQueries.slice(0, 20)) {
+        if (allUrls.length >= healthyNeeded) break;
+
+        process.stdout.write(`    "${query}"... `);
+        const result = await source.collector(query, healthyNeeded - allUrls.length, healthySeen);
+        allUrls.push(...result.urls);
+        console.log(`${result.urls.length} new`);
+      }
+    }
+
+    if (allUrls.length > 0) {
+      console.log(`\n  Downloading ${allUrls.length} healthy images...`);
+
+      // Number new files after the highest existing img_NNNN index so files
+      // kept from earlier runs are not overwritten when the numbering has gaps.
+      let startIdx = 0;
+      for (const file of readdirSync(healthyDir)) {
+        const m = /^img_(\d+)\./.exec(file);
+        if (m) startIdx = Math.max(startIdx, Number(m[1]) + 1);
+      }
+
+      const { downloaded } = await downloadBatch(allUrls, healthyDir, startIdx);
+
+      const newTotal = countImagesInDir(healthyDir);
+      const gained = newTotal - healthyCount;
+      totalDownloaded += gained;
+
+      console.log(
+        `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` +
+          ` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`,
+      );
+    } else {
+      console.log(`\n  ✗ No healthy images found`);
+    }
+
+    // Update seen-URLs cache
+    seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen);
+    saveSeenUrlsCache(seenUrlsCache);
+  }
+
+  // ── Summary ──────────────────────────────────────────────────────────────
+  const elapsed = Math.round((Date.now() - startTime) / 1000);
+  const mins = Math.floor(elapsed / 60);
+  const hrs = Math.floor(mins / 60);
+
+  // Final scan
+  const finalScan = scanDataset();
+  const totalHave = [...finalScan.diseaseCounts.values()].reduce((s, c) => s + c, 0);
+  const atTarget = [...finalScan.diseaseCounts.values()].filter(
+    (c) => c >= TARGET_PER_DISEASE,
+  ).length;
+
+  console.log("\n" + "=".repeat(60));
+  console.log("  ✅ FILL COMPLETE");
+  console.log("=".repeat(60));
+  console.log(`  Time:              ${hrs}h ${mins % 60}m`);
+  console.log(`  Diseases at target: ${atTarget}/${diseaseInfo.size}`);
+  console.log(`  Total images:       ${totalHave}`);
+  console.log(`  Healthy images:     ${finalScan.healthyCount}/${TARGET_HEALTHY}`);
+  console.log(`  New downloads:      ${totalDownloaded}`);
+  console.log(`  Dataset dir:        ${DATASET_DIR}/`);
+
+  await closeDb();
+  console.log("=".repeat(60));
+}
+
+// Run the script; if main() rejects, log the error and exit with status 1.
+main().catch((err) => {
+  console.error("\nFatal error:", err);
+  process.exit(1);
+});
--- a/apps/web/scripts/scrape-training-dataset.ts
+++ b/apps/web/scripts/scrape-training-dataset.ts
@@ -4,10 +4,10 @@
 *
 * Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
 *
- * Targets (tiered by plant type):
- *   - Core plants (houseplants + common garden): 100 images per disease
- *   - Full set (all 11,498 DB diseases): 10 images per disease
- *   - Healthy: 400 images
+ * Target: Top 200 most common plant diseases (ranked by iNaturalist observation counts)
+ *   - 200 images per disease
+ *   - 400 healthy plant images
+ *   - Processes 5 diseases in parallel with 30 concurrent downloads each
 *
 * Sources (all free, no API keys):
 *   1. DB image_url — existing images already found
@@ -42,66 +42,30 @@ try {

 import { getDb, closeDb } from "@/lib/db/index";
 import { diseases } from "@/lib/db/schema";
+import { sql } from "drizzle-orm";

 // ─── Config ─────────────────────────────────────────────────────────────────

 const DATASET_DIR = resolve(__dirname, "../data/dataset");
 const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

-/** Target images per disease for CORE plants */
-const TARGET_CORE = 100;
+/** Target images per disease */
+const TARGET_PER_DISEASE = 200;

-/** Target images per disease for the FULL set */
-const TARGET_FULL = 10;
+/** Number of diseases to target (most common first) */
+const TARGET_DISEASE_COUNT = 200;

 /** Target images for the "healthy" class */
 const TARGET_HEALTHY = 400;

-/** Core plants that get higher image targets */
-const CORE_PLANTS = new Set([
-  // Houseplants
-  "monstera",
-  "pothos",
-  "snake-plant",
-  "peace-lily",
-  "orchid",
-  "succulent",
-  "fiddle-leaf-fig",
-  "aloe-vera",
-  "cactus",
-  "fern",
-  // Garden plants
-  "tomato",
-  "basil",
-  "rose",
-  "pepper",
-  "strawberry",
-  "cucumber",
-  "squash",
-  "lettuce",
-  "spinach",
-  "cabbage",
-  "lavender",
-  "mint",
-  "jasmine",
-  "sunflower",
-  "daisy",
-  "zucchini",
-  "bean",
-  "eggplant",
-  "chili",
-  // General disease patterns
-  "general",
-]);
-
 /** Delay between DuckDuckGo search API calls (ms) */
 const SEARCH_DELAY = 1500;

-/** Delay between image downloads (ms) */
-const DOWNLOAD_DELAY = 100;
+/** Max concurrent image downloads per disease */
+const CONCURRENT_DOWNLOADS = 30;

-/** Max concurrent downloads */
-const CONCURRENT_DOWNLOADS = 10;
+/** Number of diseases to process in parallel */
+const DISEASE_CONCURRENCY = 5;

 /** Minimum image size in bytes to accept */
 const MIN_IMAGE_SIZE = 10_000; // 10KB
@@ -167,21 +131,246 @@ interface Progress {

 // ─── DB Loading ──────────────────────────────────────────────────────────────

+const INAT_CACHE_FILE = resolve(DATASET_DIR, ".inat-prevalence-cache.json");
+
 /**
- * Load all diseases from the database with their existing image URLs.
+ * Query iNaturalist for real-world prevalence of a disease.
+ *
+ * Tries a narrow query first (disease + plant term, research-grade only,
+ * filtered by iconic taxon) and falls back to a name-only query across all
+ * quality grades when the narrow query fails or finds nothing.
+ *
+ * @param diseaseName - Disease name used as the free-text search term.
+ * @param plantName - Optional host plant term appended to the narrow query.
+ * @returns The API's `total_results` count (higher = more common), or 0 when
+ *   nothing matches or a request fails — callers cannot tell those apart.
+ */
+async function getInatPrevalence(diseaseName: string, plantName?: string): Promise<number> {
+  const headers = { "User-Agent": UA, Accept: "application/json" };
+  const baseUrl = "https://api.inaturalist.org/v1/observations";
+
+  // Every request gets its own 10 s timeout. AbortSignal.timeout() starts
+  // counting when the signal is created, so sharing one signal across both
+  // tiers let a slow first request eat the fallback's time budget.
+  const fetchTotal = async (url: string): Promise<number | undefined> => {
+    const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) });
+    if (!res.ok) return undefined;
+    const data = (await res.json()) as { total_results?: number };
+    return data.total_results ?? 0;
+  };
+
+  try {
+    // Tier 1: disease + plant term, research-grade observations only,
+    // restricted by iconic taxon.
+    // NOTE(review): the stated intent is Plantae/Fungi/Chromista, but these IDs
+    // look like Plantae (47126), Insecta (47158) and Protozoa (47686); Fungi and
+    // Chromista are believed to be 47170 and 48222. Confirm against iNaturalist
+    // before changing — cached counts depend on this filter.
+    if (plantName) {
+      const q = `${diseaseName} ${plantName}`;
+      const specific = await fetchTotal(
+        `${baseUrl}?q=${encodeURIComponent(q)}` +
+          `&quality_grade=research` +
+          `&iconic_taxon_id=47126,47158,47686` +
+          `&photos_only=true&per_page=1`,
+      );
+      if (specific !== undefined && specific > 0) return specific;
+    }
+
+    // Fallback: disease name only, all quality grades (original behavior)
+    const broad = await fetchTotal(
+      `${baseUrl}?q=${encodeURIComponent(diseaseName.toLowerCase())}&photos_only=true&per_page=1`,
+    );
+    return broad ?? 0;
+  } catch (err: unknown) {
+    // Best-effort lookup: network errors, timeouts and malformed JSON all
+    // count as "no data", but are reported instead of being swallowed.
+    const reason = err instanceof Error ? err.message : String(err);
+    console.warn(`    ! iNaturalist lookup failed for "${diseaseName}": ${reason}`);
+    return 0;
+  }
+}
+
+/**
+ * Load prevalence data from cache or build it by querying iNaturalist.
+ * Caches results to avoid re-querying on every run.
+ *
+ * Counts are read from the JSON file at INAT_CACHE_FILE (keyed by lower-cased
+ * name). Names missing from the cache are looked up one at a time through
+ * getInatPrevalence and written back to the cache file.
+ *
+ * NOTE(review): getInatPrevalence returns 0 when a request fails, and that 0
+ * is cached like any other result, so a transient outage is not retried on
+ * later runs unless the cache entry is removed.
+ *
+ * @param uniqueNames - Disease names to resolve; used as the map keys as-is.
+ * @param plantMap - Optional disease name → plant term, forwarded to
+ *   getInatPrevalence as its second argument.
+ * @returns Map from each input name to its observation count.
+ */
+async function loadPrevalenceData(
+  uniqueNames: string[],
+  plantMap?: Map<string, string>,
+): Promise<Map<string, number>> {
+  // Load cache if exists. An unreadable or malformed file is ignored, which
+  // leaves the cache empty and sends every name to the query list below.
+  let cache: Record<string, number> = {};
+  if (existsSync(INAT_CACHE_FILE)) {
+    try {
+      cache = JSON.parse(readFileSync(INAT_CACHE_FILE, "utf-8"));
+    } catch {}
+  }
+
+  const prevalenceMap = new Map<string, number>();
+  const toQuery: string[] = [];
+
+  // Check which names need querying (cache keys are lower-cased names)
+  for (const name of uniqueNames) {
+    const key = name.toLowerCase();
+    if (key in cache) {
+      prevalenceMap.set(name, cache[key]);
+    } else {
+      toQuery.push(name);
+    }
+  }
+
+  if (toQuery.length > 0) {
+    console.log(`\n  Querying iNaturalist for ${toQuery.length} disease prevalence scores...`);
+    let queried = 0;
+
+    for (const name of toQuery) {
+      const count = await getInatPrevalence(name, plantMap?.get(name));
+      const key = name.toLowerCase();
+      cache[key] = count;
+      prevalenceMap.set(name, count);
+      queried++;
+
+      // Save cache every 10 queries so an interrupted run keeps most results
+      if (queried % 10 === 0) {
+        writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
+        console.log(`    Queried ${queried}/${toQuery.length}...`);
+      }
+
+      // Rate limit: ~100 req/min
+      // NOTE(review): one getInatPrevalence call may issue two HTTP requests,
+      // so the effective request rate can be higher than this suggests.
+      await sleep(600);
+    }
+
+    // Final cache save
+    writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
+    console.log(`    ✓ Queried ${queried} diseases, cached to ${INAT_CACHE_FILE}`);
+  }
+
+  return prevalenceMap;
+}
+
+/**
+ * Persist prevalence scores to the database and update prevalence enum.
+ * Maps observation counts to common/uncommon/rare/very_rare.
+ *
+ * Every row in `diseases` is updated with one UPDATE per row: `prevalenceScore`
+ * is set to the count stored under the row's name (0 when the name is absent
+ * from the map), `prevalence` to the derived bucket, and `updatedAt` to the
+ * current time. Any existing `prevalence` value is overwritten.
+ *
+ * @param db - Database handle returned by getDb.
+ * @param prevalenceMap - Disease name → observation count.
+ */
+async function persistPrevalenceData(
+  db: ReturnType<typeof getDb>,
+  prevalenceMap: Map<string, number>,
+): Promise<void> {
+  // Load all diseases to update
+  const allDiseases = await db
+    .select({
+      id: diseases.id,
+      name: diseases.name,
+    })
+    .from(diseases);
+
+  // Compute percentile-based thresholds from actual score distribution.
+  // Top 25% → common, bottom 25% → rare, middle 50% → uncommon.
+  // Zero scores are left out of the distribution; they get their own bucket
+  // below. The literal fallbacks apply only when there is no non-zero score.
+  const scores = Array.from(prevalenceMap.values())
+    .filter((s) => s > 0)
+    .sort((a, b) => a - b);
+  const n = scores.length;
+  const commonThreshold = n > 0 ? scores[Math.floor(n * 0.75)] : 1000;
+  const rareThreshold = n > 0 ? scores[Math.floor(n * 0.25)] : 10;
+
+  console.log(
+    `\n  Prevalence distribution: ${n} non-zero scores` +
+      `, p25=${rareThreshold.toLocaleString()}` +
+      `, p75=${commonThreshold.toLocaleString()}`,
+  );
+  console.log(`  Persisting prevalence data for ${allDiseases.length} diseases...`);
+  let updated = 0;
+
+  for (const disease of allDiseases) {
+    const score = prevalenceMap.get(disease.name) ?? 0;
+
+    // Map score to prevalence enum using distribution-based thresholds.
+    // A score of 0 (no observations found, a failed lookup, or a name missing
+    // from the map) maps to very_rare. Scores at or above p75 are common,
+    // scores at or below p25 are rare, everything in between is uncommon.
+    let prevalence: "common" | "uncommon" | "rare" | "very_rare";
+    if (score === 0) {
+      prevalence = "very_rare";
+    } else if (score >= commonThreshold) {
+      prevalence = "common";
+    } else if (score > rareThreshold) {
+      prevalence = "uncommon";
+    } else {
+      prevalence = "rare";
+    }
+
+    await db
+      .update(diseases)
+      .set({
+        prevalenceScore: score,
+        prevalence,
+        updatedAt: sql`(datetime('now'))`,
+      })
+      .where(sql`${diseases.id} = ${disease.id}`);
+
+    updated++;
+    if (updated % 100 === 0) {
+      console.log(`    Updated ${updated}/${allDiseases.length}...`);
+    }
+  }
+
+  console.log(`    ✓ Updated ${updated} diseases with prevalence data`);
+}
+
+/**
+ * Load the TARGET_DISEASE_COUNT most common diseases from the database.
+ * Ranks by iNaturalist observation counts (real-world prevalence data).
+ *
+ * Rows are ordered by the observation count of their name (descending), then
+ * by how many rows share that name (descending); remaining ties keep id order.
+ * Besides reading, this looks up any uncached prevalence counts and writes
+ * prevalence data back to every disease row via persistPrevalenceData.
+ *
+ * @returns At most TARGET_DISEASE_COUNT rows, most prevalent first.
 */
 async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const db = getDb();
-  const rows = await db
+
+  // Get unique disease names and their most common host plant for better iNaturalist queries.
+  const nameStats = await db
+    .select({
+      name: diseases.name,
+      plantId: diseases.plantId,
+      count: sql<number>`COUNT(*)`.mapWith(Number),
+    })
+    .from(diseases)
+    .groupBy(diseases.name, diseases.plantId);
+
+  // Aggregate: unique names, name frequency (across all plants), and most common plant per name
+  const seenNames = new Set<string>();
+  const nameFrequency = new Map<string, number>();
+  const plantFreq = new Map<string, Map<string, number>>();
+  let totalDiseases = 0;
+
+  for (const row of nameStats) {
+    seenNames.add(row.name);
+    nameFrequency.set(row.name, (nameFrequency.get(row.name) ?? 0) + row.count);
+    totalDiseases += row.count;
+
+    if (!plantFreq.has(row.name)) plantFreq.set(row.name, new Map());
+    plantFreq.get(row.name)!.set(row.plantId, row.count);
+  }
+
+  const uniqueNames = [...seenNames];
+
+  // For each disease name, pick the most frequent host plant for more specific iNaturalist queries.
+  // NOTE(review): the stored value is the plantId, which is what the iNaturalist
+  // query receives as its plant term — confirm ids read well as search text.
+  const plantMap = new Map<string, string>();
+  for (const [name, freq] of plantFreq) {
+    const top = [...freq.entries()].sort((a, b) => b[1] - a[1])[0];
+    plantMap.set(name, top[0]);
+  }
+
+  console.log(
+    `  Found ${uniqueNames.length} unique disease names across ${totalDiseases} diseases`,
+  );
+
+  // Load or build prevalence data from iNaturalist (with plant context for better queries)
+  const prevalenceMap = await loadPrevalenceData(uniqueNames, plantMap);
+
+  // Persist prevalence scores to database
+  await persistPrevalenceData(db, prevalenceMap);
+
+  // Load all diseases in id order. The sort below is stable, so ties keep this
+  // order and the resulting list is identical between runs — resume
+  // checkpoints store an index into it.
+  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
      imageUrl: diseases.imageUrl,
    })
-    .from(diseases)
-    .orderBy(diseases.id);
-  return rows;
+    .from(diseases)
+    .orderBy(diseases.id);
+
+  // Sort by iNaturalist prevalence (descending), then by name frequency as tiebreaker
+  allDiseases.sort((a, b) => {
+    const prevA = prevalenceMap.get(a.name) ?? 0;
+    const prevB = prevalenceMap.get(b.name) ?? 0;
+    if (prevA !== prevB) return prevB - prevA;
+    // Tiebreaker: name frequency
+    const freqA = nameFrequency.get(a.name) ?? 0;
+    const freqB = nameFrequency.get(b.name) ?? 0;
+    return freqB - freqA;
+  });
+
+  // Return top TARGET_DISEASE_COUNT
+  return allDiseases.slice(0, TARGET_DISEASE_COUNT);
 }

 // ─── DuckDuckGo API ─────────────────────────────────────────────────────────
@@ -208,7 +397,9 @@ async function searchImagesDuckDuckGo(
  vqd: string,
  page: number,
 ): Promise<DuckDuckGoImageResult[]> {
-  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(query)}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
+  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
+    query,
+  )}&vqd=${vqd}&o=json&p=${page}&f=,,,`;

  const res = await fetch(url, {
    headers: {
@@ -396,7 +587,9 @@ async function searchImagesCommons(
      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
-        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
+        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
+          filename,
+        )}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
@@ -461,7 +654,6 @@ async function downloadBatch(
        const paddedIndex = String(index).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
-        await sleep(DOWNLOAD_DELAY);
        return { success, index: index++, url: url.substring(0, 50) };
      }),
    );
@@ -496,19 +688,36 @@ function loadProgress(): Progress {
  }
  try {
    const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
-    // Backward compat: ensure new fields exist
-    raw.phase ??= 0;
-    raw.phaseIndex ??= 0;
    raw.classes ??= {};
+
+    // Migration: detect progress files written by the old tiered (core/full)
+    // system. Two signals: `phase` is missing, or `phaseIndex` points past the
+    // disease list the new system iterates.
+    // `phase` is compared against null/undefined rather than tested for
+    // truthiness: new-format checkpoints are saved with `phase = 0`, so
+    // `!raw.phase` matched every in-progress file and reset the resume index
+    // on each run.
+    const isOldFormat = (raw.phaseIndex ?? 0) > TARGET_DISEASE_COUNT || raw.phase == null;
+    if (isOldFormat) {
+      console.warn("  ↻ Migrating progress file from old tiered system to new format");
+      console.warn("    Phase checkpoint reset to 0 (will re-scan all 200 diseases)");
+      console.warn("    Per-class progress (seenUrls, counts) preserved");
+      raw.phase = 0;
+      raw.phaseIndex = 0;
+    } else {
+      raw.phase ??= 0;
+      raw.phaseIndex ??= 0;
+    }
+
    // Ensure each class has the sources field
    for (const key of Object.keys(raw.classes)) {
      const cp = raw.classes[key] as Partial<ClassProgress>;
-      cp.sources ??= {
-        db: { exhausted: false },
-        duckduckgo: { exhausted: false },
-        inaturalist: { exhausted: false },
-        wikimedia: { exhausted: false },
-      };
+
+      // Migrate class-level exhausted to per-source exhausted if needed
+      if (!cp.sources) {
+        const classExhausted = cp.exhausted ?? false;
+        cp.sources = {
+          db: { exhausted: classExhausted },
+          duckduckgo: { exhausted: classExhausted },
+          inaturalist: { exhausted: classExhausted },
+          wikimedia: { exhausted: classExhausted },
+        };
+      }
+
      cp.seenUrls ??= [];
    }
    return raw as Progress;
@@ -608,7 +817,6 @@ async function collectClassImages(
  progress: Progress,
  classDir: string,
  existingUrls: string[] = [],
-  fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
 ): Promise<void> {
  const cp = getClassProgress(progress, classId);

@@ -664,7 +872,7 @@ async function collectClassImages(
  }

  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
-  if (!fastMode && !sources.duckduckgo.exhausted && allUrls.length < needed) {
+  if (!sources.duckduckgo.exhausted && allUrls.length < needed) {
    for (const query of queries) {
      if (allUrls.length >= needed) break;
      process.stdout.write(`  DDG: "${query.substring(0, 40)}"... `);
@@ -753,7 +961,9 @@ async function collectClassImages(

  const pct = Math.round((cp.count / target) * 100);
  console.log(
-    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
+    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${
+      allUrls.length
+    } (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
 }

@@ -761,21 +971,18 @@ async function collectClassImages(

 async function main() {
  console.log("=".repeat(60));
-  console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
+  console.log("PLANT DISEASE DATASET COLLECTOR — TOP 200 COMMON DISEASES");
  console.log("=".repeat(60));

+  // Ensure dataset directory exists before any cache writes
+  mkdirSync(DATASET_DIR, { recursive: true });
+
  // Load diseases from DB
-  console.log("\nLoading diseases from database...");
+  console.log("\nLoading top 200 most common diseases from database...");
  const dbDiseases = await loadDiseasesFromDb();
  console.log(`  ${dbDiseases.length} diseases loaded`);

-  const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
-  const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
-  console.log(`  Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
-  console.log(`  Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);
-
  // Load progress
-  mkdirSync(DATASET_DIR, { recursive: true });
  const progress = loadProgress();

  // If all phases complete, exit early
@@ -787,63 +994,57 @@ async function main() {

  const startTime = Date.now();

-  // ── Phase 1: Core set ──────────────────────────────────────────────────
+  // ── Phase 1: Common diseases (200 images each) ──────────────────────────

  console.log("\n" + "─".repeat(60));
-  console.log("PHASE 1: Core Diseases (100 images each)");
+  console.log("PHASE 1: Common Diseases (200 images each)");
  console.log("─".repeat(60));

-  const coreStart = progress.phase === 0 ? progress.phaseIndex : 0;
-  if (coreStart > 0) {
+  const diseaseStart = progress.phase === 0 ? progress.phaseIndex : 0;
+  if (diseaseStart > 0) {
    console.log(
-      `  Resuming from disease #${coreStart + 1} (${((coreStart / coreDiseases.length) * 100).toFixed(0)}% done)`,
+      `  Resuming from disease #${diseaseStart + 1} (${(
+        (diseaseStart / dbDiseases.length) *
+        100
+      ).toFixed(0)}% done)`,
    );
  }

-  for (let i = coreStart; i < coreDiseases.length; i++) {
-    const d = coreDiseases[i];
-    const classDir = resolve(DATASET_DIR, d.id);
-    const queries = buildSearchQueries(d);
-    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
+  // Process diseases in parallel batches
+  for (let i = diseaseStart; i < dbDiseases.length; i += DISEASE_CONCURRENCY) {
+    const batch = dbDiseases.slice(i, i + DISEASE_CONCURRENCY);
+    const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
+    const totalBatches = Math.ceil(dbDiseases.length / DISEASE_CONCURRENCY);
+    const pct = Math.round((i / dbDiseases.length) * 100);

-    const pct = Math.round((i / coreDiseases.length) * 100);
-    console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
+    console.log(
+      `\n[Batch ${batchNum}/${totalBatches}] (${pct}%) Processing ${batch.length} diseases in parallel...`,
+    );

-    await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
+    // Process all diseases in this batch concurrently
+    await Promise.all(
+      batch.map(async (d, batchIdx) => {
+        const diseaseIdx = i + batchIdx;
+        const classDir = resolve(DATASET_DIR, d.id);
+        const queries = buildSearchQueries(d);
+        const existingUrls = d.imageUrl ? [d.imageUrl] : [];

-    // Save checkpoint: phase 0, at index i
+        console.log(`  [${diseaseIdx + 1}/${dbDiseases.length}] ${d.name || d.id} (${d.plantId})`);
+
+        await collectClassImages(
+          d.id,
+          queries,
+          TARGET_PER_DISEASE,
+          progress,
+          classDir,
+          existingUrls,
+        );
+      }),
+    );
+
+    // Save checkpoint: phase 0, at index i + batch.length
    progress.phase = 0;
-    progress.phaseIndex = i + 1;
-    saveProgress(progress);
-  }
-
-  // ── Phase 2: Full set ──────────────────────────────────────────────────
-
-  console.log("\n" + "─".repeat(60));
-  console.log("PHASE 2: Full Disease Set (10 images each)");
-  console.log("─".repeat(60));
-
-  const fullStart = progress.phase === 1 ? progress.phaseIndex : 0;
-  if (fullStart > 0) {
-    console.log(
-      `  Resuming from disease #${fullStart + 1} (${((fullStart / fullDiseases.length) * 100).toFixed(0)}% done)`,
-    );
-  }
-
-  for (let i = fullStart; i < fullDiseases.length; i++) {
-    const d = fullDiseases[i];
-    const classDir = resolve(DATASET_DIR, d.id);
-    const queries = buildSearchQueries(d);
-    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
-
-    const pct = Math.round((i / fullDiseases.length) * 100);
-    console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
-
-    await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
-
-    // Save checkpoint: phase 1, at index i
-    progress.phase = 1;
-    progress.phaseIndex = i + 1;
+    progress.phaseIndex = i + batch.length;
    saveProgress(progress);
  }

--- a/apps/web/src/app/browse/[plantId]/DiseaseCards.tsx
+++ b/apps/web/src/app/browse/[plantId]/DiseaseCards.tsx
@@ -272,18 +272,22 @@ function PrevalenceBadge({ prevalence }: { prevalence: Prevalence }) {
    common: "📊",
    uncommon: "📋",
    rare: "📌",
+    very_rare: "🔍",
  };
  const colors: Record<Prevalence, string> = {
    common: "bg-emerald-100 text-emerald-800 dark:bg-emerald-900/40 dark:text-emerald-300",
    uncommon: "bg-zinc-100 text-zinc-700 dark:bg-zinc-800/60 dark:text-zinc-300",
    rare: "bg-amber-100 text-amber-800 dark:bg-amber-900/40 dark:text-amber-300",
+    very_rare: "bg-red-100 text-red-800 dark:bg-red-900/40 dark:text-red-300",
  };

+  const label = prevalence.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
+
  return (
    <span
      className={`inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium ${colors[prevalence]}`}
    >
-      {icons[prevalence]} {prevalence.charAt(0).toUpperCase() + prevalence.slice(1)}
+      {icons[prevalence]} {label}
    </span>
  );
 }
@@ -298,9 +302,10 @@ const SEVERITY_RANK: Record<Severity, number> = {
 };

+/** Numeric weight for each prevalence level; a larger number means more common. */
 const PREVALENCE_RANK: Record<Prevalence, number> = {
-  common: 3,
-  uncommon: 2,
-  rare: 1,
+  common: 4,
+  uncommon: 3,
+  rare: 2,
+  very_rare: 1,
 };

 type SortField = "prevalence" | "danger";
--- a/apps/web/src/lib/api/browse.ts
+++ b/apps/web/src/lib/api/browse.ts
@@ -3,7 +3,7 @@
 * for the browse page. Runs server-side only.
 */

-import { sql, eq } from "drizzle-orm";
+import { sql, eq, inArray, notInArray } from "drizzle-orm";
 import { getDb } from "@/lib/db/index";
 import { plants, diseases, plantViews } from "@/lib/db/schema";
 import type { PlantCardData } from "@/components/PlantCard";
@@ -12,11 +12,13 @@ export type { PlantCardData };

 /**
 * Get all plants with their disease counts for the browse page.
+ *
+ * Uses scalar subqueries for COUNT to avoid expensive LEFT JOIN + GROUP BY
+ * on the large diseases table (11,498 rows).
 */
 export async function getBrowsePlants(): Promise<PlantCardData[]> {
  const db = getDb();

-  // LEFT JOIN to include plants with zero diseases
  const rows = await db
    .select({
      id: plants.id,
@@ -27,12 +29,10 @@ export async function getBrowsePlants(): Promise<PlantCardData[]> {
      imageUrl: plants.imageUrl,
      updatedAt: plants.updatedAt,
      viewCount: sql<number>`COALESCE(${plantViews.viewCount}, 0)`,
-      diseaseCount: sql<number>`COUNT(${diseases.id})`,
+      diseaseCount: sql<number>`(SELECT COUNT(*) FROM ${diseases} WHERE ${diseases.plantId} = ${plants.id})`,
    })
    .from(plants)
-    .leftJoin(diseases, eq(diseases.plantId, plants.id))
    .leftJoin(plantViews, eq(plantViews.plantId, plants.id))
-    .groupBy(plants.id)
    .orderBy(plants.commonName);

  return rows.map((r) => ({
@@ -61,12 +61,10 @@ export async function getBrowsePlant(id: string): Promise<PlantCardData | null>
      family: plants.family,
      category: plants.category,
      imageUrl: plants.imageUrl,
-      diseaseCount: sql<number>`COUNT(${diseases.id})`,
+      diseaseCount: sql<number>`(SELECT COUNT(*) FROM ${diseases} WHERE ${diseases.plantId} = ${plants.id})`,
    })
    .from(plants)
-    .leftJoin(diseases, eq(diseases.plantId, plants.id))
    .where(eq(plants.id, id))
-    .groupBy(plants.id)
    .limit(1);

  return rows[0] ?? null;
@@ -91,12 +89,47 @@ const FEATURED_IDS = [
 ];

 export async function getFeaturedPlants(): Promise<PlantCardData[]> {
-  const all = await getBrowsePlants();
-  const featured = all.filter((p) => FEATURED_IDS.includes(p.id));
-  // If fewer than expected are found, pad with first available plants
-  if (featured.length < 6) {
-    const rest = all.filter((p) => !FEATURED_IDS.includes(p.id));
-    return [...featured, ...rest].slice(0, 12);
+  const db = getDb();
+
+  const selectFeatured = db
+    .select({
+      id: plants.id,
+      commonName: plants.commonName,
+      scientificName: plants.scientificName,
+      family: plants.family,
+      category: plants.category,
+      imageUrl: plants.imageUrl,
+      updatedAt: plants.updatedAt,
+      viewCount: sql<number>`COALESCE(${plantViews.viewCount}, 0)`,
+      diseaseCount: sql<number>`(SELECT COUNT(*) FROM ${diseases} WHERE ${diseases.plantId} = ${plants.id})`,
+    })
+    .from(plants)
+    .leftJoin(plantViews, eq(plantViews.plantId, plants.id));
+
+  const rows = await selectFeatured
+    .where(inArray(plants.id, FEATURED_IDS))
+    .orderBy(plants.commonName);
+
+  if (rows.length < 6) {
+    const padRows = await selectFeatured
+      .where(notInArray(plants.id, FEATURED_IDS))
+      .orderBy(plants.commonName)
+      .limit(12 - rows.length);
+    return [...rows, ...padRows].map(mapRow);
  }
-  return featured.slice(0, 12);
+  return rows.slice(0, 12).map(mapRow);
+}
+
+/**
+ * Convert a loosely typed query row into PlantCardData.
+ *
+ * Each field is copied under the same key with a type assertion; nothing is
+ * validated or converted at runtime.
+ * NOTE(review): `imageUrl` is asserted as string — confirm the column cannot
+ * be null.
+ */
+function mapRow(r: Record<string, unknown>): PlantCardData {
+  return {
+    id: r.id as string,
+    commonName: r.commonName as string,
+    scientificName: r.scientificName as string,
+    family: r.family as string,
+    category: r.category as string,
+    imageUrl: r.imageUrl as string,
+    updatedAt: r.updatedAt as string | undefined,
+    viewCount: r.viewCount as number,
+    diseaseCount: r.diseaseCount as number,
+  };
 }
--- a/apps/web/src/lib/api/diseases-db.ts
+++ b/apps/web/src/lib/api/diseases-db.ts
@@ -280,7 +280,7 @@ export async function validateKnowledgeBase(): Promise<string[]> {
    "environmental",
  ];
  const validSeverities: Severity[] = ["low", "moderate", "high", "critical"];
-  const validPrevalences: Prevalence[] = ["common", "uncommon", "rare"];
+  const validPrevalences: Prevalence[] = ["common", "uncommon", "rare", "very_rare"];

  const db = getDb();

--- a/apps/web/src/lib/db/schema.ts
+++ b/apps/web/src/lib/db/schema.ts
@@ -55,10 +55,11 @@ export const diseases = sqliteTable(
    prevention: text("prevention", { mode: "json" }).notNull().default([]).$type<string[]>(),
    lookalikeIds: text("lookalike_ids", { mode: "json" }).notNull().default([]).$type<string[]>(),
    prevalence: text("prevalence", {
-      enum: ["common", "uncommon", "rare"],
+      enum: ["common", "uncommon", "rare", "very_rare"],
    })
      .notNull()
      .default("uncommon"),
+    prevalenceScore: integer("prevalence_score").notNull().default(0),
    severity: text("severity", {
      enum: ["low", "moderate", "high", "critical"],
    }).notNull(),
--- a/apps/web/src/lib/types.ts
+++ b/apps/web/src/lib/types.ts
@@ -10,7 +10,7 @@ export type CausalAgentType = "fungal" | "bacterial" | "viral" | "environmental"
 export type Severity = "low" | "moderate" | "high" | "critical";

 /** How common/prevalent a disease is in the field */
-export type Prevalence = "common" | "uncommon" | "rare";
+export type Prevalence = "common" | "uncommon" | "rare" | "very_rare";

 /** Plant category for grouping and filtering */
 export type PlantCategory =
				`@@ -0,0 +1 @@`
				ALTER TABLE `diseases` ADD COLUMN `prevalence_score` integer DEFAULT 0 NOT NULL;