plant-disease-id/apps/web/scripts/scrape-training-dataset.ts

#!/usr/bin/env node
/**
 * scrape-training-dataset.ts
 *
 * Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
 *
 * Target: Top 200 most common plant diseases (ranked by iNaturalist observation counts)
 *   - 200 images per disease
 *   - 200 healthy plant images
 *   - Processes 5 diseases in parallel with 30 concurrent downloads each
 *
 * Sources (all free, no API keys):
 *   1. DB image_url — existing images already found
 *   2. DuckDuckGo  — general web image search
 *   3. iNaturalist — real-world plant observation photos
 *   4. Wikimedia Commons — curated scientific/educational images
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
 * Progress: data/dataset/.progress.json — interrupt and resume safely.
 */

import "dotenv/config";
import {
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  renameSync,
  writeFileSync,
} from "fs";
import { resolve, extname } from "path";

// Best-effort load of ../.env.development (DB creds): every KEY=VALUE line is
// copied into process.env unless that variable already holds a non-empty value.
// Blank lines and lines starting with "#" are skipped; a missing or unreadable
// file is ignored on purpose.
const envPath = resolve(__dirname, "../.env.development");
try {
  const entries = readFileSync(envPath, "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry !== "" && !entry.startsWith("#"));

  for (const entry of entries) {
    const separatorAt = entry.indexOf("=");
    // Need at least one character of key before the "="
    if (separatorAt <= 0) continue;

    const variable = entry.slice(0, separatorAt).trim();
    if (process.env[variable]) continue;

    process.env[variable] = entry.slice(separatorAt + 1).trim();
  }
} catch {}

import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
import { sql } from "drizzle-orm";

// ─── Config ─────────────────────────────────────────────────────────────────

/** Root directory for the dataset, resolved relative to this script's directory. */
const DATASET_DIR = resolve(__dirname, "../data/dataset");
/** JSON checkpoint file read by loadProgress() and written by saveProgress(). */
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

/** Target images per disease */
const TARGET_PER_DISEASE = 200;

/** Number of diseases to target (most common first); loadDiseasesFromDb() slices to this length */
const TARGET_DISEASE_COUNT = 200;

/**
 * Target images for the "healthy" class.
 * NOTE(review): the file header advertises 200 healthy images — confirm which value is intended.
 */
const TARGET_HEALTHY = 400;

/** Delay before each DuckDuckGo search page request (ms) */
const SEARCH_DELAY = 1500;

/** Max concurrent image downloads per disease (chunk size used by downloadBatch) */
const CONCURRENT_DOWNLOADS = 30;

/** Number of diseases to process in parallel */
const DISEASE_CONCURRENCY = 5;

/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

/** Allowed file extensions (lower-case, with leading dot) */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User agent sent with every outgoing request in this script */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";

// ─── Types ──────────────────────────────────────────────────────────────────

/** Disease row as selected in loadDiseasesFromDb(). */
interface DbDisease {
  id: string;
  /** Hyphenated identifier of the host plant; buildSearchQueries() turns hyphens into spaces. */
  plantId: string;
  name: string;
  imageUrl: string | null;
}

/** One entry of the `results` array returned by the DuckDuckGo image endpoint. */
interface DuckDuckGoImageResult {
  /** Image URL; preferred over `url` by collectImagesDuckDuckGo(). */
  image: string;
  title: string;
  /** Fallback URL used when `image` is empty. */
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}

/** Exhaustion flag for a single image source. */
interface SourceState {
  exhausted: boolean;
}

/** Persisted collection state for one class directory. */
interface ClassProgress {
  /** Number of img_* files on disk, as last reconciled/counted. */
  count: number;
  /** Running total of successful downloads across runs. */
  downloaded: number;
  /** Running total of failed downloads across runs. */
  failed: number;
  /** Every URL already queued for this class; used to avoid re-queuing on resume. */
  seenUrls: string[];
  /** Class-level flag set when no source can yield more images or the target is met. */
  exhausted: boolean;
  /** Per-source exhaustion tracking — prevents re-scraping exhausted sources on resume */
  sources: {
    db: SourceState;
    duckduckgo: SourceState;
    inaturalist: SourceState;
    wikimedia: SourceState;
  };
}

/** Shape of the checkpoint stored in PROGRESS_FILE. */
interface Progress {
  /** ISO timestamp refreshed by saveProgress(). */
  lastUpdated: string;
  /** Per-class state keyed by class ID. */
  classes: Record<string, ClassProgress>;
  /**
   * Phase checkpoint. main() resumes the disease phase when this is 0 and treats 3 as
   * "all phases complete".
   * NOTE(review): an earlier comment described 0=core, 1=full, 2=healthy, which matches the
   * old tiered system that loadProgress() migrates away from — confirm what 1 and 2 mean now.
   */
  phase: number;
  /** Index within the current phase's disease array. On resume, skip to this index. */
  phaseIndex: number;
}

// ─── DB Loading ──────────────────────────────────────────────────────────────

/** JSON cache of iNaturalist observation counts, keyed by lower-cased disease name. */
const INAT_CACHE_FILE = resolve(DATASET_DIR, ".inat-prevalence-cache.json");

/**
 * Query iNaturalist for real-world prevalence of a disease.
 *
 * Tries a specific query first (disease + host plant, research-grade, restricted to the
 * Plantae/Fungi/Chromista iconic taxa) and falls back to the disease name alone when that
 * yields nothing.
 *
 * @param diseaseName Disease name used as the free-text search term.
 * @param plantName   Optional host plant. Hyphenated slugs (e.g. a plantId) are accepted;
 *                    hyphens are converted to spaces before searching.
 * @returns Observation count (higher = more common in the real world). Returns 0 when no
 *          observations are found or when any request fails or times out.
 */
async function getInatPrevalence(diseaseName: string, plantName?: string): Promise<number> {
  const headers = { "User-Agent": UA, Accept: "application/json" };
  const baseUrl = "https://api.inaturalist.org/v1/observations";

  // Each request gets its own 10s timeout. A shared AbortSignal.timeout() starts counting
  // when it is created, which would leave the fallback only the time the first call left over.
  const fetchCount = async (queryString: string): Promise<number | undefined> => {
    const res = await fetch(`${baseUrl}?${queryString}`, {
      headers,
      signal: AbortSignal.timeout(10_000),
    });
    if (!res.ok) return undefined;
    const data = (await res.json()) as { total_results?: number };
    return data.total_results ?? 0;
  };

  try {
    // Tier 1: disease + plant name, research-grade, Plantae/Fungi/Chromista.
    // Callers pass hyphenated plant identifiers, so normalise them into words first
    // (same treatment buildSearchQueries applies to plantId).
    const plant = plantName?.replace(/-/g, " ").trim();
    if (plant) {
      const q = `${diseaseName} ${plant}`;
      const specific = await fetchCount(
        `q=${encodeURIComponent(q)}` +
          `&quality_grade=research` +
          `&iconic_taxa=Plantae,Fungi,Chromista` +
          `&photos_only=true&per_page=1`,
      );
      if (specific !== undefined && specific > 0) return specific;
    }

    // Fallback: disease name only, all quality grades (original behavior)
    const general = await fetchCount(
      `q=${encodeURIComponent(diseaseName.toLowerCase())}&photos_only=true&per_page=1`,
    );
    return general ?? 0;
  } catch {
    // Network error, timeout or malformed JSON — treated as "no data"
    return 0;
  }
}

/**
 * Load prevalence data from cache or build it by querying iNaturalist.
 * Caches results (keyed by lower-cased disease name) to avoid re-querying on every run.
 *
 * @param uniqueNames Disease names to resolve.
 * @param plantMap    Optional disease name → host plant lookup forwarded to getInatPrevalence().
 * @returns Map from each name in `uniqueNames` (original casing) to its observation count.
 */
async function loadPrevalenceData(
  uniqueNames: string[],
  plantMap?: Map<string, string>,
): Promise<Map<string, number>> {
  // Load cache if it exists, keeping only finite numeric entries so a hand-edited or
  // partially written file cannot leak non-numbers into later sorting/threshold math.
  const cache: Record<string, number> = {};
  if (existsSync(INAT_CACHE_FILE)) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(INAT_CACHE_FILE, "utf-8"));
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        for (const [key, value] of Object.entries(parsed)) {
          if (typeof value === "number" && Number.isFinite(value)) cache[key] = value;
        }
      }
    } catch (err) {
      console.warn(
        `  ⚠ Ignoring unreadable prevalence cache: ${
          err instanceof Error ? err.message : "unknown"
        }`,
      );
    }
  }

  const prevalenceMap = new Map<string, number>();
  const toQuery: string[] = [];

  // Check which names need querying. Own-property lookup avoids false hits on
  // inherited keys such as "constructor" or "toString".
  for (const name of uniqueNames) {
    const key = name.toLowerCase();
    if (Object.prototype.hasOwnProperty.call(cache, key)) {
      prevalenceMap.set(name, cache[key]);
    } else {
      toQuery.push(name);
    }
  }

  if (toQuery.length > 0) {
    console.log(`\n  Querying iNaturalist for ${toQuery.length} disease prevalence scores...`);
    let queried = 0;

    for (const name of toQuery) {
      const count = await getInatPrevalence(name, plantMap?.get(name));
      const key = name.toLowerCase();
      cache[key] = count;
      prevalenceMap.set(name, count);
      queried++;

      // Save cache every 10 queries
      if (queried % 10 === 0) {
        writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
        console.log(`    Queried ${queried}/${toQuery.length}...`);
      }

      // Rate limit: ~100 req/min (no need to wait after the final query)
      if (queried < toQuery.length) await sleep(600);
    }

    // Final cache save
    writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
    console.log(`    ✓ Queried ${queried} diseases, cached to ${INAT_CACHE_FILE}`);
  }

  return prevalenceMap;
}

/**
 * Write each disease's iNaturalist score and a derived prevalence bucket to the database.
 *
 * Buckets are derived from the distribution of the non-zero scores in `prevalenceMap`:
 *   - score of 0                                  → "very_rare"
 *   - score at or above the 75th-percentile value → "common"
 *   - score strictly above the 25th-percentile    → "uncommon"
 *   - everything else                             → "rare"
 * With no non-zero scores the thresholds default to 1000 (common) and 10 (rare).
 *
 * @param db            Database handle as returned by getDb().
 * @param prevalenceMap Observation count per disease name; names absent from the map score 0.
 */
async function persistPrevalenceData(
  db: ReturnType<typeof getDb>,
  prevalenceMap: Map<string, number>,
): Promise<void> {
  const rows = await db.select({ id: diseases.id, name: diseases.name }).from(diseases);

  // Ascending list of positive scores; percentile values are read straight off it.
  const ranked = [...prevalenceMap.values()].filter((value) => value > 0);
  ranked.sort((left, right) => left - right);
  const sampleSize = ranked.length;
  const [rareThreshold, commonThreshold] =
    sampleSize > 0
      ? [ranked[Math.floor(sampleSize * 0.25)], ranked[Math.floor(sampleSize * 0.75)]]
      : [10, 1000];

  const summary = [
    `\n  Prevalence distribution: ${sampleSize} non-zero scores`,
    `, p25=${rareThreshold.toLocaleString()}`,
    `, p75=${commonThreshold.toLocaleString()}`,
  ].join("");
  console.log(summary);
  console.log(`  Persisting prevalence data for ${rows.length} diseases...`);

  // Zero means iNaturalist returned no observations at all.
  const bucketFor = (score: number): "common" | "uncommon" | "rare" | "very_rare" => {
    if (score === 0) return "very_rare";
    if (score >= commonThreshold) return "common";
    return score > rareThreshold ? "uncommon" : "rare";
  };

  let written = 0;
  for (const row of rows) {
    const score = prevalenceMap.get(row.name) ?? 0;

    await db
      .update(diseases)
      .set({
        prevalenceScore: score,
        prevalence: bucketFor(score),
        updatedAt: sql`(datetime('now'))`,
      })
      .where(sql`${diseases.id} = ${row.id}`);

    written += 1;
    if (written % 100 === 0) {
      console.log(`    Updated ${written}/${rows.length}...`);
    }
  }

  console.log(`    ✓ Updated ${written} diseases with prevalence data`);
}

/**
 * Load the most common diseases from the database (at most TARGET_DISEASE_COUNT rows).
 *
 * Steps: aggregate disease names and their host plants, obtain iNaturalist observation
 * counts (cached), persist those scores back to the database, then rank every disease row
 * by score — ties broken by how many rows share the disease name — and keep the top slice.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const db = getDb();

  // One row per (disease name, host plant) pair with the number of matching records.
  const groupedRows = await db
    .select({
      name: diseases.name,
      plantId: diseases.plantId,
      count: sql<number>`COUNT(*)`.mapWith(Number),
    })
    .from(diseases)
    .groupBy(diseases.name, diseases.plantId);

  const nameFrequency = new Map<string, number>();
  const hostCounts = new Map<string, Map<string, number>>();
  let totalDiseases = 0;

  for (const { name, plantId, count } of groupedRows) {
    nameFrequency.set(name, (nameFrequency.get(name) ?? 0) + count);
    totalDiseases += count;

    let hosts = hostCounts.get(name);
    if (!hosts) {
      hosts = new Map<string, number>();
      hostCounts.set(name, hosts);
    }
    hosts.set(plantId, count);
  }

  // Map keys keep first-seen order, so this is the list of distinct names in query order.
  const uniqueNames = [...nameFrequency.keys()];

  // Most frequent host plant per disease name (first one wins on ties) — gives the
  // iNaturalist lookup some plant context.
  const plantMap = new Map<string, string>();
  for (const [name, hosts] of hostCounts) {
    const [topHost] = [...hosts.entries()].reduce((best, candidate) =>
      candidate[1] > best[1] ? candidate : best,
    );
    plantMap.set(name, topHost);
  }

  console.log(
    `  Found ${uniqueNames.length} unique disease names across ${totalDiseases} diseases`,
  );

  const prevalenceMap = await loadPrevalenceData(uniqueNames, plantMap);
  await persistPrevalenceData(db, prevalenceMap);

  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
      imageUrl: diseases.imageUrl,
    })
    .from(diseases);

  // Highest prevalence first; equal prevalence falls back to name frequency, also descending.
  const prevalenceOf = (name: string): number => prevalenceMap.get(name) ?? 0;
  const frequencyOf = (name: string): number => nameFrequency.get(name) ?? 0;
  allDiseases.sort((first, second) => {
    const firstScore = prevalenceOf(first.name);
    const secondScore = prevalenceOf(second.name);
    return firstScore !== secondScore
      ? secondScore - firstScore
      : frequencyOf(second.name) - frequencyOf(first.name);
  });

  return allDiseases.slice(0, TARGET_DISEASE_COUNT);
}

// ─── DuckDuckGo API ─────────────────────────────────────────────────────────

/**
 * Fetch the DuckDuckGo search page for `query` and extract the `vqd` token embedded in
 * its HTML; searchImagesDuckDuckGo() sends that token with every image request.
 *
 * @throws Error when the page request is not OK or no token can be found in the HTML.
 */
async function getVqdToken(query: string): Promise<string> {
  const pageUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const requestInit = {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  };

  const response = await fetch(pageUrl, requestInit);
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  const body = await response.text();
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token for "${query}"`);
  }
  return found[1];
}

/**
 * Request one page of image results from DuckDuckGo's `i.js` endpoint.
 *
 * @param query Search text.
 * @param vqd   Token obtained from getVqdToken() for the same query.
 * @param page  Value sent as the `p` query parameter.
 *              NOTE(review): verify that `p` actually selects a result page for this
 *              endpoint — the caller's low-novelty cutoff suggests pages may repeat.
 * @returns The `results` array, or an empty array on HTTP 403.
 * @throws Error on any other non-OK status, including 429 once the retry budget is spent.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  // Bounded retry budget: the previous unbounded recursion could spin forever
  // if DuckDuckGo kept answering 429.
  const MAX_RATE_LIMIT_RETRIES = 3;

  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
    query,
  )}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
  const headers = {
    "User-Agent": UA,
    Accept: "application/json",
    Referer: `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`,
  };

  for (let attempt = 0; ; attempt++) {
    // A fresh timeout signal per attempt so retries are not charged for earlier waits
    const res = await fetch(url, { headers, signal: AbortSignal.timeout(15_000) });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
      return data.results ?? [];
    }

    if (res.status === 403) return [];

    if (res.status === 429 && attempt < MAX_RATE_LIMIT_RETRIES) {
      console.warn("  ⚠ Rate limited (429). Waiting 10s...");
      await sleep(10_000);
      continue;
    }

    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }
}

/**
 * Gather up to `target` new image URLs for `query` from DuckDuckGo.
 *
 * Walks at most 5 result pages, pausing SEARCH_DELAY ms before each request. Every accepted
 * URL is added to `seenUrls` (mutated in place). Paging stops early when a request throws,
 * after three empty pages in a row (reported as exhausted), or after two consecutive pages
 * where fewer than 5% of the results were new.
 *
 * @returns The collected URLs and whether the source is considered exhausted. A failure to
 *          obtain the vqd token is also reported as exhausted.
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const MAX_PAGES = 5;
  const describe = (err: unknown): string => (err instanceof Error ? err.message : "unknown");
  const parses = (candidate: string): boolean => {
    try {
      new URL(candidate);
      return true;
    } catch {
      return false;
    }
  };

  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(`  ⚠ DDG token failed: ${describe(err)}`);
    return { urls: [], exhausted: true };
  }

  const collected: string[] = [];
  let exhausted = false;
  let emptyStreak = 0;
  let staleStreak = 0;

  for (let page = 1; collected.length < target && page <= MAX_PAGES; page++) {
    await sleep(SEARCH_DELAY);

    let hits: DuckDuckGoImageResult[];
    try {
      hits = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(`  ⚠ DDG error: ${describe(err)}`);
      break;
    }

    if (!hits || hits.length === 0) {
      emptyStreak += 1;
      if (emptyStreak >= 3) {
        exhausted = true;
        break;
      }
      continue;
    }
    emptyStreak = 0;

    let fresh = 0;
    for (const hit of hits) {
      if (collected.length >= target) break;
      const candidate = hit.image || hit.url;
      const usable =
        Boolean(candidate) &&
        typeof candidate === "string" &&
        !seenUrls.has(candidate) &&
        parses(candidate);
      if (!usable) continue;

      seenUrls.add(candidate);
      collected.push(candidate);
      fresh += 1;
    }

    // Two pages in a row with under 5% unseen results → further paging is pointless
    if (fresh / hits.length < 0.05) {
      staleStreak += 1;
      if (staleStreak >= 2) break;
    } else {
      staleStreak = 0;
    }
  }

  return { urls: collected.slice(0, target), exhausted };
}

// ─── iNaturalist API ─────────────────────────────────────────────────────────

/**
 * Collect up to `target` photo URLs from research-grade iNaturalist observations matching
 * `query` (single request, newest observations first, at most 200 observations).
 *
 * Each photo URL is rewritten to its `original` size variant before being de-duplicated
 * against — and added to — `seenUrls` (mutated in place).
 *
 * @returns The collected URLs, and `exhausted: true` when the request succeeded but yielded
 *          fewer than `target` URLs. HTTP or network failures report `exhausted: false` so
 *          the source is retried on a later run.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);

  // Photo URLs carry a size token in the file name (e.g. ".../square.jpg"). Only
  // "medium" used to be rewritten, so other variants such as thumbnails slipped through
  // at low resolution. Swap any known size token for the full-size variant.
  const sizeToken = /\/(?:square|thumb|small|medium|large)\.(?=[^/]*$)/;

  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        const url = photo.url;
        if (!url) continue;

        const fullUrl = url.replace(sizeToken, "/original.");
        // De-duplicate on the rewritten URL — that is the form stored in seenUrls, so
        // checking only the raw URL never matched on resume. The raw URL is still
        // checked for entries recorded by earlier runs.
        if (seenUrls.has(fullUrl) || seenUrls.has(url)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    return { urls: results, exhausted: false };
  }
}

// ─── Wikimedia Commons API ──────────────────────────────────────────────────

/**
 * Collect up to `target` image URLs from Wikimedia Commons by searching the File
 * namespace (6) for `query`, 50 hits per request, following the API's continuation offset.
 *
 * Only files whose extension is in ALLOWED_EXTENSIONS are kept, so SVG/PDF/audio/video hits
 * are never queued for download. Accepted URLs are added to `seenUrls` (mutated in place).
 *
 * @returns The collected `Special:FilePath` URLs, and `exhausted: true` only when the search
 *          genuinely ran out of results. HTTP or network failures report `exhausted: false`
 *          (matching searchImagesInaturalist) so a transient error does not permanently
 *          disable this source.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  let sroffset = 0;
  let requestFailed = false;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
      // No origin needed — server-side fetch, Wikimedia ignores CORS headers on API calls
    });

    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) {
        requestFailed = true;
        break;
      }

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
        // Skip anything the downloader would not store as a supported raster image
        if (!ALLOWED_EXTENSIONS.includes(extname(filename).toLowerCase())) continue;

        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          filename,
        )}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // A missing continuation offset means this was the last page — stop here rather
      // than issuing one more request just to receive an empty result.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined) break;
      sroffset = nextOffset;
    } catch {
      requestFailed = true;
      break;
    }
  }

  return { urls: results, exhausted: !requestFailed && results.length < target };
}

// ─── Image Download ─────────────────────────────────────────────────────────

/**
 * Identify a supported raster format from the leading bytes of a file.
 * Returns the canonical extension, or undefined for anything that is not JPEG, PNG or WebP.
 */
function sniffImageExtension(bytes: Uint8Array): ".jpg" | ".png" | ".webp" | undefined {
  const startsWith = (signature: number[], offset = 0): boolean =>
    bytes.length >= offset + signature.length &&
    signature.every((byte, i) => bytes[offset + i] === byte);

  if (startsWith([0xff, 0xd8, 0xff])) return ".jpg";
  if (startsWith([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return ".png";
  // "RIFF" <4-byte size> "WEBP"
  if (startsWith([0x52, 0x49, 0x46, 0x46]) && startsWith([0x57, 0x45, 0x42, 0x50], 8)) {
    return ".webp";
  }
  return undefined;
}

/**
 * Download one image and write it next to `destPath`, replacing that path's extension with
 * the one matching the actual file contents.
 *
 * The download is rejected when the response is not OK, is HTML, declares or has a body
 * larger than MAX_IMAGE_SIZE, is smaller than MIN_IMAGE_SIZE, or does not start with a
 * JPEG/PNG/WebP signature. The signature check keeps SVG, GIF, PDF or error payloads from
 * being stored under an image extension they do not match.
 *
 * @param url      Image URL to fetch.
 * @param destPath Target path; its extension is a placeholder.
 * @returns true when a file was written, false on any rejection or error.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = (res.headers.get("content-type") ?? "").toLowerCase();
    if (contentType.includes("text/html")) return false;

    // When the server declares a size, refuse oversized files before buffering them
    const declaredLength = Number(res.headers.get("content-length"));
    if (Number.isFinite(declaredLength) && declaredLength > MAX_IMAGE_SIZE) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    // Trust the bytes rather than the URL or Content-Type header for the format
    const ext = sniffImageExtension(buffer);
    if (ext === undefined) return false;

    const filePath = destPath.replace(/\.\w+$/, ext);
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    return false;
  }
}

/**
 * Download `urls` into `classDir` in chunks of CONCURRENT_DOWNLOADS parallel requests,
 * naming files `img_<4-digit index>.<ext>`.
 *
 * Each download claims its file index synchronously before any await. Previously the
 * index was read before the await and incremented after it, so every download in a chunk
 * computed the same file name and the files overwrote one another. Indices already used by
 * files in `classDir` are skipped, so earlier images are never overwritten even when the
 * numbering has gaps.
 *
 * @param urls       Image URLs to fetch.
 * @param classDir   Destination directory (expected to exist).
 * @param startIndex First file index to try.
 * @returns Success/failure counts and the next unused index.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
  let index = startIndex;

  // Indices that already have a file on disk, whatever its extension
  const takenIndices = new Set<number>();
  try {
    for (const file of readdirSync(classDir)) {
      const match = /^img_(\d+)\./.exec(file);
      if (match) takenIndices.add(Number(match[1]));
    }
  } catch {
    // Directory missing or unreadable — nothing on disk to protect
  }

  const claimIndex = (): number => {
    while (takenIndices.has(index)) index++;
    return index++;
  };

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const results = await Promise.all(
      chunk.map(async (url) => {
        // Must happen before the first await so parallel downloads get distinct names
        const fileIndex = claimIndex();
        const paddedIndex = String(fileIndex).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
        return { success, index: fileIndex, url: url.substring(0, 50) };
      }),
    );

    for (const r of results) {
      if (r.success) downloaded++;
      else {
        failed++;
        // Log only a sample of failures (1st, 21st, 41st, ...) to keep output readable
        if (failed % 20 === 1) console.log(`    ⚠ Failed: ${r.url}...`);
      }
    }

    const total = downloaded + failed;
    if (total % 30 === 0 || total === urls.length) {
      console.log(`    Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }

  return { downloaded, failed, lastIndex: index };
}

// ─── Progress Tracking ──────────────────────────────────────────────────────

/**
 * Read the checkpoint from PROGRESS_FILE, upgrading older layouts in place.
 *
 * - No file, or unparsable file → a fresh checkpoint (phase 0, index 0, no classes).
 * - Files from the old tiered system (no numeric `phase`, or a `phaseIndex` beyond
 *   TARGET_DISEASE_COUNT) keep their per-class data but restart at phase 0, index 0.
 * - Class records missing fields or per-source flags are completed with defaults.
 */
function loadProgress(): Progress {
  const freshProgress = (): Progress => ({
    lastUpdated: new Date().toISOString(),
    classes: {},
    phase: 0,
    phaseIndex: 0,
  });

  if (!existsSync(PROGRESS_FILE)) return freshProgress();

  try {
    const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
    raw.classes ??= {};

    // Migration: detect the old tiered (core/full) system. The old test `!raw.phase` also
    // fired for the valid value 0, which reset phaseIndex on every resume during the first
    // phase — only a missing/non-numeric phase counts as "old" here.
    const isOldFormat =
      typeof raw.phase !== "number" || (raw.phaseIndex ?? 0) > TARGET_DISEASE_COUNT;
    if (isOldFormat) {
      console.warn("  ↻ Migrating progress file from old tiered system to new format");
      console.warn("    Phase checkpoint reset to 0 (will re-scan all 200 diseases)");
      console.warn("    Per-class progress (seenUrls, counts) preserved");
      raw.phase = 0;
      raw.phaseIndex = 0;
    } else {
      raw.phaseIndex ??= 0;
    }

    // Ensure each class record is complete
    for (const key of Object.keys(raw.classes)) {
      const cp = raw.classes[key] as Partial<ClassProgress>;

      cp.count ??= 0;
      cp.downloaded ??= 0;
      cp.failed ??= 0;
      cp.exhausted ??= false;
      cp.seenUrls ??= [];

      // Migrate class-level exhausted to per-source exhausted, and fill in any source
      // that an older file did not track, so later `sources.<name>.exhausted` reads are safe.
      const classExhausted = cp.exhausted;
      const known: Partial<ClassProgress["sources"]> = cp.sources ?? {};
      cp.sources = {
        db: known.db ?? { exhausted: classExhausted },
        duckduckgo: known.duckduckgo ?? { exhausted: classExhausted },
        inaturalist: known.inaturalist ?? { exhausted: classExhausted },
        wikimedia: known.wikimedia ?? { exhausted: classExhausted },
      };
    }
    return raw as Progress;
  } catch {
    console.warn("  ⚠ Corrupt progress file, starting fresh");
    return freshProgress();
  }
}

/**
 * Stamp `progress.lastUpdated` and persist the checkpoint to PROGRESS_FILE.
 *
 * The JSON is written to a sibling temporary file and then renamed over the real one, so
 * interrupting the script mid-write cannot leave a truncated checkpoint behind (which
 * loadProgress would discard as corrupt, losing all resume state).
 */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();
  const tempFile = `${PROGRESS_FILE}.tmp`;
  writeFileSync(tempFile, JSON.stringify(progress, null, 2));
  renameSync(tempFile, PROGRESS_FILE);
}

/**
 * Return the progress record for `classId`, creating and registering a zeroed record
 * (no URLs seen, nothing exhausted) on first access. The returned object is the one stored
 * in `progress.classes`, so mutations are visible to later saves.
 */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  const existing = progress.classes[classId];
  if (existing) return existing;

  const created: ClassProgress = {
    count: 0,
    downloaded: 0,
    failed: 0,
    seenUrls: [],
    exhausted: false,
    sources: {
      db: { exhausted: false },
      duckduckgo: { exhausted: false },
      inaturalist: { exhausted: false },
      wikimedia: { exhausted: false },
    },
  };
  progress.classes[classId] = created;
  return created;
}

// ─── Query Building ─────────────────────────────────────────────────────────

/**
 * Build the three search phrases used for a disease. Hyphens in the plant id become spaces;
 * when the disease has no name, its id (hyphens → spaces) is used instead. Every phrase
 * contains the disease name to keep labels clean.
 */
function buildSearchQueries(disease: DbDisease): string[] {
  const unslug = (slug: string): string => slug.replace(/-/g, " ");
  const label = disease.name ? disease.name : unslug(disease.id);
  const host = unslug(disease.plantId);

  const phrases = [
    [label, host, "leaf disease"],
    [host, label, "symptoms"],
    [label, host],
  ];
  return phrases.map((words) => words.join(" "));
}

/**
 * Build the four search phrases used to find healthy specimens of a plant.
 * Hyphens in `plant` are replaced with spaces.
 */
function buildHealthyQueries(plant: string): string[] {
  const label = plant.split("-").join(" ");
  const frames: Array<[string, string]> = [
    ["healthy ", " leaf"],
    ["", " leaf closeup"],
    ["healthy ", " plant"],
    ["", " foliage"],
  ];
  return frames.map(([before, after]) => before + label + after);
}

// ─── File Reconciliation ───────────────────────────────────────────────────

/**
 * Count the entries in `classDir` whose names begin with "img_".
 * A directory that does not exist, or cannot be listed, counts as 0.
 */
function countImagesInDir(classDir: string): number {
  if (!existsSync(classDir)) return 0;

  try {
    let matches = 0;
    for (const entry of readdirSync(classDir)) {
      if (entry.startsWith("img_")) matches += 1;
    }
    return matches;
  } catch {
    return 0;
  }
}

/**
 * Bring a class's recorded image count in line with what is actually on disk.
 * When fewer img_* files exist than the checkpoint claims (e.g. files were deleted after the
 * last save), the lower on-disk figure is logged and returned so the gap is re-downloaded;
 * otherwise the recorded count is returned unchanged.
 */
function reconcileClassCount(classDir: string, progressCount: number): number {
  const onDisk = countImagesInDir(classDir);
  const filesMissing = onDisk < progressCount;

  if (filesMissing) {
    console.log(`    ↻ File count (${onDisk}) < progress count (${progressCount}) — reconciling`);
  }
  return filesMissing ? onDisk : progressCount;
}

// ─── Dataset Collection ─────────────────────────────────────────────────────

/**
 * Collect images for one class (a disease or the healthy class) until `target` files exist
 * in `classDir` or every source is exhausted.
 *
 * Flow: reconcile the recorded count with the files on disk, gather candidate URLs from the
 * sources in priority order (existing DB URLs → DuckDuckGo → iNaturalist → Wikimedia
 * Commons), checkpoint the seen URLs, download the candidates, then recount the files and
 * update the class record. `progress` is mutated and saved at several points.
 *
 * @param classId      Key of the class inside `progress.classes`.
 * @param queries      Search phrases; all are tried on DuckDuckGo, only the first on
 *                     iNaturalist and Commons.
 * @param target       Desired number of image files for the class.
 * @param progress     Shared checkpoint object.
 * @param classDir     Directory that receives the downloaded files (created if needed).
 * @param existingUrls URLs already known for this class (the "db" source).
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
  existingUrls: string[] = [],
): Promise<void> {
  const cp = getClassProgress(progress, classId);

  // ── Reconcile with actual files on disk ─────────────────────────────────
  const actualCount = reconcileClassCount(classDir, cp.count);
  if (actualCount !== cp.count) {
    cp.count = actualCount;
    saveProgress(progress);
  }

  // Working copy of the seen URLs; written back to cp.seenUrls before downloading
  const seenUrls = new Set(cp.seenUrls);
  const sources = cp.sources;

  if (cp.count >= target) {
    console.log(`  ✓ Already have ${cp.count}/${target}`);
    return;
  }

  // Check if ALL sources are exhausted
  const allExhausted =
    sources.db.exhausted &&
    sources.duckduckgo.exhausted &&
    sources.inaturalist.exhausted &&
    sources.wikimedia.exhausted;

  if (allExhausted) {
    cp.exhausted = true;
    saveProgress(progress);
    console.log(`  ✓ Exhausted (${cp.count}/${target})`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  // Candidate URLs gathered across all sources for this run
  const allUrls: string[] = [];
  let anyNewResults = false;
  // Always positive here: the cp.count >= target case returned above
  const needed = target - cp.count;

  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
  if (!sources.db.exhausted) {
    const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
    if (freshDbUrls.length > 0) {
      console.log(`  DB: ${freshDbUrls.length} existing URLs`);
      for (const url of freshDbUrls) {
        if (allUrls.length >= needed) break;
        seenUrls.add(url);
        allUrls.push(url);
      }
      if (freshDbUrls.length > 0) anyNewResults = true;
    }
    // DB source is always "exhausted" after processing its initial URLs
    sources.db.exhausted = true;
  }

  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
  if (!sources.duckduckgo.exhausted && allUrls.length < needed) {
    for (const query of queries) {
      if (allUrls.length >= needed) break;
      process.stdout.write(`  DDG: "${query.substring(0, 40)}"... `);
      const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
      allUrls.push(...result.urls);
      // A single query reporting exhaustion marks the whole DuckDuckGo source as exhausted
      if (result.exhausted) {
        sources.duckduckgo.exhausted = true;
      }
      if (result.urls.length > 0) anyNewResults = true;
      console.log(`${result.urls.length} new`);
      if (allUrls.length >= needed) break;
    }
    // NOTE(review): this branch has no effect — the flag it tests is already set when
    // the condition holds. Kept as-is; candidate for removal.
    if (!anyNewResults && sources.duckduckgo.exhausted) {
      /* already marked */
    }
  }

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
  if (!sources.inaturalist.exhausted && allUrls.length < needed) {
    const primaryQuery = queries[0];
    console.log(`  iNat: Searching...`);
    const result = await searchImagesInaturalist(primaryQuery, needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) sources.inaturalist.exhausted = true;
    if (result.urls.length > 0) anyNewResults = true;
    console.log(`  iNat: ${result.urls.length} images`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
  if (!sources.wikimedia.exhausted && allUrls.length < needed) {
    const primaryQuery = queries[0];
    console.log(`  Commons: Searching...`);
    const result = await searchImagesCommons(primaryQuery, needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) sources.wikimedia.exhausted = true;
    if (result.urls.length > 0) anyNewResults = true;
    console.log(`  Commons: ${result.urls.length} images`);
  }

  // Nothing to download from any source: mark the class exhausted and stop
  if (allUrls.length === 0) {
    cp.exhausted = true;
    saveProgress(progress);
    console.log(`  ✗ No images found — exhausted`);
    return;
  }

  // NOTE(review): anyNewResults is set whenever any source (including the DB) queues a
  // URL, so this branch looks unreachable once allUrls is non-empty — confirm intent.
  if (!anyNewResults && allUrls.length > 0) {
    // Only DB URLs survived — nothing more will come from searches
    cp.exhausted = true;
    saveProgress(progress);
  }

  // Save progress with seen URLs BEFORE downloading
  // (URLs whose download later fails therefore stay marked as seen)
  cp.seenUrls = Array.from(seenUrls);
  saveProgress(progress);

  console.log(`  Downloading ${allUrls.length} images...`);

  // Start numbering at the current file count.
  // NOTE(review): if existing file indices have gaps, the count can be lower than the
  // highest index in use — verify that downloadBatch cannot overwrite existing files.
  const startIndex = countImagesInDir(classDir);
  const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);

  // Re-count actual files on disk after download (more reliable than tracking)
  const newTotal = countImagesInDir(classDir);
  cp.count = newTotal;
  cp.downloaded += downloaded;
  cp.failed += failed;

  // Check if all sources exhausted
  if (
    sources.db.exhausted &&
    sources.duckduckgo.exhausted &&
    sources.inaturalist.exhausted &&
    sources.wikimedia.exhausted
  ) {
    cp.exhausted = true;
  }

  // Reaching the target also flags the class as exhausted (nothing more to fetch)
  if (cp.count >= target) {
    cp.exhausted = true;
  }

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${
      allUrls.length
    } (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}

// ─── Main ───────────────────────────────────────────────────────────────────

/**
 * Script entry point: collects disease images, then healthy-plant images,
 * and prints a summary.
 *
 * Flow:
 *   1. Ensure the dataset directory exists and load the disease list and the
 *      saved progress checkpoint.
 *   2. Exit early if the checkpoint says every phase is already done
 *      (`progress.phase === 3`).
 *   3. Phase 1 — walk the diseases in batches of `DISEASE_CONCURRENCY`,
 *      running `collectClassImages` for every disease of a batch concurrently
 *      and checkpointing the next start index after each batch.
 *   4. Phase 3 — gather URLs for the healthy class from each search source and
 *      download them, requesting only as many images as are still missing.
 *   5. Mark the run complete, print totals, and close the DB connection.
 *
 * Any rejection propagates to the caller; the DB is closed only on the
 * early-exit and normal-completion paths.
 */
async function main(): Promise<void> {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR — TOP 200 COMMON DISEASES");
  console.log("=".repeat(60));

  // Ensure dataset directory exists before any cache writes
  mkdirSync(DATASET_DIR, { recursive: true });

  // Load diseases from DB
  console.log("\nLoading top 200 most common diseases from database...");
  const dbDiseases = await loadDiseasesFromDb();
  console.log(`  ${dbDiseases.length} diseases loaded`);

  // Load progress
  const progress = loadProgress();

  // If all phases complete, exit early
  if (progress.phase === 3) {
    console.log("  ✓ All phases already complete. Delete .progress.json to re-run.");
    await closeDb();
    return;
  }

  const startTime = Date.now();

  // ── Phase 1: Common diseases (200 images each) ──────────────────────────

  console.log("\n" + "─".repeat(60));
  console.log("PHASE 1: Common Diseases (200 images each)");
  console.log("─".repeat(60));

  // Only a phase-0 checkpoint carries a usable resume index; anything else
  // restarts the disease walk from the beginning.
  const diseaseStart = progress.phase === 0 ? progress.phaseIndex : 0;
  if (diseaseStart > 0) {
    const resumePct = ((diseaseStart / dbDiseases.length) * 100).toFixed(0);
    console.log(`  Resuming from disease #${diseaseStart + 1} (${resumePct}% done)`);
  }

  // The batch total does not change while iterating, so compute it once.
  const totalBatches = Math.ceil(dbDiseases.length / DISEASE_CONCURRENCY);

  // Process diseases in parallel batches
  for (let i = diseaseStart; i < dbDiseases.length; i += DISEASE_CONCURRENCY) {
    const batch = dbDiseases.slice(i, i + DISEASE_CONCURRENCY);
    const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
    const pct = Math.round((i / dbDiseases.length) * 100);

    console.log(
      `\n[Batch ${batchNum}/${totalBatches}] (${pct}%) Processing ${batch.length} diseases in parallel...`,
    );

    // Process all diseases in this batch concurrently
    await Promise.all(
      batch.map(async (d, batchIdx) => {
        const diseaseIdx = i + batchIdx;
        const classDir = resolve(DATASET_DIR, d.id);
        const queries = buildSearchQueries(d);
        // Seed the collector with the image URL already stored for this disease, if any.
        const existingUrls = d.imageUrl ? [d.imageUrl] : [];

        console.log(`  [${diseaseIdx + 1}/${dbDiseases.length}] ${d.name || d.id} (${d.plantId})`);

        await collectClassImages(
          d.id,
          queries,
          TARGET_PER_DISEASE,
          progress,
          classDir,
          existingUrls,
        );
      }),
    );

    // Save checkpoint: phase 0, at index i + batch.length
    progress.phase = 0;
    progress.phaseIndex = i + batch.length;
    saveProgress(progress);
  }

  // ── Phase 3: Healthy class ──────────────────────────────────────────────

  console.log("\n" + "─".repeat(60));
  console.log("PHASE 3: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const healthyCp = getClassProgress(progress, HEALTHY_CLASS);

  // Reconcile healthy class with files on disk
  const healthyActualCount = reconcileClassCount(healthyDir, healthyCp.count);
  if (healthyActualCount !== healthyCp.count) {
    healthyCp.count = healthyActualCount;
    saveProgress(progress);
  }

  // Shared with every collector call below and persisted afterwards.
  // NOTE(review): presumably the collectors add the URLs they return to this set — confirm.
  const healthySeen = new Set(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
    console.log(`\n  ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
  } else {
    // Only the shortfall is requested. Sizing requests against the full
    // target would overshoot it on a resumed run that already has images
    // on disk.
    const healthyNeeded = TARGET_HEALTHY - healthyCp.count;

    // Collect all unique plants
    const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    // Cap the number of queries tried per source; sliced once because the
    // selection is identical for every source.
    const HEALTHY_QUERY_LIMIT = 20;
    const healthyQueries = allHealthyQueries.slice(0, HEALTHY_QUERY_LIMIT);

    const healthySources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    const totalHealthyUrls: string[] = [];
    // Becomes true as soon as any single search reports it is not exhausted.
    let anyRemaining = false;

    for (const source of healthySources) {
      if (totalHealthyUrls.length >= healthyNeeded) break;
      console.log(`\n  Source: ${source.name}`);

      for (const query of healthyQueries) {
        if (totalHealthyUrls.length >= healthyNeeded) break;

        process.stdout.write(`    "${query}"... `);
        const result = await source.collector(
          query,
          healthyNeeded - totalHealthyUrls.length,
          healthySeen,
        );
        totalHealthyUrls.push(...result.urls);
        if (!result.exhausted) anyRemaining = true;
        console.log(`${result.urls.length} new`);
      }
    }

    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
      // Persist seen URLs and the exhausted flag BEFORE downloading.
      healthyCp.exhausted = !anyRemaining;
      saveProgress(progress);

      console.log(`\n  Downloading ${totalHealthyUrls.length} healthy images...`);
      // Start numbering at the current on-disk file count.
      const healthyStartIndex = countImagesInDir(healthyDir);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
        healthyStartIndex,
      );

      // Re-count actual files on disk
      const newHealthyTotal = countImagesInDir(healthyDir);
      healthyCp.count = newHealthyTotal;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;

      if (healthyCp.count >= TARGET_HEALTHY) {
        healthyCp.exhausted = true;
      }

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
        `  Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
      console.log(`  ✗ No healthy images found`);
    }

    saveProgress(progress);
  }

  // ── Summary ────────────────────────────────────────────────────────────────

  // Mark all phases complete
  progress.phase = 3;
  progress.phaseIndex = 0;
  saveProgress(progress);

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  // Sum the per-class counters; a missing counter is treated as zero.
  let totalDownloaded = 0;
  let totalFailed = 0;
  for (const cp of Object.values(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
  }

  console.log("\n" + "=".repeat(60));
  console.log("  ✅ ALL PHASES COMPLETE");
  console.log("=".repeat(60));
  console.log(`  Time:       ${hrs}h ${mins % 60}m`);
  console.log(`  Downloaded: ${totalDownloaded} images`);
  console.log(`  Failed:     ${totalFailed} images`);
  console.log(`  Dataset:    ${DATASET_DIR}/`);

  await closeDb();
  console.log("=".repeat(60));
}

/**
 * Returns a promise that settles (with no value) once a `setTimeout` of
 * `ms` milliseconds fires.
 *
 * @param ms Delay passed to `setTimeout`, in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}

// Kick off the run; any rejection escaping main() is logged and turned into
// a non-zero exit status.
main().catch((fatal: unknown) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});