plant-disease-id/apps/web/scripts/fill-training-dataset.ts

#!/usr/bin/env node
/**
 * fill-training-dataset.ts
 *
 * Scans the existing dataset directory and downloads any missing images
 * to reach the target counts (200 per disease, 400 for healthy).
 *
 * Does NOT re-run prevalence queries — just fills gaps from image sources.
 * Each run scans the directory, reports deficits, then fills them.
 * Interrupt-safe: re-run to pick up where you left off.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts
 */

import "dotenv/config";
import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, extname } from "path";

// Best-effort load of ../.env.development (DB creds). Variables that are already
// set in process.env are left alone; a missing or unreadable file is ignored.
const envPath = resolve(__dirname, "../.env.development");
try {
  for (const rawLine of readFileSync(envPath, "utf-8").split("\n")) {
    const entry = rawLine.trim();
    // Skip blank lines and comments.
    if (entry === "" || entry.startsWith("#")) continue;

    // A usable line needs a non-empty key before the first "=".
    const sep = entry.indexOf("=");
    if (sep <= 0) continue;

    const name = entry.slice(0, sep).trim();
    if (process.env[name]) continue;
    process.env[name] = entry.slice(sep + 1).trim();
  }
} catch {}

import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
import { sql } from "drizzle-orm";

// ─── Config ─────────────────────────────────────────────────────────────────

/** Root of the dataset; each class (disease id or "healthy") is a sub-directory of it. */
const DATASET_DIR = resolve(__dirname, "../data/dataset");
/** JSON file inside DATASET_DIR that persists, per class, the URLs already collected by earlier runs. */
const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json");

/** Target number of images on disk for each disease class */
const TARGET_PER_DISEASE = 200;

/** Target number of images on disk for the "healthy" class */
const TARGET_HEALTHY = 400;

/** Pause (ms) before every DuckDuckGo result-page request */
const SEARCH_DELAY = 1500;

/**
 * Number of image downloads started together in one chunk of a download batch.
 * Diseases are processed DISEASE_CONCURRENCY at a time, so the number of
 * in-flight downloads can reach CONCURRENT_DOWNLOADS * DISEASE_CONCURRENCY.
 */
const CONCURRENT_DOWNLOADS = 30;

/** Number of diseases to process in parallel */
const DISEASE_CONCURRENCY = 5;

/** Minimum response body size in bytes for a download to be kept */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Maximum response body size in bytes for a download to be kept */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

/** File extensions (lower-case, with leading dot) accepted as-is from a download URL */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User-Agent header sent with every outbound HTTP request in this script */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Directory name (under DATASET_DIR) and seen-URL cache key of the healthy class */
const HEALTHY_CLASS = "healthy";

// ─── Types ──────────────────────────────────────────────────────────────────

/**
 * One entry of the `results` array in the JSON returned by the DuckDuckGo
 * `i.js` image-search request. Only `image` and `url` are read in this file;
 * the remaining fields are declared but unused here.
 */
interface DuckDuckGoImageResult {
  /** Preferred download URL for the image. */
  image: string;
  /** Not read in this file — presumably the result's caption; TODO confirm. */
  title: string;
  /** Used as the download URL when `image` is empty. */
  url: string;
  /** Not read in this file — presumably a thumbnail URL; TODO confirm. */
  thumbnail: string;
  /** Not read in this file — presumably pixel height; TODO confirm. */
  height: number;
  /** Not read in this file — presumably pixel width; TODO confirm. */
  width: number;
}

/** A disease class that is still short of TARGET_PER_DISEASE images. */
interface DiseaseInfo {
  /** Disease id from the database; also the class directory name and seen-URL cache key. */
  id: string;
  /** Disease name from the database, used to build search queries. */
  name: string;
  /** Plant id from the database, used to build search queries. */
  plantId: string;
  /** Number of images found on disk for this class by the initial scan. */
  have: number;
  /** Images still missing: TARGET_PER_DISEASE minus `have` (always > 0 in the deficit list). */
  needed: number;
}

// ─── Helpers ────────────────────────────────────────────────────────────────

/** Return a promise that settles after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}

/**
 * Number of directory entries in `dir` whose name starts with "img_".
 * A missing or unreadable directory counts as 0.
 */
function countImagesInDir(dir: string): number {
  if (!existsSync(dir)) return 0;

  let matches = 0;
  try {
    for (const entryName of readdirSync(dir)) {
      if (entryName.startsWith("img_")) matches += 1;
    }
  } catch {
    return 0;
  }
  return matches;
}

/** Render a byte count as "N B", "N.N KB" or "N.N MB" (1024-based units). */
function formatBytes(bytes: number): string {
  const KIB = 1024;
  const MIB = KIB * KIB;

  if (bytes < KIB) return `${bytes} B`;

  const inKilobytes = bytes < MIB;
  const scaled = inKilobytes ? bytes / KIB : bytes / MIB;
  const unit = inKilobytes ? "KB" : "MB";
  return `${scaled.toFixed(1)} ${unit}`;
}

// ─── Seen-URLs Cache ──────────────────────────────────────────────────────

/**
 * Load the per-class seen-URLs cache from disk.
 * This prevents re-fetching the same URLs across runs.
 *
 * The file content is validated before use: anything that is not a JSON object
 * mapping class ids to arrays is discarded, and non-string array items are
 * dropped, so a hand-edited or truncated cache can never crash a later lookup.
 *
 * @returns Map of class id → URLs already collected; `{}` when the file is
 *          missing, unreadable, or not a JSON object.
 */
function loadSeenUrlsCache(): Record<string, string[]> {
  if (!existsSync(SEEN_CACHE_FILE)) return {};

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(SEEN_CACHE_FILE, "utf-8"));
  } catch {
    // Unreadable or malformed (e.g. a previous run was interrupted mid-write):
    // the cache is only an optimisation, so start over with an empty one.
    return {};
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return {};
  }

  const entries: Array<[string, string[]]> = [];
  for (const [classId, urls] of Object.entries(parsed)) {
    if (!Array.isArray(urls)) continue;
    entries.push([classId, urls.filter((u): u is string => typeof u === "string")]);
  }
  // fromEntries defines own properties, so odd keys cannot touch the prototype.
  return Object.fromEntries(entries);
}

/**
 * Persist the seen-URLs cache as pretty-printed JSON (2-space indent),
 * replacing the previous contents of SEEN_CACHE_FILE.
 */
function saveSeenUrlsCache(cache: Record<string, string[]>): void {
  const serialized = JSON.stringify(cache, null, 2);
  writeFileSync(SEEN_CACHE_FILE, serialized);
}

// ─── DuckDuckGo API ─────────────────────────────────────────────────────────

/**
 * Fetch the DuckDuckGo image-search HTML page for `query` and pull the `vqd`
 * token out of it; the token is required by the `i.js` results request.
 *
 * @throws Error when the page request is not OK or no token is found in the HTML.
 */
async function getVqdToken(query: string): Promise<string> {
  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

  const response = await fetch(searchPage, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  const body = await response.text();
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token for "${query}"`);
  }

  return found[1];
}

/**
 * Request one page of DuckDuckGo image results.
 *
 * HTTP 429 is retried after a 10 s pause, but only a bounded number of times:
 * the previous unbounded self-recursion could spin forever under a persistent
 * rate limit. Once the retries are used up the 429 is reported like any other
 * failure so the caller can move on.
 *
 * @param query Search text.
 * @param vqd   Token obtained from getVqdToken for the same query.
 * @param page  Value sent as the `p` query parameter.
 * @returns The `results` array of the response; `[]` on HTTP 403 or when the
 *          response carries no `results`.
 * @throws Error on any other non-OK status, on a 429 that outlives the
 *         retries, and on network/timeout errors raised by `fetch`.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  const MAX_RATE_LIMIT_RETRIES = 3;
  const RATE_LIMIT_PAUSE_MS = 10_000;

  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
    query,
  )}&vqd=${vqd}&o=json&p=${page}&f=,,,`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`,
      },
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
      return data.results ?? [];
    }

    if (res.status === 403) return [];

    if (res.status === 429 && attempt < MAX_RATE_LIMIT_RETRIES) {
      console.warn("    ⚠ DDG rate limited (429). Waiting 10s...");
      await sleep(RATE_LIMIT_PAUSE_MS);
      continue;
    }

    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }
}

/**
 * Gather up to `target` not-yet-seen image URLs for `query` from DuckDuckGo.
 *
 * Walks at most five result pages, pausing SEARCH_DELAY before each request.
 * Every accepted URL is added to `seenUrls` (the set is mutated). Paging stops
 * early on a request error, after three consecutive empty pages (reported as
 * `exhausted`), or after two consecutive pages where fewer than 5% of the
 * results were new.
 *
 * @returns The collected URLs and whether the source looks exhausted; a failed
 *          token request yields no URLs and `exhausted: true`.
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const PAGE_LIMIT = 5;
  const collected: string[] = [];

  let token: string;
  try {
    token = await getVqdToken(query);
  } catch (err) {
    console.warn(`    ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  let exhausted = false;
  let emptyStreak = 0;
  let staleStreak = 0;

  for (let page = 1; collected.length < target && page <= PAGE_LIMIT; page++) {
    await sleep(SEARCH_DELAY);

    let hits: DuckDuckGoImageResult[];
    try {
      hits = await searchImagesDuckDuckGo(query, token, page);
    } catch (err) {
      console.warn(`    ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    if (!hits || hits.length === 0) {
      emptyStreak += 1;
      if (emptyStreak >= 3) {
        exhausted = true;
        break;
      }
      continue; // try the next page
    }
    emptyStreak = 0;

    let fresh = 0;
    for (const hit of hits) {
      if (collected.length >= target) break;

      const candidate = hit.image || hit.url;
      if (!candidate || typeof candidate !== "string") continue;
      if (seenUrls.has(candidate)) continue;

      // Only keep strings that parse as absolute URLs.
      let parses = true;
      try {
        new URL(candidate);
      } catch {
        parses = false;
      }
      if (!parses) continue;

      seenUrls.add(candidate);
      collected.push(candidate);
      fresh += 1;
    }

    // Give up once two pages in a row bring almost nothing new.
    if (fresh / hits.length < 0.05) {
      staleStreak += 1;
      if (staleStreak >= 2) break;
    } else {
      staleStreak = 0;
    }
  }

  return { urls: collected.slice(0, target), exhausted };
}

// ─── iNaturalist API ───────────────────────────────────────────────────────

/**
 * Collect up to `target` not-yet-seen photo URLs from research-grade
 * iNaturalist observations matching `query` (one request, newest first).
 *
 * Fixes over the previous version:
 *  - The de-duplication check now tests the same (rewritten) URL that is
 *    stored in `seenUrls`; before, the un-rewritten URL was tested, so URLs
 *    cached by earlier runs never matched.
 *  - Any size-named photo file (square/small/medium/large) is upgraded to the
 *    original-size file, not only "medium".
 *    NOTE(review): size names taken from the iNaturalist photo URL scheme —
 *    confirm against the API reference.
 *
 * Accepted URLs are added to `seenUrls` (the set is mutated).
 *
 * @returns Collected URLs; `exhausted` is true when the request succeeded but
 *          yielded fewer than `target` URLs. A failed request or thrown error
 *          reports `exhausted: false`.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);

  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;

        // Rewrite first, then de-duplicate on the rewritten URL — that is the
        // form persisted in the seen-URLs cache.
        const fullUrl = photo.url.replace(/\/(?:square|small|medium|large)\./, "/original.");
        if (seenUrls.has(fullUrl)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network error, timeout or malformed JSON: keep whatever was gathered.
    return { urls: results, exhausted: false };
  }
}

// ─── Wikimedia Commons API ─────────────────────────────────────────────────

/**
 * Collect up to `target` not-yet-seen image URLs from Wikimedia Commons by
 * searching the File namespace (6) in pages of 50 hits.
 *
 * Fixes over the previous version:
 *  - The File namespace also holds PDFs, SVGs, audio and video. Hits whose
 *    title does not end in one of ALLOWED_EXTENSIONS are now skipped instead
 *    of being queued as image downloads.
 *  - Paging stops as soon as the response carries no `continue.sroffset`,
 *    rather than issuing one more request that returns nothing.
 *
 * Accepted URLs (Special:FilePath redirects) are added to `seenUrls`.
 *
 * @returns Collected URLs and `exhausted: true` when fewer than `target`
 *          could be gathered.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  let sroffset = 0;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
    });

    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");

        // Skip non-image files (PDF, SVG, OGG, ...) up front.
        if (!ALLOWED_EXTENSIONS.includes(extname(filename).toLowerCase())) continue;

        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          filename,
        )}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // No continuation offset means the result list is complete.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined) break;
      sroffset = nextOffset;
    } catch {
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}

// ─── Image Download ─────────────────────────────────────────────────────────

/**
 * Identify a JPEG, PNG or WebP payload from its leading magic bytes.
 * Returns the canonical extension, or null for anything else.
 */
function sniffImageExtension(data: Buffer): ".jpg" | ".png" | ".webp" | null {
  const hasBytes = (expected: number[], offset = 0): boolean =>
    expected.every((byte, i) => data[offset + i] === byte);

  if (hasBytes([0xff, 0xd8, 0xff])) return ".jpg";
  if (hasBytes([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return ".png";
  // WebP: "RIFF" <4-byte size> "WEBP"
  if (hasBytes([0x52, 0x49, 0x46, 0x46]) && hasBytes([0x57, 0x45, 0x42, 0x50], 8)) return ".webp";
  return null;
}

/**
 * Download `url` and store it next to `destPath`, with the extension replaced
 * by one that matches the payload.
 *
 * The payload must be between MIN_IMAGE_SIZE and MAX_IMAGE_SIZE bytes and must
 * start with JPEG, PNG or WebP magic bytes. Previously any non-HTML response
 * with an unrecognised type (PDF, SVG, GIF, error blobs) was written as
 * ".jpg", which put non-image files into the training set.
 *
 * @param url      Image URL to fetch.
 * @param destPath Target path; only its extension is rewritten.
 * @returns true when a file was written; false on any rejection or error
 *          (this function never throws).
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
    if (contentType.includes("text/html")) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    // Trust the bytes, not the URL or the Content-Type header.
    const detectedExt = sniffImageExtension(buffer);
    if (detectedExt === null) return false;

    // Keep the URL's own spelling (e.g. ".jpeg") only when it agrees with the
    // detected format; otherwise name the file after what it really is.
    const urlExt = extname(new URL(url).pathname).toLowerCase();
    const urlFormat = urlExt === ".jpeg" ? ".jpg" : urlExt;
    const ext =
      ALLOWED_EXTENSIONS.includes(urlExt) && urlFormat === detectedExt ? urlExt : detectedExt;

    const filePath = destPath.replace(/\.\w+$/, ext);
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    // Network failure, timeout, bad URL or write error: report as "not downloaded".
    return false;
  }
}

/**
 * Smallest index >= `startIndex` that is greater than every `img_<n>.*` file
 * already present in `classDir`, so new files never reuse an existing number.
 */
function firstFreeImageIndex(classDir: string, startIndex: number): number {
  let next = startIndex;
  try {
    for (const entry of readdirSync(classDir)) {
      const digits = /^img_(\d+)\./.exec(entry)?.[1];
      if (digits !== undefined) next = Math.max(next, Number(digits) + 1);
    }
  } catch {
    // Directory missing or unreadable: nothing on disk to collide with.
  }
  return next;
}

/**
 * Download `urls` into `classDir` as `img_<index>.<ext>`, CONCURRENT_DOWNLOADS
 * at a time.
 *
 * Fixes over the previous version:
 *  - Each URL's file index is fixed synchronously, before any download is
 *    awaited. Before, the shared counter was incremented only after the
 *    download finished, so every download of a chunk computed the same file
 *    name and they overwrote one another.
 *  - `startIndex` is treated as a lower bound and bumped past the highest
 *    index already on disk. Callers pass the current file count, which is
 *    lower than the highest index whenever earlier downloads failed and left
 *    gaps; using it directly overwrote existing images.
 *
 * @param urls       Image URLs to download.
 * @param classDir   Destination directory (must exist).
 * @param startIndex Lowest file index that may be used.
 * @returns Success/failure counts and `lastIndex`, the index following the
 *          last one assigned.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
  const baseIndex = firstFreeImageIndex(classDir, startIndex);

  for (let offset = 0; offset < urls.length; offset += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(offset, offset + CONCURRENT_DOWNLOADS);

    const outcomes = await Promise.all(
      chunk.map((url, position) => {
        // One index per URL, derived from its position rather than from
        // mutable shared state.
        const paddedIndex = String(baseIndex + offset + position).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        return downloadImage(url, destPath);
      }),
    );

    for (const ok of outcomes) {
      if (ok) downloaded++;
      else failed++;
    }

    // Refresh the progress line after every chunk.
    process.stdout.write(`\r    Progress: ${downloaded}/${urls.length} (${failed} failed)`);
  }
  console.log();

  return { downloaded, failed, lastIndex: baseIndex + urls.length };
}

// ─── Query Building ─────────────────────────────────────────────────────────

/**
 * Build the search queries for a disease class, most specific first.
 *
 * `plant` is passed as a plant id, which may be hyphenated; hyphens are turned
 * into spaces so the query reads as plain words — the same normalisation the
 * healthy-class queries apply. Plant values without hyphens produce exactly
 * the same queries as before.
 *
 * @param name  Disease name, used verbatim.
 * @param plant Plant id or name.
 * @returns Three query strings.
 */
function buildSearchQueries(name: string, plant: string): string[] {
  const plantLabel = plant.replace(/-/g, " ");
  return [
    `${name} ${plantLabel} leaf disease`,
    `${plantLabel} ${name} symptoms`,
    `${name} ${plantLabel}`,
  ];
}

/**
 * Build the four search queries used to find healthy-leaf photos of a plant.
 * Hyphens in `plant` are replaced by spaces first.
 */
function buildHealthyQueries(plant: string): string[] {
  const label = plant.split("-").join(" ");

  const queries: string[] = [];
  queries.push(`healthy ${label} leaf`);
  queries.push(`${label} leaf closeup`);
  queries.push(`healthy ${label} plant`);
  queries.push(`${label} foliage`);
  return queries;
}

// ─── Fill Logic ─────────────────────────────────────────────────────────────

/**
 * Gather up to `needed` candidate URLs for one class and download them into
 * `classDir` (created if absent).
 *
 * Sources are consulted in order and only while URLs are still missing:
 * DuckDuckGo with each query in turn (stopping early if it reports
 * exhaustion), then iNaturalist and Wikimedia Commons with the first query.
 *
 * @param diseaseId Class id; currently not used by the body.
 * @param queries   Search queries, most specific first.
 * @param needed    How many more images the class needs.
 * @param classDir  Destination directory for the class.
 * @param seenUrls  URLs already collected for this class; mutated by the collectors.
 * @returns Increase in the number of img_* files in `classDir`.
 */
async function fillClass(
  diseaseId: string,
  queries: string[],
  needed: number,
  classDir: string,
  seenUrls: Set<string>,
): Promise<number> {
  if (needed <= 0) return 0;

  mkdirSync(classDir, { recursive: true });

  const pending: string[] = [];
  const shortfall = (): number => needed - pending.length;

  // ── Source 1: DuckDuckGo (every query until satisfied or exhausted) ─────
  for (const query of queries) {
    if (shortfall() <= 0) break;
    process.stdout.write(`    DDG: "${query.substring(0, 40)}"... `);
    const found = await collectImagesDuckDuckGo(query, shortfall(), seenUrls);
    pending.push(...found.urls);
    console.log(`${found.urls.length} new`);
    if (found.exhausted) break;
  }

  // ── Sources 2 and 3: iNaturalist, then Wikimedia Commons (first query) ──
  const fallbacks = [
    { banner: `    iNat: Searching... `, search: searchImagesInaturalist },
    { banner: `    Commons: Searching... `, search: searchImagesCommons },
  ];
  for (const { banner, search } of fallbacks) {
    if (shortfall() <= 0) continue;
    process.stdout.write(banner);
    const found = await search(queries[0], shortfall(), seenUrls);
    pending.push(...found.urls);
    console.log(`${found.urls.length} new`);
  }

  if (pending.length === 0) {
    console.log(`    ✗ No new images found from any source`);
    return 0;
  }

  console.log(`    Downloading ${pending.length} images...`);
  const countBefore = countImagesInDir(classDir);
  const { downloaded, failed } = await downloadBatch(pending, classDir, countBefore);

  // Report the change actually visible on disk, not just the success count.
  const gained = countImagesInDir(classDir) - countBefore;
  const mark = downloaded > 0 ? "✓" : "✗";
  console.log(
    `    ${mark} Downloaded ${downloaded}/${pending.length}` +
      ` (${failed} failed, ${gained} new files)`,
  );

  return gained;
}

// ─── Directory Scanner ─────────────────────────────────────────────────────

/** Snapshot of how many images each class currently has on disk. */
interface ScanResult {
  /** Class directory name → image count; directories with no images are omitted */
  diseaseCounts: Map<string, number>;
  /** Image count of the healthy class directory (0 when absent) */
  healthyCount: number;
}

/**
 * Count the images in every non-hidden sub-directory of DATASET_DIR.
 * The HEALTHY_CLASS directory is reported separately from the disease classes.
 */
function scanDataset(): ScanResult {
  const diseaseCounts = new Map<string, number>();
  let healthyCount = 0;

  if (!existsSync(DATASET_DIR)) {
    return { diseaseCounts, healthyCount: 0 };
  }

  const classDirs = readdirSync(DATASET_DIR, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && !entry.name.startsWith("."),
  );

  for (const { name } of classDirs) {
    const imageCount = countImagesInDir(resolve(DATASET_DIR, name));
    if (name === HEALTHY_CLASS) {
      healthyCount = imageCount;
    } else if (imageCount > 0) {
      diseaseCounts.set(name, imageCount);
    }
  }

  return { diseaseCounts, healthyCount };
}

// ─── Main ───────────────────────────────────────────────────────────────────

/**
 * Script entry point: scan the dataset, compute per-class deficits against the
 * targets, fill them from the image sources, and print a summary.
 *
 * Fixes over the previous version:
 *  - The plant list for the healthy class is de-duplicated by plant id.
 *    Before, a Set was built from the disease-info objects, which are all
 *    distinct by identity, so nothing was de-duplicated and the 20-query cap
 *    was spent on repeats of the first few plants.
 *  - "Diseases at target" counts only classes known to the database, matching
 *    its denominator.
 *  - The never-read `totalFailed` accumulator is gone.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("TRAINING DATASET FILL — Gap-filling download");
  console.log("=".repeat(60));

  // Ensure dataset directory exists
  mkdirSync(DATASET_DIR, { recursive: true });

  // ── Step 1: Scan what we already have ────────────────────────────────────
  console.log("\nScanning existing dataset...");
  const { diseaseCounts, healthyCount } = scanDataset();
  console.log(`  Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`);

  // ── Step 2: Load disease info from DB ────────────────────────────────────
  console.log("\nLoading disease info from database...");
  const db = getDb();

  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
    })
    .from(diseases);

  // Build a deduplicated map: disease id → first disease info found
  const diseaseInfo = new Map<string, { name: string; plantId: string }>();
  for (const d of allDiseases) {
    if (!diseaseInfo.has(d.id)) {
      diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId });
    }
  }
  console.log(`  Loaded ${diseaseInfo.size} unique diseases from DB`);

  // ── Step 3: Build deficit list ──────────────────────────────────────────
  const deficits: DiseaseInfo[] = [];

  for (const [id, info] of diseaseInfo) {
    const have = diseaseCounts.get(id) ?? 0;
    const needed = TARGET_PER_DISEASE - have;
    if (needed > 0) {
      deficits.push({ id, name: info.name, plantId: info.plantId, have, needed });
    }
  }

  // Sort by deficit size (largest first) so we prioritize the neediest diseases
  deficits.sort((a, b) => b.needed - a.needed);

  const healthyDeficit = TARGET_HEALTHY - healthyCount;

  console.log(`\n${"=".repeat(60)}`);
  console.log("DEFICIT REPORT");
  console.log(`${"=".repeat(60)}`);
  console.log(`  Diseases needing images: ${deficits.length}/${diseaseInfo.size}`);
  console.log(`  Total images missing:   ${deficits.reduce((s, d) => s + d.needed, 0)}`);
  console.log(`  Healthy deficit:        ${Math.max(0, healthyDeficit)}`);
  console.log(`${"=".repeat(60)}`);

  if (deficits.length === 0 && healthyDeficit <= 0) {
    console.log("\n  ✓ Nothing to do — all targets met!\n");
    await closeDb();
    return;
  }

  // ── Step 4: Load seen-URLs cache ────────────────────────────────────────
  const seenUrlsCache = loadSeenUrlsCache();
  let totalDownloaded = 0;
  const startTime = Date.now();

  // ── Step 5: Fill disease deficits ───────────────────────────────────────
  if (deficits.length > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`);
    console.log("─".repeat(60));

    // Process in parallel batches
    for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) {
      const batch = deficits.slice(i, i + DISEASE_CONCURRENCY);
      const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
      const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY);

      console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`);

      await Promise.all(
        batch.map(async (d) => {
          const classDir = resolve(DATASET_DIR, d.id);
          const queries = buildSearchQueries(d.name, d.plantId);
          const seen = new Set<string>(seenUrlsCache[d.id] ?? []);

          console.log(
            `  [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
          );

          const gained = await fillClass(d.id, queries, d.needed, classDir, seen);

          // Persist this disease's seen URLs right away so an interrupted run
          // does not collect them again.
          seenUrlsCache[d.id] = Array.from(seen);
          saveSeenUrlsCache(seenUrlsCache);

          totalDownloaded += gained;
        }),
      );

      // Save seen cache after every batch
      saveSeenUrlsCache(seenUrlsCache);

      const elapsed = Math.round((Date.now() - startTime) / 1000);
      console.log(
        `  [Batch ${batchNum}/${totalBatches}] checkpoint — ` +
          `${totalDownloaded} downloaded so far (${elapsed}s elapsed)`,
      );
    }
  }

  // ── Step 6: Fill healthy deficit ────────────────────────────────────────
  if (healthyDeficit > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`);
    console.log("─".repeat(60));

    const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
    mkdirSync(healthyDir, { recursive: true });

    // Collect the distinct plant ids. De-duplicate the ids themselves: a Set
    // of the info objects would compare by identity and keep every entry.
    const allPlants = [...new Set([...diseaseInfo.values()].map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    const healthySeen = new Set<string>(seenUrlsCache[HEALTHY_CLASS] ?? []);
    const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir);
    const allUrls: string[] = [];

    // Try each source with up to 20 healthy queries
    const sources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    for (const source of sources) {
      if (allUrls.length >= healthyNeeded) break;
      console.log(`\n  Source: ${source.name}`);

      for (const query of allHealthyQueries.slice(0, 20)) {
        if (allUrls.length >= healthyNeeded) break;

        process.stdout.write(`    "${query}"... `);
        const result = await source.collector(query, healthyNeeded - allUrls.length, healthySeen);
        allUrls.push(...result.urls);
        console.log(`${result.urls.length} new`);
      }
    }

    if (allUrls.length > 0) {
      console.log(`\n  Downloading ${allUrls.length} healthy images...`);
      const startIdx = countImagesInDir(healthyDir);
      const { downloaded } = await downloadBatch(allUrls, healthyDir, startIdx);

      const newTotal = countImagesInDir(healthyDir);
      const gained = newTotal - healthyCount;
      totalDownloaded += gained;

      console.log(
        `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` +
          ` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`,
      );
    } else {
      console.log(`\n  ✗ No healthy images found`);
    }

    // Update seen-URLs cache
    seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen);
    saveSeenUrlsCache(seenUrlsCache);
  }

  // ── Summary ──────────────────────────────────────────────────────────────
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  // Final scan
  const finalScan = scanDataset();
  const totalHave = [...finalScan.diseaseCounts.values()].reduce((s, c) => s + c, 0);
  // Count only classes the database knows about, since the figure is shown
  // as a fraction of diseaseInfo.size.
  const atTarget = [...finalScan.diseaseCounts].filter(
    ([id, c]) => diseaseInfo.has(id) && c >= TARGET_PER_DISEASE,
  ).length;

  console.log("\n" + "=".repeat(60));
  console.log("  ✅ FILL COMPLETE");
  console.log("=".repeat(60));
  console.log(`  Time:              ${hrs}h ${mins % 60}m`);
  console.log(`  Diseases at target: ${atTarget}/${diseaseInfo.size}`);
  console.log(`  Total images:       ${totalHave}`);
  console.log(`  Healthy images:     ${finalScan.healthyCount}/${TARGET_HEALTHY}`);
  console.log(`  New downloads:      ${totalDownloaded}`);
  console.log(`  Dataset dir:        ${DATASET_DIR}/`);

  await closeDb();
  console.log("=".repeat(60));
}

// Run the script; any rejection from main() is logged and turned into exit code 1.
main().then(undefined, (fatal) => {
  console.error("\nFatal error:", fatal);
  process.exit(1);
});