plant-disease-id/apps/web/scripts/scrape-training-dataset.ts

#!/usr/bin/env node
/**
 * scrape-training-dataset.ts
 *
 * Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
 *
 * Targets:
 *   - 200 images per disease class (93 diseases)
 *   - 400 images for the "healthy" class
 *   - Full resolution images stored in data/dataset/{class_id}/
 *
 * DuckDuckGo approach (no API key needed):
 *   1. Fetch the main search page to extract a vqd (query) token
 *   2. Use the vqd token to paginate through image results
 *   3. Download each image to the dataset directory
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
 *
 * Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
 */

import "dotenv/config";
import {
  existsSync,
  mkdirSync,
  readFileSync,
  readdirSync,
  renameSync,
  statSync,
  writeFileSync,
} from "fs";
import { extname, join, resolve } from "path";

// ─── Config ─────────────────────────────────────────────────────────────────

/** Seed file with the disease classes; parsed in main() as DiseaseSeed[]. */
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
/** Seed file with the plants; parsed in main() as PlantSeed[]. */
const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");

/** Root output directory; each class is stored in its own subdirectory. */
const DATASET_DIR = resolve(__dirname, "../data/dataset");
/** Resume state, read by loadProgress() and rewritten by saveProgress(). */
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

/** Target images per disease class */
const TARGET_PER_DISEASE = 200;

/** Target images for the "healthy" class (2× normal) */
const TARGET_HEALTHY = 400;

/** Delay before each DuckDuckGo search API call (ms) */
const SEARCH_DELAY = 1500;

/** Delay applied after each image download attempt (ms) */
const DOWNLOAD_DELAY = 300;

/** Max concurrent downloads (chunk size used by downloadBatch) */
const CONCURRENT_DOWNLOADS = 5;

/** Minimum image size in bytes to accept (reject tiny placeholders) */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

/**
 * Allowed image content types (matched as substrings of the Content-Type header).
 * NOTE(review): "image/gif" is accepted here although ALLOWED_EXTENSIONS has no
 * ".gif" entry — confirm whether GIFs are meant to be part of the dataset.
 */
const ALLOWED_CONTENT_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/webp", "image/gif"];

/** Allowed file extensions (lower-case, including the leading dot) */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User agent sent with every search and download request */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

// ─── Types ──────────────────────────────────────────────────────────────────

/**
 * One entry of diseases.json as consumed by this script.
 * Only `id`, `plantId` and `name` are read here; any other keys present in the
 * seed file are admitted by the index signature and ignored.
 */
interface DiseaseSeed {
  /** Class identifier; used as the dataset subdirectory name and the progress key. */
  id: string;
  /** `id` of the plant this disease belongs to; also the fallback plant name in search queries. */
  plantId: string;
  /** Disease name used in search queries and log output. */
  name: string;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [key: string]: any;
}

/**
 * One entry of plants.json as consumed by this script.
 * Extra keys in the seed file are admitted by the index signature and ignored.
 */
interface PlantSeed {
  /** Plant identifier; disease entries refer to it through `plantId`. */
  id: string;
  /** Plant name inserted into the search queries. */
  commonName: string;
  /** Declared for completeness; not read anywhere in this script. */
  scientificName: string;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [key: string]: any;
}

/**
 * Shape assumed for one element of the `results` array returned by the image
 * search endpoint. Only `image` and `url` are read by this script; the meaning
 * of the remaining fields is presumed from their names — TODO confirm against
 * real responses.
 */
interface DuckDuckGoImageResult {
  /** Preferred download URL. */
  image: string;
  title: string;
  /** Used as the download URL when `image` is empty. */
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}

/** Resume state kept for a single dataset class inside the progress file. */
interface ClassProgress {
  /** Images stored so far; compared against the class target and passed to downloadBatch as the start index. */
  count: number;
  /** Cumulative number of successful downloads. */
  downloaded: number;
  /** Cumulative number of failed download attempts. */
  failed: number;
  /** Initialised to 0 and never updated by this script. */
  skipped: number;
  /** URLs we've already seen (to avoid duplicates) */
  seenUrls: string[];
  /** Whether we've exhausted search results */
  exhausted: boolean;
}

/** Root object persisted to the progress file. */
interface Progress {
  /** ISO-8601 timestamp written on every saveProgress() call. */
  lastUpdated: string;
  /** Per-class state, keyed by class id (a disease id or the healthy class id). */
  classes: Record<string, ClassProgress>;
}

/** Class ID for healthy plants; used as both the progress key and the dataset subdirectory name. */
const HEALTHY_CLASS = "healthy";

// ─── DuckDuckGo API ─────────────────────────────────────────────────────────

/**
 * Obtain the `vqd` token that DuckDuckGo embeds in its search page for a query.
 * The image endpoint is called with this token.
 *
 * @param query - Search phrase to request a token for.
 * @returns The token captured from the page HTML.
 * @throws Error when the page request is not OK or no token can be found.
 */
async function getVqdToken(query: string): Promise<string> {
  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

  const response = await fetch(searchPage, {
    signal: AbortSignal.timeout(15_000),
    headers: { "User-Agent": UA, Accept: "text/html" },
  });
  if (!response.ok) throw new Error(`Failed to get vqd token: ${response.status}`);

  const body = await response.text();

  // The token appears as vqd='<token>', vqd="<token>" or "vqd":"<token>".
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
  }
  return found[1];
}

/**
 * Fetch one page of DuckDuckGo image results from the `i.js` endpoint.
 *
 * A 429 response is retried after a pause, but only a bounded number of times,
 * so sustained rate limiting surfaces as an error instead of retrying forever.
 * A 403 response is treated as "no more pages" and yields an empty list.
 *
 * @param query - Search phrase (URL-encoded here).
 * @param vqd - Token previously obtained for this query.
 * @param page - Value sent as the `p` query parameter.
 * @returns The `results` array of the JSON response, or `[]` when it is absent.
 * @throws Error on any other non-OK status, or when rate limiting persists.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  const MAX_RATE_LIMIT_RETRIES = 5;
  const RATE_LIMIT_WAIT_MS = 10_000;

  const encodedQuery = encodeURIComponent(query);
  // NOTE(review): `p` is used as the page number here. Confirm that this
  // parameter really paginates (some clients page with an `s` result offset);
  // if it does not, every call returns the same results.
  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${encodeURIComponent(vqd)}&o=json&p=${page}&f=,,,`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`,
      },
      // A fresh timeout per attempt; a shared signal would already be aborted on retry.
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] } | null;
      return Array.isArray(data?.results) ? data.results : [];
    }

    if (res.status === 429) {
      if (attempt >= MAX_RATE_LIMIT_RETRIES) {
        throw new Error(`DuckDuckGo search failed: ${res.status}`);
      }
      console.warn("  ⚠ Rate limited (429). Waiting 10s...");
      await sleep(RATE_LIMIT_WAIT_MS);
      continue;
    }

    if (res.status === 403) {
      console.warn("  ⚠ Forbidden (403). Token may have expired.");
      return []; // Token expired — no more pages
    }

    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }
}

/** True when `candidate` is a non-empty string that parses as an http(s) URL. */
function isHttpUrl(candidate: unknown): candidate is string {
  if (typeof candidate !== "string" || candidate === "") return false;
  try {
    const { protocol } = new URL(candidate);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
}

/**
 * Search DuckDuckGo images for `query`, paging until `target` new URLs have
 * been gathered or the query stops producing anything new.
 *
 * Paging stops (and the query is reported as exhausted) after 3 consecutive
 * empty pages, or after 10 consecutive pages that contain only URLs already in
 * `seenUrls` — without that second limit a search that keeps repeating the
 * same results would loop forever. A search error stops paging without
 * marking the query exhausted. Results whose URL is missing or not a valid
 * http(s) URL are skipped instead of aborting the run.
 *
 * @param query - Search phrase.
 * @param target - Maximum number of new URLs to return.
 * @param seenUrls - URLs to ignore; every URL returned is added to this set.
 * @returns The new URLs (at most `target`) and whether the query is exhausted.
 */
async function collectImages(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const MAX_EMPTY_PAGES = 3;
  const MAX_STALE_PAGES = 10;

  // Nothing requested: avoid spending a token request on it.
  if (target <= 0) return { urls: [], exhausted: false };

  // Get vqd token
  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(`  ⚠ Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  const results: string[] = [];
  let page = 1;
  let exhausted = false;
  let consecutiveEmpty = 0;
  let consecutiveStale = 0;

  while (results.length < target) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(`  ⚠ Search error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    if (pageResults.length === 0) {
      consecutiveEmpty++;
      if (consecutiveEmpty >= MAX_EMPTY_PAGES) {
        exhausted = true;
        break;
      }
      page++;
      continue;
    }
    consecutiveEmpty = 0;

    let newCount = 0;
    for (const r of pageResults) {
      if (results.length >= target) break;

      const imgUrl = r.image || r.url;

      // Skip unusable URLs and anything we have already queued or downloaded.
      if (!isHttpUrl(imgUrl) || seenUrls.has(imgUrl)) continue;

      seenUrls.add(imgUrl);
      results.push(imgUrl);
      newCount++;
    }

    if (newCount === 0) {
      // The page held nothing new. Tolerate a run of such pages (a resumed
      // session has to page past what it saw before), but not indefinitely.
      consecutiveStale++;
      if (consecutiveStale >= MAX_STALE_PAGES) {
        exhausted = true;
        break;
      }
    } else {
      consecutiveStale = 0;
    }

    page++;
  }

  return { urls: results.slice(0, target), exhausted };
}

// ─── Image Download ─────────────────────────────────────────────────────────

/**
 * Map a lower-cased Content-Type header value to the extension the payload
 * should be stored under, or null when the type is not recognised.
 */
function extensionForContentType(contentType: string): string | null {
  if (contentType.includes("jpeg") || contentType.includes("jpg")) return ".jpg";
  if (contentType.includes("png")) return ".png";
  if (contentType.includes("webp")) return ".webp";
  if (contentType.includes("gif")) return ".gif";
  return null;
}

/**
 * Download a single image and write it next to `destPath`.
 *
 * The response is rejected unless it is OK, its Content-Type matches
 * ALLOWED_CONTENT_TYPES, and its size lies within
 * [MIN_IMAGE_SIZE, MAX_IMAGE_SIZE]. The extension of `destPath` is replaced by
 * one derived from the Content-Type, so the file name reflects what the server
 * actually sent rather than what the URL claims. The URL extension is only a
 * fallback.
 *
 * @param url - Image URL to fetch.
 * @param destPath - Target path; its extension is replaced (or added if missing).
 * @returns true when a file was written; false on any rejection or error.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
      signal: AbortSignal.timeout(15_000),
    });

    if (!res.ok) return false;

    // Media types are case-insensitive, so normalise before matching.
    const contentType = (res.headers.get("content-type") || "").toLowerCase();
    if (!ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t))) {
      return false;
    }

    // Cheap pre-check on the declared size; a missing header parses to 0 and is
    // only caught by the buffer check below.
    const contentLength = parseInt(res.headers.get("content-length") || "0", 10);
    if (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) return false;
    if (contentLength > MAX_IMAGE_SIZE) return false;

    const buffer = Buffer.from(await res.arrayBuffer());

    // Double-check actual buffer size
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    // Content-Type is authoritative; fall back to the URL, then to ".jpg".
    let ext = extensionForContentType(contentType);
    if (ext === null) {
      const urlExt = extname(new URL(url).pathname).toLowerCase();
      ext = ALLOWED_EXTENSIONS.includes(urlExt) ? urlExt : ".jpg";
    }

    const trailingExt = /\.\w+$/;
    const filePath = trailingExt.test(destPath)
      ? destPath.replace(trailingExt, ext)
      : destPath + ext;
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    // Best effort: network, timeout and filesystem errors all count as a failed download.
    return false;
  }
}

/**
 * Return the first image index that is safe to use in `classDir`: at least
 * `startIndex`, and above every `img_<n>.*` file already present.
 */
function nextFreeImageIndex(classDir: string, startIndex: number): number {
  let next = startIndex;
  for (const name of readdirSync(classDir)) {
    const digits = /^img_(\d+)\./.exec(name)?.[1];
    if (digits !== undefined) {
      next = Math.max(next, Number.parseInt(digits, 10) + 1);
    }
  }
  return next;
}

/**
 * Download `urls` into `classDir` in chunks of CONCURRENT_DOWNLOADS, pausing
 * DOWNLOAD_DELAY after each attempt.
 *
 * Every URL gets its own `img_NNNN` file name. The index is claimed
 * synchronously before the first await, because all callbacks of a chunk start
 * before any of them finishes. Failed downloads still consume an index, so
 * numbering may have gaps. The class directory is created if missing, and
 * numbering never starts below an index already present on disk, so a resumed
 * run cannot overwrite earlier files.
 *
 * @param urls - Image URLs to download.
 * @param classDir - Destination directory for this class.
 * @param startIndex - Lowest index the caller wants to use.
 * @returns Success and failure counts, plus the next unused index.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;

  mkdirSync(classDir, { recursive: true });
  let index = nextFreeImageIndex(classDir, startIndex);

  // Process in chunks to control concurrency
  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const outcomes = await Promise.all(
      chunk.map(async (url) => {
        // Claim the index now; reading it here and incrementing after the
        // awaits would hand the same file name to the whole chunk.
        const fileIndex = index++;
        const paddedIndex = String(fileIndex).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);

        const success = await downloadImage(url, destPath);
        await sleep(DOWNLOAD_DELAY);
        return success;
      }),
    );

    for (const success of outcomes) {
      if (success) downloaded++;
      else failed++;
    }
  }

  return { downloaded, failed, lastIndex: index };
}

// ─── Progress Tracking ──────────────────────────────────────────────────────

/**
 * Load the resume state from PROGRESS_FILE.
 *
 * @returns The stored progress, or a fresh empty state when the file does not
 *   exist. A stored object lacking `lastUpdated` or `classes` is completed
 *   with defaults so later lookups cannot hit `undefined`.
 * @throws Error naming the file when it cannot be read, is not valid JSON, or
 *   does not contain a JSON object.
 */
function loadProgress(): Progress {
  if (!existsSync(PROGRESS_FILE)) {
    return { lastUpdated: new Date().toISOString(), classes: {} };
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8"));
  } catch (err) {
    const reason = err instanceof Error ? err.message : "unknown";
    throw new Error(
      `Progress file ${PROGRESS_FILE} is unreadable or corrupt (${reason}). Repair or delete it to start over.`,
    );
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(
      `Progress file ${PROGRESS_FILE} is corrupt: expected a JSON object. Repair or delete it to start over.`,
    );
  }

  const stored = parsed as Partial<Progress>;
  const hasClasses =
    typeof stored.classes === "object" && stored.classes !== null && !Array.isArray(stored.classes);

  return {
    lastUpdated:
      typeof stored.lastUpdated === "string" ? stored.lastUpdated : new Date().toISOString(),
    classes: hasClasses ? (stored.classes as Progress["classes"]) : {},
  };
}

/**
 * Persist the resume state to PROGRESS_FILE, stamping `lastUpdated` first.
 *
 * The JSON is written to a sibling temporary file and then renamed over the
 * real one, so an interrupt during the write cannot leave a truncated
 * progress file behind.
 *
 * @param progress - State to persist; its `lastUpdated` field is overwritten.
 */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();

  const tempFile = `${PROGRESS_FILE}.tmp`;
  writeFileSync(tempFile, JSON.stringify(progress, null, 2));
  renameSync(tempFile, PROGRESS_FILE);
}

/**
 * Look up the progress entry for `classId`, creating and registering a zeroed
 * entry on first access.
 *
 * @param progress - Progress state to read from (and insert into).
 * @param classId - Key of the class.
 * @returns The entry stored in `progress.classes` (same object, not a copy).
 */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  const existing = progress.classes[classId];
  if (existing) return existing;

  const created: ClassProgress = {
    count: 0,
    downloaded: 0,
    failed: 0,
    skipped: 0,
    seenUrls: [],
    exhausted: false,
  };
  progress.classes[classId] = created;
  return progress.classes[classId];
}

// ─── Search Query Building ──────────────────────────────────────────────────

/**
 * Build the search phrases tried, in order, for one disease class.
 *
 * @param disease - Disease whose `name` goes into the phrases.
 * @param plant - Owning plant; when it is null or has an empty `commonName`,
 *   the disease's `plantId` stands in for the plant name.
 * @returns Four phrases, most specific first.
 */
function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
  const diseaseName = disease.name;
  const host = plant && plant.commonName ? plant.commonName : disease.plantId;

  const phrases: string[] = [];
  phrases.push(`${diseaseName} ${host} leaf disease`);
  phrases.push(`${host} ${diseaseName} symptoms`);
  phrases.push(`${diseaseName} plant disease`);
  phrases.push(`${host} diseased leaf`);
  return phrases;
}

/**
 * Build the search phrases used to find healthy specimens of one plant.
 *
 * @param plant - Plant whose `commonName` goes into the phrases.
 * @returns Four phrases, in the order they should be tried.
 */
function buildHealthyQueries(plant: PlantSeed): string[] {
  const label = plant.commonName;
  const patterns: Array<(name: string) => string> = [
    (name) => `healthy ${name} leaf`,
    (name) => `${name} leaf closeup`,
    (name) => `healthy ${name} plant`,
    (name) => `${name} foliage`,
  ];
  return patterns.map((render) => render(label));
}

// ─── Dataset Collection ─────────────────────────────────────────────────────

/**
 * Collect and download images for one dataset class, updating and saving its
 * progress entry.
 *
 * Does nothing when the class already holds `target` images or was previously
 * marked exhausted. Otherwise the queries are run in order until enough new
 * URLs are found to fill the gap between the stored count and `target`, and
 * those URLs are downloaded into `classDir`. Only the missing number of images
 * is requested, so a resumed run does not overshoot the target.
 *
 * The class is marked exhausted only when every query was run, each one
 * reported exhaustion, and the gap still could not be filled. One dry query
 * does not stop the class from being retried on a later run.
 *
 * @param classId - Progress key of the class.
 * @param queries - Search phrases to try, in order.
 * @param target - Total number of images wanted for the class.
 * @param progress - Shared progress state (mutated and persisted).
 * @param classDir - Directory the images are written to (created if missing).
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
): Promise<void> {
  const cp = getClassProgress(progress, classId);
  const seenUrls = new Set(cp.seenUrls);

  if (cp.count >= target) {
    console.log(`  ✓ Already have ${cp.count}/${target} images`);
    return;
  }

  if (cp.exhausted) {
    console.log(`  ✓ Already exhausted search results (${cp.count}/${target} images)`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  // Only the shortfall is needed, not the full target.
  const needed = target - cp.count;
  const totalUrls: string[] = [];
  let queriesRun = 0;
  let queriesExhausted = 0;

  // Search with each query until we hit the target
  for (const query of queries) {
    if (totalUrls.length >= needed) break;

    console.log(`  Searching: "${query}"...`);
    const result = await collectImages(query, needed - totalUrls.length, seenUrls);
    queriesRun++;
    if (result.exhausted) queriesExhausted++;

    totalUrls.push(...result.urls);
    cp.seenUrls = Array.from(seenUrls);
  }

  const exhausted =
    totalUrls.length < needed &&
    queriesRun > 0 &&
    queriesRun === queries.length &&
    queriesExhausted === queriesRun;

  if (totalUrls.length === 0) {
    cp.exhausted = exhausted;
    saveProgress(progress);
    console.log(`  ✗ No images found for "${classId}"`);
    return;
  }

  console.log(`  Found ${totalUrls.length} unique image URLs. Downloading...`);

  // Download the images
  const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);

  cp.count += downloaded;
  cp.downloaded += downloaded;
  cp.failed += failed;
  cp.exhausted = exhausted;

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    `  ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}

// ─── Main ───────────────────────────────────────────────────────────────────

/**
 * Script entry point: scrape every disease class, then the shared "healthy"
 * class, and print a summary.
 *
 * Phase 1 runs collectClassImages() once per entry of diseases.json.
 * Phase 2 pools the healthy-plant queries of every plant into one class. It
 * creates the healthy directory before downloading, requests only the images
 * still missing, and keeps trying further queries after one runs dry. The
 * class is marked exhausted only when all queries ran dry without filling the
 * gap.
 */
async function main(): Promise<void> {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR");
  console.log("=".repeat(60));

  // Load knowledge base
  const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
  const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];

  const plantMap = new Map<string, PlantSeed>();
  for (const p of plants) {
    plantMap.set(p.id, p);
  }

  console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
  console.log(
    `Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
  );
  console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
  console.log(`Output: ${DATASET_DIR}/`);
  console.log("");

  // Load progress (the dataset root must exist before the progress file can be written)
  mkdirSync(DATASET_DIR, { recursive: true });
  const progress = loadProgress();

  const startTime = Date.now();

  // ── Phase 1: Disease classes ──────────────────────────────────────────────

  console.log("─".repeat(60));
  console.log("PHASE 1: Disease Images");
  console.log("─".repeat(60));

  for (let i = 0; i < diseases.length; i++) {
    const disease = diseases[i];
    const plant = plantMap.get(disease.plantId) ?? null;
    const classDir = resolve(DATASET_DIR, disease.id);
    const queries = buildSearchQueries(disease, plant);

    const pct = Math.round((i / diseases.length) * 100);
    console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);

    await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
  }

  // ── Phase 2: Healthy class ────────────────────────────────────────────────

  console.log("\n" + "─".repeat(60));
  console.log("PHASE 2: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const healthyCp = getClassProgress(progress, HEALTHY_CLASS);
  const healthySeen = new Set(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
    console.log(`\n  ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY} healthy images`);
  } else {
    // Without this directory every write would fail and count as a failed download.
    mkdirSync(healthyDir, { recursive: true });

    // Build a pool of healthy plant queries
    const allHealthyQueries: string[] = [];
    for (const plant of plants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    // Only the shortfall is needed, not the full target.
    const healthyNeeded = TARGET_HEALTHY - healthyCp.count;
    const totalHealthyUrls: string[] = [];
    let queriesRun = 0;
    let queriesExhausted = 0;

    for (const query of allHealthyQueries) {
      if (totalHealthyUrls.length >= healthyNeeded) break;

      console.log(`\n  Searching: "${query}"...`);
      const result = await collectImages(
        query,
        healthyNeeded - totalHealthyUrls.length,
        healthySeen,
      );
      queriesRun++;
      // One dry query says nothing about the other plants, so keep going.
      if (result.exhausted) queriesExhausted++;

      totalHealthyUrls.push(...result.urls);
    }

    const healthyExhausted =
      totalHealthyUrls.length < healthyNeeded &&
      queriesRun === allHealthyQueries.length &&
      queriesExhausted === queriesRun;

    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
      console.log(`\n  Found ${totalHealthyUrls.length} healthy image URLs. Downloading...`);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
        healthyCp.count,
      );

      healthyCp.count += downloaded;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;
      healthyCp.exhausted = healthyExhausted;

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
        `  Got ${downloaded} images (${failed} failed). Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
      healthyCp.exhausted = healthyExhausted;
      console.log(`  ✗ No healthy images found`);
    }

    saveProgress(progress);
  }

  // ── Summary ────────────────────────────────────────────────────────────────

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const secs = elapsed % 60;

  let totalDownloaded = 0;
  let totalFailed = 0;
  let totalTarget = 0;

  // Totals cover every class recorded in the progress file, including earlier runs.
  for (const [classId, cp] of Object.entries(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
    totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
  }

  const totalSize = await getDatasetSize();
  const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);

  console.log("\n" + "=".repeat(60));
  console.log("COMPLETE");
  console.log("=".repeat(60));
  console.log(`  Time:       ${mins}m ${secs}s`);
  console.log(`  Downloaded: ${totalDownloaded} images`);
  console.log(`  Failed:     ${totalFailed} images`);
  console.log(`  Target:     ${totalTarget} images`);
  console.log(`  Dataset size: ${sizeGb} GB`);
  console.log(`  Dataset location: ${DATASET_DIR}/`);
  console.log("");
  console.log("Next steps:");
  console.log("  1. Run the fine-tuning script to train on this dataset");
  console.log("  2. The fine-tuning script will resize to 160×160 and augment");
  console.log("=".repeat(60));
}

/**
 * Sum the on-disk size, in bytes, of every class directory under DATASET_DIR.
 * Dot-prefixed entries and loose files directly in the root are not counted.
 *
 * @returns Total bytes, or 0 when the dataset directory does not exist.
 */
async function getDatasetSize(): Promise<number> {
  if (!existsSync(DATASET_DIR)) return 0;

  return readdirSync(DATASET_DIR, { withFileTypes: true })
    .filter((item) => !item.name.startsWith(".") && item.isDirectory())
    .reduce((bytes, item) => bytes + dirSize(resolve(DATASET_DIR, item.name)), 0);
}

/**
 * Recursively add up the sizes of all regular files below `dirPath`.
 * Best effort: if listing or stat-ing fails, counting for this directory stops
 * and whatever was summed up to that point is returned.
 *
 * @param dirPath - Directory to measure.
 * @returns Size in bytes (0 when the directory cannot be read).
 */
function dirSize(dirPath: string): number {
  let bytes = 0;
  try {
    for (const item of readdirSync(dirPath, { withFileTypes: true })) {
      const itemPath = join(dirPath, item.name);
      if (item.isFile()) {
        bytes += statSync(itemPath).size;
        continue;
      }
      if (item.isDirectory()) {
        bytes += dirSize(itemPath);
      }
    }
  } catch {
    // Unreadable directory or entry: keep the partial sum.
  }
  return bytes;
}

/**
 * Pause for the given number of milliseconds.
 *
 * @param ms - Delay passed to setTimeout.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

// Run the script; any error escaping main() is reported and turns into exit code 1.
const onFatal = (reason: unknown): never => {
  console.error("Fatal error:", reason);
  process.exit(1);
};

main().catch(onFatal);