plant-disease-id/apps/web/scripts/scrape-disease-images.ts

#!/usr/bin/env node
/**
 * Fetch disease images from Wikipedia using batch page-title queries.
 *
 * Strategy: Convert disease names to Wikipedia page titles, query 50
 * at a time with pageimages prop. Wikipedia resolves redirects automatically.
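 * Covers 10K+ diseases in ~200 primary API calls (~7 minutes of rate-limit
 * delay); rows the first pass misses trigger extra fallback queries by
 * scientific name and genus.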
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
 */

import "dotenv/config";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";

// English Wikipedia Action API endpoint that every image lookup is sent to.
const API = "https://en.wikipedia.org/w/api.php";
// Number of disease rows handled per loop iteration in main(), which is also
// the number of primary titles sent in one query.
const BATCH_SIZE = 50; // Max titles per query
// Pause, in milliseconds, inserted between consecutive row batches in main().
const DELAY_MS = 2000; // Between batches

/**
 * Build a Wikipedia-style page title from a free-form disease name.
 *
 * Surrounding whitespace is dropped, each whitespace-separated word is
 * capitalised (first character upper-case, remainder lower-case), the words
 * are joined with underscores, and any parentheses are stripped.
 */
function toPageTitle(name: string): string {
  const capitalised: string[] = [];
  for (const word of name.trim().split(/\s+/)) {
    const head = word.slice(0, 1).toUpperCase();
    const tail = word.slice(1).toLowerCase();
    capitalised.push(head + tail);
  }
  return capitalised.join("_").replace(/[()]/g, "");
}

/**
 * Fetch thumbnail URLs for up to 50 page titles in a single Wikipedia API call.
 *
 * @param titles Page titles to look up (underscores or spaces are both fine).
 * @returns Map from lower-cased title to thumbnail URL. Besides the canonical
 *   page titles reported by the API, the map also contains every requested
 *   spelling that the API normalized and every title it followed a redirect
 *   from, so callers can look results up by the title they asked for.
 *   An empty map is returned on a non-OK response or after five failed
 *   attempts (HTTP 429 or a thrown error).
 */
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
  type TitleAlias = { from?: string; to?: string };
  type WikiPage = { title?: string; thumbnail?: { source?: string } };
  type QueryResponse = {
    query?: {
      pages?: Record<string, WikiPage>;
      normalized?: TitleAlias[];
      redirects?: TitleAlias[];
    };
  };

  // Upper bound on normalized -> redirect -> ... hops followed per alias;
  // also protects against cycles created by lower-casing the keys.
  const MAX_ALIAS_HOPS = 4;

  // Nothing to ask for: skip the network round trip entirely.
  if (titles.length === 0) return new Map();

  const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;

  for (let attempt = 0; attempt < 5; attempt++) {
    try {
      const res = await fetch(url, {
        headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
      });
      if (res.status === 429) {
        // Exponential back-off, capped at one minute.
        const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
        console.log(`   429 — waiting ${wait / 1000}s...`);
        await new Promise((r) => setTimeout(r, wait));
        continue;
      }
      if (!res.ok) {
        console.warn(`   HTTP ${res.status} from Wikipedia, skipping batch`);
        return new Map();
      }

      const data = (await res.json()) as QueryResponse | null;
      const query = data?.query;

      // Thumbnails keyed by the canonical (post-redirect) page title.
      const thumbsByTitle = new Map<string, string>();
      for (const page of Object.values(query?.pages ?? {})) {
        // With format=json the missing/invalid flags are empty strings, so
        // test for key presence rather than truthiness.
        if (!page || "missing" in page || "invalid" in page) continue;
        if (typeof page.title !== "string") continue;
        const thumb = page.thumbnail?.source;
        if (thumb) {
          thumbsByTitle.set(page.title.replace(/_/g, " ").toLowerCase(), thumb);
        }
      }

      // Requested title -> normalized title -> redirect target. Both lists
      // are needed: with redirects=1 the page is reported under the redirect
      // target, not under the title that was asked for.
      const aliasTarget = new Map<string, string>();
      for (const alias of [...(query?.normalized ?? []), ...(query?.redirects ?? [])]) {
        if (typeof alias?.from !== "string" || typeof alias?.to !== "string") continue;
        const from = alias.from.toLowerCase();
        const to = alias.to.toLowerCase();
        if (from !== to && !aliasTarget.has(from)) aliasTarget.set(from, to);
      }

      const result = new Map(thumbsByTitle);
      for (const alias of aliasTarget.keys()) {
        if (result.has(alias)) continue;
        let target = aliasTarget.get(alias);
        for (
          let hops = 0;
          target !== undefined && !thumbsByTitle.has(target) && hops < MAX_ALIAS_HOPS;
          hops++
        ) {
          target = aliasTarget.get(target);
        }
        const thumb = target === undefined ? undefined : thumbsByTitle.get(target);
        if (thumb) result.set(alias, thumb);
      }

      return result;
    } catch (err: unknown) {
      // Network or JSON failure: report it, pause briefly, then retry.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`   request failed (attempt ${attempt + 1}/5): ${reason}`);
      await new Promise((r) => setTimeout(r, 2000));
    }
  }
  return new Map();
}

/**
 * Generate candidate Wikipedia page titles for a disease, most specific first.
 *
 * The list always starts with the title derived from the common name. When the
 * scientific name (after trimming) is longer than three characters it is added
 * as-is (e.g. "Phytophthora infestans"), followed by its first word, the genus
 * (e.g. "Alternaria"), if that is also longer than three characters.
 *
 * @param name Common disease name.
 * @param sciName Scientific name; may be empty.
 * @returns De-duplicated candidate titles in lookup priority order.
 */
function getTitleCandidates(name: string, sciName: string): string[] {
  const candidates = [toPageTitle(name)];

  // Trim before measuring or splitting: leading whitespace would otherwise
  // make the first split segment empty and silently drop the genus, and a
  // whitespace-only value would produce an empty candidate.
  const scientific = (sciName ?? "").trim();
  if (scientific.length > 3) {
    candidates.push(scientific);

    const [genus] = scientific.split(/\s+/);
    if (genus !== undefined && genus.length > 3) {
      candidates.push(genus);
    }
  }

  // A single-word scientific name equals its genus; the Set removes the repeat.
  return [...new Set(candidates)];
}

/**
 * Script entry point: find a Wikipedia thumbnail for every disease row whose
 * image_url is NULL or empty and write the URLs back to the database.
 *
 * Rows are processed BATCH_SIZE at a time. Each batch first queries the titles
 * derived from the common names, then queries the remaining candidates
 * (scientific name, genus) for rows the first pass did not match. Updates are
 * buffered and flushed once at least 100 are pending or on the final batch.
 *
 * @throws Error when DATABASE_URL is not set; database errors propagate too.
 */
async function main() {
  console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set");
  }

  const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

  const db = getDb();
  const rawClient = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  let found = 0;
  let total = 0;

  try {
    const rows = await db
      .select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
      .from(diseases)
      .where(sql`(image_url IS NULL OR image_url = '')`);
    total = rows.length;

    console.log(`📋 ${rows.length} diseases need images\n`);

    let updates: { id: string; url: string }[] = [];

    for (let i = 0; i < rows.length; i += BATCH_SIZE) {
      const chunk = rows.slice(i, i + BATCH_SIZE);
      const processed = Math.min(i + BATCH_SIZE, rows.length);
      const isLastChunk = processed >= rows.length;

      // Compute each row's candidates once; the first is the common-name title.
      const entries = chunk.map((row) => {
        const [primary = "", ...alternates] = getTitleCandidates(row.name, row.sciName || "");
        return { row, primary, alternates };
      });

      // First pass: common-name titles.
      const primaryTitles = [
        ...new Set(entries.map((e) => e.primary).filter((t) => t.length > 0)),
      ];
      const imageMap =
        primaryTitles.length > 0
          ? await batchFetchImages(primaryTitles)
          : new Map<string, string>();

      // Second pass: remaining candidates for rows the first pass missed.
      // Up to two alternates per row can exceed the 50 titles that
      // batchFetchImages accepts per call, so send them in slices.
      const altTitles = [
        ...new Set(
          entries
            .filter((e) => !imageMap.has(e.primary.toLowerCase()))
            .flatMap((e) => e.alternates)
            .filter((t) => t.length > 0),
        ),
      ];
      const secondPassMap = new Map<string, string>();
      for (let j = 0; j < altTitles.length; j += BATCH_SIZE) {
        if (j > 0) await sleep(DELAY_MS);
        const partial = await batchFetchImages(altTitles.slice(j, j + BATCH_SIZE));
        for (const [title, thumb] of partial) {
          secondPassMap.set(title, thumb);
        }
      }

      // Pick the first candidate, in priority order, that has a thumbnail.
      for (const { row, primary, alternates } of entries) {
        let imgUrl: string | undefined;
        for (const title of [primary, ...alternates]) {
          const key = title.toLowerCase();
          imgUrl = imageMap.get(key) || secondPassMap.get(key);
          if (imgUrl) break;
        }
        if (imgUrl) {
          updates.push({ id: row.id, url: imgUrl });
          found++;
        }
      }

      // Flush updates to DB when we have enough, and always on the last batch.
      if (updates.length >= 100 || (isLastChunk && updates.length > 0)) {
        await rawClient.batch(
          updates.map((u) => ({
            sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
            args: [u.url, u.id],
          })),
          "write",
        );
        updates = [];
      }

      // Progress
      const pct = ((processed / rows.length) * 100).toFixed(1);
      process.stdout.write(`  [${pct}%] ${processed}/${rows.length}  found=${found}\n`);

      // Rate limit
      if (!isLastChunk) {
        await sleep(DELAY_MS);
      }
    }
  } finally {
    // Release both database handles even when a query or write fails.
    rawClient.close();
    closeDb();
  }

  console.log(`\n✅ Done! Found images: ${found} / ${total}`);
}

// Run the script; any rejection is reported and turned into exit code 1.
const reportFatal = (reason: unknown): never => {
  console.error("❌ Fatal:", reason);
  process.exit(1);
};

main().then(undefined, reportFatal);