ooooeee

2026-06-06 10:15:53 -04:00
parent 71d7a9d6f0
commit 78220d3568
11 changed files with 1315 additions and 335 deletions
--- a/apps/web/scripts/scrape-disease-images.ts
+++ b/apps/web/scripts/scrape-disease-images.ts
@@ -1,13 +1,12 @@
 #!/usr/bin/env node
 /**
- * Fetch disease images from Wikipedia/Wikimedia Commons.
+ * Fetch disease images from Wikipedia using batch page-title queries.
 *
- * For each disease in the database, searches Wikipedia for its page
- * and retrieves the main infobox image.
+ * Strategy: Convert disease names to Wikipedia page titles, query 50
+ * at a time with pageimages prop. Wikipedia resolves redirects automatically.
+ * Covers 10K+ diseases in ~200 batches (~7 minutes); a batch issues a second lookup for alternate titles when needed.
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
- *
- * Rate-limited to 1 request per 300ms to be respectful.
 */

 import "dotenv/config";
@@ -16,200 +15,205 @@ import { sql } from "drizzle-orm";
 import { getDb, closeDb } from "../src/lib/db/index";
 import { diseases } from "../src/lib/db/schema";

-const WIKI_API = "https://en.wikipedia.org/w/api.php";
-const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
-const MIN_DELAY_MS = 350; // Be respectful
+const API = "https://en.wikipedia.org/w/api.php";
+const BATCH_SIZE = 50; // Max titles per query
+const DELAY_MS = 2000; // Between batches

-let lastCall = 0;
-
-async function rateLimit() {
-  const now = Date.now();
-  const elapsed = now - lastCall;
-  if (elapsed < MIN_DELAY_MS) {
-    await new Promise((r) => setTimeout(r, MIN_DELAY_MS - elapsed));
-  }
-  lastCall = Date.now();
+/** Convert disease name to Wikipedia page title format */
+function toPageTitle(name: string): string {
+  return name
+    .trim()
+    .replace(/\s+/g, " ")
+    .split(" ")
+    .map((w) => w.charAt(0).toUpperCase() + w.slice(1).toLowerCase())
+    .join("_")
+    .replace(/[()]/g, "");
 }

-interface WikiSearchResult {
-  title: string;
-  pageid: number;
-}
+/** Fetch thumbnails for up to 50 page titles in one call */
+async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
+  const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;

-async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
-  await rateLimit();
-  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
-  try {
-    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
-    const data = await res.json() as any;
-    const results = data?.query?.search;
-    if (results && results.length > 0) {
-      return { title: results[0].title, pageid: results[0].pageid };
-    }
-  } catch {
-    // ignore
-  }
-  return null;
-}
-
-async function getPageImage(title: string): Promise<string | null> {
-  await rateLimit();
-  const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
-  try {
-    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
-    const data = await res.json() as any;
-    const pages = data?.query?.pages;
-    if (pages) {
-      const page = Object.values(pages)[0] as any;
-      if (page?.thumbnail?.source) {
-        return page.thumbnail.source;
+  for (let attempt = 0; attempt < 5; attempt++) {
+    try {
+      const res = await fetch(url, {
+        headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
+      });
+      if (res.status === 429) {
+        const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
+        console.log(`   429 — waiting ${wait / 1000}s...`);
+        await new Promise((r) => setTimeout(r, wait));
+        continue;
      }
+      if (!res.ok) return new Map();
+      const data = (await res.json()) as any;
+      const pages = data?.query?.pages;
+      const result = new Map<string, string>();
+
+      if (pages) {
+        for (const [, page] of Object.entries(pages) as any) {
+          if (page?.missing || page?.invalid) continue;
+          const originalTitle = page.title.replace(/_/g, " ");
+          const thumb = page?.thumbnail?.source;
+          if (thumb) {
+            result.set(originalTitle.toLowerCase(), thumb);
+          }
+        }
+      }
+
+      // Alias each requested title to its normalized form (only query.normalized is read here)
+      const normalized = data?.query?.normalized;
+      if (normalized) {
+        for (const n of normalized) {
+          const from = n.from.toLowerCase();
+          const to = n.to.toLowerCase();
+          // If we have a result for the canonical name, also map the original
+          if (result.has(to) && !result.has(from)) {
+            result.set(from, result.get(to)!);
+          }
+        }
+      }
+
+      return result;
+    } catch {
+      await new Promise((r) => setTimeout(r, 2000));
    }
-  } catch {
-    // ignore
  }
-  return null;
+  return new Map();
 }

-async function searchCommons(term: string): Promise<string | null> {
-  await rateLimit();
-  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=3&origin=*`;
-  try {
-    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
-    const data = await res.json() as any;
-    const results = data?.query?.search;
-    if (results && results.length > 0) {
-      // Try to get thumbnail for best match
-      for (const r of results.slice(0, 2)) {
-        const imgUrl = await getCommonsImage(r.title);
-        if (imgUrl) return imgUrl;
-      }
-    }
-  } catch {
-    // ignore
-  }
-  return null;
-}
+/** Generate candidate page titles from disease name + scientific name */
+function getTitleCandidates(name: string, sciName: string): string[] {
+  const candidates: string[] = [];
+  candidates.push(toPageTitle(name));

-async function getCommonsImage(title: string): Promise<string | null> {
-  await rateLimit();
-  const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
-  try {
-    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
-    const data = await res.json() as any;
-    const pages = data?.query?.pages;
-    if (pages) {
-      const page = Object.values(pages)[0] as any;
-      if (page?.imageinfo?.[0]?.thumburl) {
-        return page.imageinfo[0].thumburl;
-      }
-      if (page?.imageinfo?.[0]?.url) {
-        return page.imageinfo[0].url;
-      }
+  // Try scientific name
+  if (sciName && sciName.length > 3) {
+    // Full scientific name as page title (e.g., "Phytophthora infestans")
+    candidates.push(sciName.trim());
+
+    // Genus alone (e.g., "Alternaria")
+    const genus = sciName.split(/\s+/)[0];
+    if (genus && genus.length > 3) {
+      candidates.push(genus);
    }
-  } catch {
-    // ignore
  }
-  return null;
+
+  // Deduplicate
+  return [...new Set(candidates)];
 }

 async function main() {
-  console.log("🔍 Fetching disease images from Wikipedia\n");
+  console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");
  const db = getDb();
+
+  const rows = await db
+    .select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
+    .from(diseases)
+    .where(sql`(image_url IS NULL OR image_url = '')`);
+
+  console.log(`📋 ${rows.length} diseases need images\n`);
+
  const rawClient = createClient({
    url: process.env.DATABASE_URL!,
    authToken: process.env.DATABASE_TOKEN!,
  });

-  // Get all diseases without images
-  const rows = await db
-    .select({
-      id: diseases.id,
-      name: diseases.name,
-      sciName: diseases.scientificName,
-      plantId: diseases.plantId,
-    })
-    .from(diseases)
-    .where(sql`image_url IS NULL OR image_url = ''`);
-
-  console.log(`📋 ${rows.length} diseases missing images`);
-  if (rows.length === 0) {
-    console.log("✅ All diseases already have images!");
-    process.exit(0);
-  }
-
  let found = 0;
-  let skipped = 0;
-  let batch: { sql: string; args: any[] }[] = [];
+  let pending = 0;
+  let updates: { id: string; url: string }[] = [];

-  const BATCH_SIZE = 50;
-  let i = 0;
+  for (let i = 0; i < rows.length; i += BATCH_SIZE) {
+    const chunk = rows.slice(i, i + BATCH_SIZE);

-  for (const row of rows) {
-    i++;
-    // Build search terms: try scientific name + disease name, then disease name alone
-    const searchTerms = [
-      `${row.sciName || ""} ${row.name}`.trim(),
-      row.name,
-      `${row.name} (${row.sciName})`.trim(),
-    ].filter(Boolean);
-
-    let imageUrl: string | null = null;
-
-    for (const term of searchTerms) {
-      if (term.length < 3) continue;
-      // Try Wikipedia first
-      const page = await searchWikipedia(term);
-      if (page) {
-        imageUrl = await getPageImage(page.title);
-        if (imageUrl) break;
+    // Collect all unique candidate titles for this batch
+    const titleMap = new Map<string, { id: string; name: string; sciName: string }[]>();
+    for (const r of chunk) {
+      const candidates = getTitleCandidates(r.name, r.sciName || "");
+      for (const t of candidates) {
+        const key = t.toLowerCase();
+        if (!titleMap.has(key)) titleMap.set(key, []);
+        titleMap.get(key)!.push(r);
      }
-      // Try Commons directly
-      imageUrl = await searchCommons(term);
-      if (imageUrl) break;
    }

-    if (imageUrl && !imageUrl.startsWith("https://")) {
-      imageUrl = null;
-    }
+    // Try exact disease name titles (first candidate for each)
+    const primaryTitles = chunk.map((r) => getTitleCandidates(r.name, r.sciName || "")[0]);
+    const imageMap = await batchFetchImages(primaryTitles);

-    if (imageUrl) {
-      batch.push({
-        sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
-        args: [imageUrl, row.id],
-      });
-      if (i % 100 === 0) {
-        process.stdout.write(`  🔍 found ${found} so far...\n`);
+    // For unmatched, try additional candidates
+    const unmatched = chunk.filter(
+      (r) => !imageMap.has(getTitleCandidates(r.name, r.sciName || "")[0].toLowerCase()),
+    );
+    let secondPassMap = new Map<string, string>();
+    if (unmatched.length > 0) {
+      const altTitles = unmatched
+        .map((r) => getTitleCandidates(r.name, r.sciName || "").slice(1))
+        .flat()
+        .filter((t) => t.length > 0);
+      if (altTitles.length > 0) {
+        secondPassMap = await batchFetchImages([...new Set(altTitles)]);
      }
-      found++;
-    } else {
-      skipped++;
    }

-    // Flush batch
-    if (batch.length >= BATCH_SIZE) {
+    // Collect results
+    for (const r of chunk) {
+      const candidates = getTitleCandidates(r.name, r.sciName || "");
+      let imgUrl: string | undefined;
+      for (const t of candidates) {
+        imgUrl = imageMap.get(t.toLowerCase()) || secondPassMap.get(t.toLowerCase());
+        if (imgUrl) break;
+      }
+      if (imgUrl) {
+        updates.push({ id: r.id, url: imgUrl });
+        found++;
+      }
+      pending++;
+    }
+
+    // Flush updates to DB when we have enough
+    if (updates.length >= 100 || (i + BATCH_SIZE >= rows.length && updates.length > 0)) {
      await rawClient.batch(
-        batch.map((b) => ({ sql: b.sql, args: b.args })),
+        updates.map((u) => ({
+          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
+          args: [u.url, u.id],
+        })),
        "write",
      );
-      process.stdout.write(`  📦 flushed ${batch.length} updates (${i}/${rows.length})\n`);
-      batch = [];
+      updates = [];
+    }
+
+    // Progress
+    const pct = ((Math.min(i + BATCH_SIZE, rows.length) / rows.length) * 100).toFixed(1);
+    process.stdout.write(
+      `  [${pct}%] ${Math.min(i + BATCH_SIZE, rows.length)}/${rows.length}  found=${found}\n`,
+    );
+
+    // Rate limit
+    if (i + BATCH_SIZE < rows.length) {
+      await new Promise((r) => setTimeout(r, DELAY_MS));
    }
  }

-  // Flush remaining
-  if (batch.length > 0) {
+  // Mark remaining as empty
+  if (pending < rows.length) {
+    const remaining = rows.slice(pending);
    await rawClient.batch(
-      batch.map((b) => ({ sql: b.sql, args: b.args })),
+      remaining.map((r) => ({
+        sql: "UPDATE diseases SET image_url = '' WHERE id = ? AND (image_url IS NULL OR image_url = '')",
+        args: [r.id],
+      })),
      "write",
    );
-    process.stdout.write(`  📦 final flush: ${batch.length} updates\n`);
  }

  rawClient.close();
  closeDb();

-  console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
+  console.log(`\n✅ Done! Found images: ${found} / ${rows.length}`);
 }

-main().catch((err) => { console.error("❌ Fatal:", err); process.exit(1); });
+main().catch((err) => {
+  console.error("❌ Fatal:", err);
+  process.exit(1);
+});