#!/usr/bin/env node
/**
 * Fetch disease images from Wikipedia using batch page-title queries.
 *
 * Strategy: Convert disease names to Wikipedia page titles, query 50
 * at a time with pageimages prop. Wikipedia resolves redirects automatically.
 * Covers 10K+ diseases in ~200 API calls (7 minutes).
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
 */
import "dotenv/config";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";

const API = "https://en.wikipedia.org/w/api.php";
const BATCH_SIZE = 50; // Max titles per query
const DELAY_MS = 2000; // Between batches

/** Number of times a single API request is attempted before giving up. */
const MAX_ATTEMPTS = 5;

/** Upper bound on redirect hops followed when mapping a requested title to a page. */
const MAX_REDIRECT_HOPS = 5;

/** A `from` -> `to` title rewrite as reported in `query.normalized` / `query.redirects`. */
interface WikiTitleMapping {
  from: string;
  to: string;
}

/** The subset of a `query.pages` entry that this script reads. */
interface WikiPage {
  title?: string;
  missing?: unknown;
  invalid?: unknown;
  thumbnail?: { source?: string };
}

/** The subset of the `action=query` JSON response that this script reads. */
interface WikiQueryResponse {
  query?: {
    normalized?: WikiTitleMapping[];
    redirects?: WikiTitleMapping[];
    pages?: Record<string, WikiPage>;
  };
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Convert disease name to Wikipedia page title format.
 *
 * Whitespace is collapsed, every word is capitalised (rest lower-cased),
 * words are joined with underscores and parentheses are dropped.
 */
function toPageTitle(name: string): string {
  return name
    .trim()
    .replace(/\s+/g, " ")
    .split(" ")
    .map((w) => w.charAt(0).toUpperCase() + w.slice(1).toLowerCase())
    .join("_")
    .replace(/[()]/g, "");
}

/**
 * Turn one `action=query` response into a lookup of lower-cased title -> thumbnail URL.
 *
 * The map contains the final page titles (underscores replaced by spaces) and,
 * for every requested title, an alias obtained by following the response's
 * `normalized` and `redirects` tables to the page that was actually returned.
 */
function buildImageMap(
  titles: readonly string[],
  data: WikiQueryResponse | null | undefined,
): Map<string, string> {
  const result = new Map<string, string>();
  const query = data?.query;
  if (!query) return result;

  for (const page of Object.values(query.pages ?? {})) {
    if (typeof page !== "object" || page === null) continue;
    // In format=json the flags are present-but-empty strings, so test for the
    // key itself rather than for a truthy value.
    if ("missing" in page || "invalid" in page) continue;
    const thumb = page.thumbnail?.source;
    if (typeof page.title !== "string" || !thumb) continue;
    result.set(page.title.replace(/_/g, " ").toLowerCase(), thumb);
  }

  const normalized = new Map<string, string>();
  for (const n of query.normalized ?? []) normalized.set(n.from, n.to);
  const redirects = new Map<string, string>();
  for (const r of query.redirects ?? []) redirects.set(r.from, r.to);

  // Map each requested title back to the thumbnail of the page it resolved to,
  // so callers can look results up by exactly what they asked for.
  for (const requested of titles) {
    const key = requested.toLowerCase();
    if (result.has(key)) continue;

    let resolved = normalized.get(requested) ?? requested;
    for (let hop = 0; hop < MAX_REDIRECT_HOPS; hop++) {
      const next = redirects.get(resolved);
      if (next === undefined) break;
      resolved = next;
    }

    const thumb = result.get(resolved.replace(/_/g, " ").toLowerCase());
    if (thumb) result.set(key, thumb);
  }

  return result;
}

/**
 * Fetch thumbnails for up to 50 page titles in one call.
 *
 * @param titles Page titles to look up (callers keep this at or below BATCH_SIZE).
 * @returns Map of lower-cased title -> thumbnail URL. Keys include both the
 *   resolved page titles and the requested titles. The map is empty when the
 *   request fails, returns a non-OK status, or all attempts are exhausted.
 */
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
  if (titles.length === 0) return new Map();

  const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;

  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    try {
      const res = await fetch(url, {
        headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
      });

      if (res.status === 429) {
        // Exponential backoff, capped at one minute.
        const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
        console.log(` 429 — waiting ${wait / 1000}s...`);
        await sleep(wait);
        continue;
      }

      if (!res.ok) {
        console.warn(` HTTP ${res.status} from Wikipedia, skipping ${titles.length} titles`);
        return new Map();
      }

      const data = (await res.json()) as WikiQueryResponse | null;
      return buildImageMap(titles, data);
    } catch (err: unknown) {
      // Best effort: report the failure and retry after a short pause.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` Request failed (attempt ${attempt + 1}/${MAX_ATTEMPTS}): ${reason}`);
      await sleep(2000);
    }
  }

  console.warn(` Giving up on ${titles.length} titles after ${MAX_ATTEMPTS} attempts`);
  return new Map();
}

/**
 * Generate candidate page titles from disease name + scientific name.
 *
 * The first element is always the title derived from `name`; the full
 * scientific name and its first word (genus) follow when they are longer
 * than three characters. Duplicates are removed, order is preserved.
 */
function getTitleCandidates(name: string, sciName: string): string[] {
  const candidates: string[] = [toPageTitle(name)];

  // Trim first so leading whitespace cannot yield an empty genus or candidate.
  const scientific = (sciName ?? "").trim();
  if (scientific.length > 3) {
    // Full scientific name as page title (e.g., "Phytophthora infestans")
    candidates.push(scientific);

    // Genus alone (e.g., "Alternaria")
    const genus = scientific.split(/\s+/)[0];
    if (genus && genus.length > 3) {
      candidates.push(genus);
    }
  }

  // Deduplicate
  return [...new Set(candidates)];
}

/**
 * Entry point: load diseases without an image, look their titles up on
 * Wikipedia in batches and write any thumbnail URL found back to the database.
 */
async function main() {
  console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set");
  }

  const db = getDb();
  let rawClient: ReturnType<typeof createClient> | undefined;

  try {
    const rows = await db
      .select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
      .from(diseases)
      .where(sql`(image_url IS NULL OR image_url = '')`);

    console.log(`📋 ${rows.length} diseases need images\n`);

    rawClient = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    let found = 0;
    let updates: { id: string; url: string }[] = [];

    for (let i = 0; i < rows.length; i += BATCH_SIZE) {
      const chunk = rows.slice(i, i + BATCH_SIZE);

      // Compute the candidate titles once per row; index 0 is the primary title.
      const entries = chunk.map((row) => {
        const candidates = getTitleCandidates(row.name, row.sciName || "");
        return { row, candidates, primary: candidates[0] ?? "" };
      });

      // Try exact disease name titles (first candidate for each)
      const primaryTitles = [
        ...new Set(entries.map((e) => e.primary).filter((t) => t.length > 0)),
      ];
      const imageMap = await batchFetchImages(primaryTitles);

      // For unmatched, try additional candidates
      const altTitles = [
        ...new Set(
          entries
            .filter((e) => !imageMap.has(e.primary.toLowerCase()))
            .flatMap((e) => e.candidates.slice(1))
            .filter((t) => t.length > 0),
        ),
      ];

      // Up to two alternates per row can exceed the per-query title limit,
      // so the second pass is split into BATCH_SIZE-sized requests.
      const secondPassMap = new Map<string, string>();
      for (let j = 0; j < altTitles.length; j += BATCH_SIZE) {
        if (j > 0) await sleep(DELAY_MS);
        const part = await batchFetchImages(altTitles.slice(j, j + BATCH_SIZE));
        for (const [title, thumb] of part) secondPassMap.set(title, thumb);
      }

      // Collect results
      for (const { row, candidates } of entries) {
        let imgUrl: string | undefined;
        for (const t of candidates) {
          const key = t.toLowerCase();
          imgUrl = imageMap.get(key) || secondPassMap.get(key);
          if (imgUrl) break;
        }
        if (imgUrl) {
          updates.push({ id: row.id, url: imgUrl });
          found++;
        }
      }

      // Flush updates to DB when we have enough, or on the final batch
      const isLastBatch = i + BATCH_SIZE >= rows.length;
      if (updates.length >= 100 || (isLastBatch && updates.length > 0)) {
        await rawClient.batch(
          updates.map((u) => ({
            sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
            args: [u.url, u.id],
          })),
          "write",
        );
        updates = [];
      }

      // Progress
      const done = Math.min(i + BATCH_SIZE, rows.length);
      const pct = ((done / rows.length) * 100).toFixed(1);
      process.stdout.write(` [${pct}%] ${done}/${rows.length} found=${found}\n`);

      // Rate limit
      if (!isLastBatch) {
        await sleep(DELAY_MS);
      }
    }

    console.log(`\n✅ Done! Found images: ${found} / ${rows.length}`);
  } finally {
    // Release both database handles whether or not the run succeeded.
    rawClient?.close();
    closeDb();
  }
}

main().catch((err) => {
  console.error("❌ Fatal:", err);
  process.exit(1);
});