Files
plant-disease-id/apps/web/scripts/scrape-disease-images.ts
2026-06-06 10:15:53 -04:00

220 lines
6.8 KiB
JavaScript

#!/usr/bin/env node
/**
* Fetch disease images from Wikipedia using batch page-title queries.
*
* Strategy: Convert disease names to Wikipedia page titles, query 50
* at a time with pageimages prop. Wikipedia resolves redirects automatically.
* Covers 10K+ diseases in ~200 API calls (7 minutes).
*
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
*/
import "dotenv/config";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
const API = "https://en.wikipedia.org/w/api.php";
const BATCH_SIZE = 50; // Max titles per query
const DELAY_MS = 2000; // Between batches
/** Convert disease name to Wikipedia page title format */
function toPageTitle(name: string): string {
return name
.trim()
.replace(/\s+/g, " ")
.split(" ")
.map((w) => w.charAt(0).toUpperCase() + w.slice(1).toLowerCase())
.join("_")
.replace(/[()]/g, "");
}
/** Fetch thumbnails for up to 50 page titles in one call */
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;
for (let attempt = 0; attempt < 5; attempt++) {
try {
const res = await fetch(url, {
headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
});
if (res.status === 429) {
const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
console.log(` 429 — waiting ${wait / 1000}s...`);
await new Promise((r) => setTimeout(r, wait));
continue;
}
if (!res.ok) return new Map();
const data = (await res.json()) as any;
const pages = data?.query?.pages;
const result = new Map<string, string>();
if (pages) {
for (const [, page] of Object.entries(pages) as any) {
if (page?.missing || page?.invalid) continue;
const originalTitle = page.title.replace(/_/g, " ");
const thumb = page?.thumbnail?.source;
if (thumb) {
result.set(originalTitle.toLowerCase(), thumb);
}
}
}
// Apply redirect resolution
const normalized = data?.query?.normalized;
if (normalized) {
for (const n of normalized) {
const from = n.from.toLowerCase();
const to = n.to.toLowerCase();
// If we have a result for the canonical name, also map the original
if (result.has(to) && !result.has(from)) {
result.set(from, result.get(to)!);
}
}
}
return result;
} catch {
await new Promise((r) => setTimeout(r, 2000));
}
}
return new Map();
}
/** Generate candidate page titles from disease name + scientific name */
function getTitleCandidates(name: string, sciName: string): string[] {
const candidates: string[] = [];
candidates.push(toPageTitle(name));
// Try scientific name
if (sciName && sciName.length > 3) {
// Full scientific name as page title (e.g., "Phytophthora infestans")
candidates.push(sciName.trim());
// Genus alone (e.g., "Alternaria")
const genus = sciName.split(/\s+/)[0];
if (genus && genus.length > 3) {
candidates.push(genus);
}
}
// Deduplicate
return [...new Set(candidates)];
}
async function main() {
console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");
const db = getDb();
const rows = await db
.select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
.from(diseases)
.where(sql`(image_url IS NULL OR image_url = '')`);
console.log(`📋 ${rows.length} diseases need images\n`);
const rawClient = createClient({
url: process.env.DATABASE_URL!,
authToken: process.env.DATABASE_TOKEN!,
});
let found = 0;
let pending = 0;
let updates: { id: string; url: string }[] = [];
for (let i = 0; i < rows.length; i += BATCH_SIZE) {
const chunk = rows.slice(i, i + BATCH_SIZE);
// Collect all unique candidate titles for this batch
const titleMap = new Map<string, { id: string; name: string; sciName: string }[]>();
for (const r of chunk) {
const candidates = getTitleCandidates(r.name, r.sciName || "");
for (const t of candidates) {
const key = t.toLowerCase();
if (!titleMap.has(key)) titleMap.set(key, []);
titleMap.get(key)!.push(r);
}
}
// Try exact disease name titles (first candidate for each)
const primaryTitles = chunk.map((r) => getTitleCandidates(r.name, r.sciName || "")[0]);
const imageMap = await batchFetchImages(primaryTitles);
// For unmatched, try additional candidates
const unmatched = chunk.filter(
(r) => !imageMap.has(getTitleCandidates(r.name, r.sciName || "")[0].toLowerCase()),
);
let secondPassMap = new Map<string, string>();
if (unmatched.length > 0) {
const altTitles = unmatched
.map((r) => getTitleCandidates(r.name, r.sciName || "").slice(1))
.flat()
.filter((t) => t.length > 0);
if (altTitles.length > 0) {
secondPassMap = await batchFetchImages([...new Set(altTitles)]);
}
}
// Collect results
for (const r of chunk) {
const candidates = getTitleCandidates(r.name, r.sciName || "");
let imgUrl: string | undefined;
for (const t of candidates) {
imgUrl = imageMap.get(t.toLowerCase()) || secondPassMap.get(t.toLowerCase());
if (imgUrl) break;
}
if (imgUrl) {
updates.push({ id: r.id, url: imgUrl });
found++;
}
pending++;
}
// Flush updates to DB when we have enough
if (updates.length >= 100 || (i + BATCH_SIZE >= rows.length && updates.length > 0)) {
await rawClient.batch(
updates.map((u) => ({
sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
args: [u.url, u.id],
})),
"write",
);
updates = [];
}
// Progress
const pct = ((Math.min(i + BATCH_SIZE, rows.length) / rows.length) * 100).toFixed(1);
process.stdout.write(
` [${pct}%] ${Math.min(i + BATCH_SIZE, rows.length)}/${rows.length} found=${found}\n`,
);
// Rate limit
if (i + BATCH_SIZE < rows.length) {
await new Promise((r) => setTimeout(r, DELAY_MS));
}
}
// Mark remaining as empty
if (pending < rows.length) {
const remaining = rows.slice(pending);
await rawClient.batch(
remaining.map((r) => ({
sql: "UPDATE diseases SET image_url = '' WHERE id = ? AND (image_url IS NULL OR image_url = '')",
args: [r.id],
})),
"write",
);
}
rawClient.close();
closeDb();
console.log(`\n✅ Done! Found images: ${found} / ${rows.length}`);
}
main().catch((err) => {
console.error("❌ Fatal:", err);
process.exit(1);
});