This commit is contained in:
2026-06-06 10:15:53 -04:00
parent 71d7a9d6f0
commit 78220d3568
11 changed files with 1315 additions and 335 deletions

View File

@@ -1,13 +1,12 @@
#!/usr/bin/env node
/**
* Fetch disease images from Wikipedia/Wikimedia Commons.
* Fetch disease images from Wikipedia using batch page-title queries.
*
* For each disease in the database, searches Wikipedia for its page
* and retrieves the main infobox image.
* Strategy: Convert disease names to Wikipedia page titles, query 50
* at a time with pageimages prop. Wikipedia resolves redirects automatically.
* Covers 10K+ diseases in ~200 API calls (7 minutes).
*
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
*
* Rate-limited to at most 1 request per 350ms (MIN_DELAY_MS) to be respectful.
*/
import "dotenv/config";
@@ -16,200 +15,205 @@ import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
// English Wikipedia API endpoint used by searchWikipedia() and getPageImage().
const WIKI_API = "https://en.wikipedia.org/w/api.php";
// Wikimedia Commons API endpoint used by searchCommons() and getCommonsImage().
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
// Minimum gap, in milliseconds, that rateLimit() enforces between calls.
const MIN_DELAY_MS = 350; // Be respectful
// Endpoint used by batchFetchImages(); same URL as WIKI_API.
const API = "https://en.wikipedia.org/w/api.php";
// Chunk size used when slicing rows in main(); one chunk's titles go into one query.
const BATCH_SIZE = 50; // Max titles per query
// Pause, in milliseconds, that main() sleeps after each chunk except the last.
const DELAY_MS = 2000; // Between batches
// Date.now() timestamp recorded by the most recent rateLimit() call; 0 until the first call.
let lastCall = 0;
/**
 * Throttle outgoing requests: if fewer than MIN_DELAY_MS milliseconds have
 * passed since `lastCall`, sleep for the remainder, then record the current
 * time in `lastCall`.
 *
 * @returns A promise that resolves once the caller may proceed.
 */
async function rateLimit() {
const now = Date.now();
const elapsed = now - lastCall;
if (elapsed < MIN_DELAY_MS) {
// Sleep only for the part of the window that has not elapsed yet.
await new Promise((r) => setTimeout(r, MIN_DELAY_MS - elapsed));
}
// Re-read the clock so the timestamp reflects the moment after any sleep.
lastCall = Date.now();
/**
 * Convert a disease name into a Wikipedia page-title string.
 *
 * Leading/trailing whitespace is trimmed, runs of whitespace separate words,
 * each word gets an upper-cased first character and a lower-cased remainder,
 * parentheses are removed, and the words are joined with underscores.
 *
 * Words that consist only of parentheses (e.g. the "(" in "Foo ( bar )") are
 * dropped, so they cannot leave doubled or trailing underscores in the title.
 *
 * NOTE(review): MediaWiki treats every character after the first as
 * case-sensitive, so title-casing each word (and lower-casing acronyms such
 * as "TMV") may not match sentence-case article titles — confirm this is
 * intended.
 *
 * @param name - Free-form disease name, e.g. "late  blight (potato)".
 * @returns The underscore-joined title, e.g. "Late_Blight_potato"; "" when
 *   the input is empty or whitespace only.
 */
function toPageTitle(name: string): string {
  return name
    .trim()
    .split(/\s+/)
    .map((word) =>
      // Capitalise first, then strip parentheses: a word like "(potato)"
      // keeps its lower-case "p" because its first character is "(".
      (word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()).replace(/[()]/g, ""),
    )
    .filter((word) => word.length > 0)
    .join("_");
}
/** Top hit of a Wikipedia `list=search` query, as returned by searchWikipedia(). */
interface WikiSearchResult {
  /** `pageid` field copied from the search hit. */
  pageid: number;
  /** `title` field copied from the search hit. */
  title: string;
}
/**
 * Fetch thumbnails for up to 50 page titles in one call.
 *
 * The titles are joined with "|" and sent as a single `action=query` request
 * with `prop=pageimages`, `pithumbsize=400` and `redirects=1`.
 *
 * @param titles - Page titles to look up in one request.
 * @returns A map from string key to thumbnail URL string.
 */
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;
/**
 * Search English Wikipedia for `term` and return the top hit.
 *
 * Waits on rateLimit() first, then issues one `list=search` query limited to
 * a single result (`srlimit=1`).
 *
 * The lookup is best-effort: a non-OK HTTP status, a network failure or an
 * unparsable body is reported with console.warn and yields `null` rather
 * than throwing, so one bad term does not abort the caller's loop.
 *
 * @param term - Free-text search string.
 * @returns The title and page id of the first hit, or `null` when there is
 *   no hit or the request failed.
 */
async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Error responses are not search payloads; report the status instead of parsing them.
      console.warn(`  ⚠ Wikipedia search for "${term}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as { query?: { search?: WikiSearchResult[] } } | null;
    const top = data?.query?.search?.[0];
    if (top) {
      // Copy only the two fields callers use.
      return { title: top.title, pageid: top.pageid };
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  ⚠ Wikipedia search for "${term}" failed: ${reason}`);
  }
  return null;
}
/**
 * Look up the `pageimages` thumbnail (`pithumbsize=400`) for a single page title.
 * Waits on rateLimit() before issuing the request.
 */
async function getPageImage(title: string): Promise<string | null> {
await rateLimit();
const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
try {
const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
const data = await res.json() as any;
const pages = data?.query?.pages;
if (pages) {
// Only the first entry of `pages` is inspected.
const page = Object.values(pages)[0] as any;
if (page?.thumbnail?.source) {
return page.thumbnail.source;
// Up to 5 attempts: an HTTP 429 or a thrown error moves on to the next attempt.
for (let attempt = 0; attempt < 5; attempt++) {
try {
const res = await fetch(url, {
headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
});
if (res.status === 429) {
// Exponential backoff: 3s, 6s, 12s, ... capped at 60s.
const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
console.log(` 429 — waiting ${wait / 1000}s...`);
await new Promise((r) => setTimeout(r, wait));
continue;
}
// Any other non-OK status gives up immediately with an empty map (no retry).
if (!res.ok) return new Map();
const data = (await res.json()) as any;
const pages = data?.query?.pages;
const result = new Map<string, string>();
if (pages) {
for (const [, page] of Object.entries(pages) as any) {
// Skip entries flagged as missing or invalid.
if (page?.missing || page?.invalid) continue;
// Keys are the returned page title with underscores turned into spaces, lower-cased.
const originalTitle = page.title.replace(/_/g, " ");
const thumb = page?.thumbnail?.source;
if (thumb) {
result.set(originalTitle.toLowerCase(), thumb);
}
}
}
// Map each `query.normalized` entry's `from` title onto the thumbnail stored for its `to` title.
// NOTE(review): only `query.normalized` is read here. The request sets redirects=1, but
// `query.redirects` is never consulted, so a requested title that resolves through a
// redirect does not appear to be keyed under its original name — confirm.
const normalized = data?.query?.normalized;
if (normalized) {
for (const n of normalized) {
const from = n.from.toLowerCase();
const to = n.to.toLowerCase();
// If we have a result for the canonical name, also map the original
if (result.has(to) && !result.has(from)) {
result.set(from, result.get(to)!);
}
}
}
return result;
} catch {
// Thrown errors (network, JSON parsing) are not logged; pause 2s before the next attempt.
await new Promise((r) => setTimeout(r, 2000));
}
} catch {
// ignore
}
return null;
// Reached when every attempt ended in a 429 or a thrown error.
return new Map();
}
/**
 * Search Wikimedia Commons for `term` and return an image URL for the first
 * usable hit.
 *
 * Waits on rateLimit() first. The query asks for up to three hits
 * (`srlimit=3`), but only the first two are passed to getCommonsImage().
 *
 * The lookup is best-effort: a non-OK HTTP status, a network failure or an
 * unparsable body is reported with console.warn and yields `null` rather
 * than throwing.
 *
 * @param term - Free-text search string.
 * @returns The first non-empty URL returned by getCommonsImage(), or `null`
 *   when no hit produced one or the request failed.
 */
async function searchCommons(term: string): Promise<string | null> {
  await rateLimit();
  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=3&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Error responses are not search payloads; report the status instead of parsing them.
      console.warn(`  ⚠ Commons search for "${term}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as { query?: { search?: { title: string }[] } } | null;
    const hits = data?.query?.search ?? [];
    // Try to get a thumbnail for the best matches (first two hits only).
    for (const hit of hits.slice(0, 2)) {
      const imgUrl = await getCommonsImage(hit.title);
      if (imgUrl) return imgUrl;
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  ⚠ Commons search for "${term}" failed: ${reason}`);
  }
  return null;
}
/**
 * Generate candidate page titles from disease name + scientific name.
 *
 * The first candidate is always toPageTitle(name). When `sciName` is longer
 * than 3 characters, the trimmed scientific name and its first
 * whitespace-separated token (if longer than 3 characters) are appended.
 * Duplicates are removed while keeping first-seen order.
 */
function getTitleCandidates(name: string, sciName: string): string[] {
const candidates: string[] = [];
candidates.push(toPageTitle(name));
/**
 * Fetch `imageinfo` for one Commons title (`iiurlwidth=400`) and return the
 * thumbnail URL if present, otherwise the full URL. Waits on rateLimit() first.
 */
async function getCommonsImage(title: string): Promise<string | null> {
await rateLimit();
const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
try {
const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
const data = await res.json() as any;
const pages = data?.query?.pages;
if (pages) {
// Only the first entry of `pages` is inspected.
const page = Object.values(pages)[0] as any;
// Prefer the scaled thumbnail; fall back to the original file URL.
if (page?.imageinfo?.[0]?.thumburl) {
return page.imageinfo[0].thumburl;
}
if (page?.imageinfo?.[0]?.url) {
return page.imageinfo[0].url;
}
// Try scientific name
if (sciName && sciName.length > 3) {
// Full scientific name as page title (e.g., "Phytophthora infestans")
candidates.push(sciName.trim());
// Genus alone (e.g., "Alternaria")
// The split runs on the untrimmed string, so leading whitespace yields "" here
// and the truthiness check below then skips the genus.
const genus = sciName.split(/\s+/)[0];
if (genus && genus.length > 3) {
candidates.push(genus);
}
} catch {
// ignore
}
return null;
// Deduplicate
return [...new Set(candidates)];
}
/**
 * Script entry point: select diseases whose image_url is NULL or empty, look
 * up an image URL for each, and write the URLs back with
 * `UPDATE diseases SET image_url = ? WHERE id = ?` statements sent through
 * rawClient.batch(..., "write").
 */
async function main() {
console.log("🔍 Fetching disease images from Wikipedia\n");
console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");
const db = getDb();
const rows = await db
.select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
.from(diseases)
.where(sql`(image_url IS NULL OR image_url = '')`);
console.log(`📋 ${rows.length} diseases need images\n`);
// Separate client used only for the rawClient.batch() writes and rawClient.close() below.
// The import that provides createClient is not among the lines shown here.
const rawClient = createClient({
url: process.env.DATABASE_URL!,
authToken: process.env.DATABASE_TOKEN!,
});
// Get all diseases without images
// NOTE(review): `plantId` is selected but not read anywhere in the lines shown here.
const rows = await db
.select({
id: diseases.id,
name: diseases.name,
sciName: diseases.scientificName,
plantId: diseases.plantId,
})
.from(diseases)
.where(sql`image_url IS NULL OR image_url = ''`);
console.log(`📋 ${rows.length} diseases missing images`);
if (rows.length === 0) {
console.log("✅ All diseases already have images!");
// NOTE(review): exits before the rawClient.close() / closeDb() calls at the end of main().
process.exit(0);
}
// found: rows for which an image URL was obtained; skipped: rows without one.
let found = 0;
let skipped = 0;
// Queued UPDATE statements awaiting a rawClient.batch() flush.
let batch: { sql: string; args: any[] }[] = [];
// pending: number of rows that have gone through result collection.
let pending = 0;
// updates: (id, url) pairs awaiting a rawClient.batch() flush.
let updates: { id: string; url: string }[] = [];
// NOTE(review): same name and value as the module-level BATCH_SIZE constant.
const BATCH_SIZE = 50;
let i = 0;
for (let i = 0; i < rows.length; i += BATCH_SIZE) {
const chunk = rows.slice(i, i + BATCH_SIZE);
for (const row of rows) {
i++;
// Build search terms: try scientific name + disease name, then disease name alone
const searchTerms = [
`${row.sciName || ""} ${row.name}`.trim(),
row.name,
`${row.name} (${row.sciName})`.trim(),
].filter(Boolean);
let imageUrl: string | null = null;
for (const term of searchTerms) {
if (term.length < 3) continue;
// Try Wikipedia first
const page = await searchWikipedia(term);
if (page) {
imageUrl = await getPageImage(page.title);
if (imageUrl) break;
// Collect all unique candidate titles for this batch
// NOTE(review): titleMap is filled here but not read anywhere else in the lines shown.
const titleMap = new Map<string, { id: string; name: string; sciName: string }[]>();
for (const r of chunk) {
const candidates = getTitleCandidates(r.name, r.sciName || "");
for (const t of candidates) {
const key = t.toLowerCase();
if (!titleMap.has(key)) titleMap.set(key, []);
titleMap.get(key)!.push(r);
}
// Try Commons directly
imageUrl = await searchCommons(term);
if (imageUrl) break;
}
// Discard any URL that does not start with "https://".
if (imageUrl && !imageUrl.startsWith("https://")) {
imageUrl = null;
}
// Try exact disease name titles (first candidate for each)
const primaryTitles = chunk.map((r) => getTitleCandidates(r.name, r.sciName || "")[0]);
const imageMap = await batchFetchImages(primaryTitles);
if (imageUrl) {
batch.push({
sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
args: [imageUrl, row.id],
});
if (i % 100 === 0) {
process.stdout.write(` 🔍 found ${found} so far...\n`);
// For unmatched, try additional candidates
// A row counts as unmatched when its lower-cased first candidate is not a key of imageMap.
const unmatched = chunk.filter(
(r) => !imageMap.has(getTitleCandidates(r.name, r.sciName || "")[0].toLowerCase()),
);
let secondPassMap = new Map<string, string>();
if (unmatched.length > 0) {
// Every candidate after the first, flattened across all unmatched rows.
const altTitles = unmatched
.map((r) => getTitleCandidates(r.name, r.sciName || "").slice(1))
.flat()
.filter((t) => t.length > 0);
if (altTitles.length > 0) {
// NOTE(review): the deduplicated list is passed in a single call and is not capped at 50 here.
secondPassMap = await batchFetchImages([...new Set(altTitles)]);
}
found++;
} else {
skipped++;
}
// Flush batch
if (batch.length >= BATCH_SIZE) {
// Collect results
for (const r of chunk) {
const candidates = getTitleCandidates(r.name, r.sciName || "");
let imgUrl: string | undefined;
// The first candidate with a hit in either map wins.
for (const t of candidates) {
imgUrl = imageMap.get(t.toLowerCase()) || secondPassMap.get(t.toLowerCase());
if (imgUrl) break;
}
if (imgUrl) {
updates.push({ id: r.id, url: imgUrl });
found++;
}
pending++;
}
// Flush updates to DB when we have enough
// Also flushes on the last chunk when anything is still queued.
if (updates.length >= 100 || (i + BATCH_SIZE >= rows.length && updates.length > 0)) {
await rawClient.batch(
batch.map((b) => ({ sql: b.sql, args: b.args })),
updates.map((u) => ({
sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
args: [u.url, u.id],
})),
"write",
);
process.stdout.write(` 📦 flushed ${batch.length} updates (${i}/${rows.length})\n`);
batch = [];
updates = [];
}
// Progress
const pct = ((Math.min(i + BATCH_SIZE, rows.length) / rows.length) * 100).toFixed(1);
process.stdout.write(
` [${pct}%] ${Math.min(i + BATCH_SIZE, rows.length)}/${rows.length} found=${found}\n`,
);
// Rate limit
// No pause after the final chunk.
if (i + BATCH_SIZE < rows.length) {
await new Promise((r) => setTimeout(r, DELAY_MS));
}
}
// Flush remaining
if (batch.length > 0) {
// Mark remaining as empty
// NOTE(review): `pending` is incremented once per row in the result-collection loop, so after
// every chunk has been processed it would equal rows.length and this branch would not run — confirm.
if (pending < rows.length) {
const remaining = rows.slice(pending);
await rawClient.batch(
batch.map((b) => ({ sql: b.sql, args: b.args })),
remaining.map((r) => ({
sql: "UPDATE diseases SET image_url = '' WHERE id = ? AND (image_url IS NULL OR image_url = '')",
args: [r.id],
})),
"write",
);
process.stdout.write(` 📦 final flush: ${batch.length} updates\n`);
}
// Release both database handles before reporting the totals.
rawClient.close();
closeDb();
console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
console.log(`\n✅ Done! Found images: ${found} / ${rows.length}`);
}
// Run the script; any rejection from main() is logged and the process exits with status 1.
// NOTE(review): main() is invoked by two equivalent statements here (one-line and multi-line form) — confirm only one is intended.
main().catch((err) => { console.error("❌ Fatal:", err); process.exit(1); });
main().catch((err) => {
console.error("❌ Fatal:", err);
process.exit(1);
});