220 lines
6.8 KiB
JavaScript
220 lines
6.8 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* Fetch disease images from Wikipedia using batch page-title queries.
|
|
*
|
|
* Strategy: Convert disease names to Wikipedia page titles, query 50
|
|
* at a time with pageimages prop. Wikipedia resolves redirects automatically.
|
|
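* Covers 10K+ diseases in ~200 batches (one API call per batch, plus a follow-up call for rows whose primary title has no image; ~7 minutes of inter-batch delay).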
|
|
*
|
|
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
|
|
*/
|
|
|
|
import "dotenv/config";
|
|
import { createClient } from "@libsql/client";
|
|
import { sql } from "drizzle-orm";
|
|
import { getDb, closeDb } from "../src/lib/db/index";
|
|
import { diseases } from "../src/lib/db/schema";
|
|
|
|
/** MediaWiki Action API endpoint of the English Wikipedia. */
const API = "https://en.wikipedia.org/w/api.php";

/** How many disease rows are processed (and page titles queried) per batch. */
const BATCH_SIZE = 50;

/** Pause, in milliseconds, inserted between two consecutive batches. */
const DELAY_MS = 2000;
|
|
|
|
/**
 * Turn a free-form disease name into a Wikipedia-style page title.
 *
 * Surrounding whitespace is dropped, runs of whitespace separate words, each
 * word is rewritten as "first character upper-case, remainder lower-case",
 * the words are joined with underscores, and finally every parenthesis
 * character is removed from the joined string.
 *
 * @param name Disease name as stored in the database.
 * @returns The underscore-separated title (an empty string for blank input).
 */
function toPageTitle(name: string): string {
  const capitalised: string[] = [];
  for (const word of name.trim().split(/\s+/)) {
    const head = word.slice(0, 1).toUpperCase();
    const tail = word.slice(1).toLowerCase();
    capitalised.push(head + tail);
  }
  // Parentheses are stripped only after joining, so a word consisting solely
  // of parentheses leaves two adjacent underscores behind.
  return capitalised.join("_").replace(/[()]/g, "");
}
|
|
|
|
/** One `{ from, to }` title rewrite reported by the API (normalisation or redirect). */
interface WikiTitleAlias {
  from: string;
  to: string;
}

/** The parts of a `prop=pageimages` query result that this script reads. */
interface WikiImageQuery {
  normalized?: WikiTitleAlias[];
  redirects?: WikiTitleAlias[];
  pages?: Record<
    string,
    { title?: string; missing?: unknown; invalid?: unknown; thumbnail?: { source?: string } }
  >;
}

/**
 * Build the "lower-cased title -> thumbnail URL" index for one query result.
 *
 * Pages are keyed by their own title. Because the request asks the API to
 * follow redirects, a page's title can differ from what was requested; the
 * `normalized` and `redirects` lists describe those rewrites, so every
 * spelling on such a chain is registered as an extra key for the same image.
 */
function indexThumbnails(query: WikiImageQuery | undefined): Map<string, string> {
  const thumbs = new Map<string, string>();

  for (const page of Object.values(query?.pages ?? {})) {
    if (!page || "missing" in page || "invalid" in page) continue;
    const source = page.thumbnail?.source;
    if (typeof page.title !== "string" || !source) continue;
    thumbs.set(page.title.replace(/_/g, " ").toLowerCase(), source);
  }

  // Redirects are listed after normalisations so that, when both rewrite the
  // same lower-cased spelling, the redirect (the later hop) is the one kept.
  const aliasTarget = new Map<string, string>();
  for (const { from, to } of [...(query?.normalized ?? []), ...(query?.redirects ?? [])]) {
    const source = from.toLowerCase();
    const target = to.toLowerCase();
    if (source !== target) aliasTarget.set(source, target);
  }

  // Follow requested -> normalised -> redirect target, with a hop limit so a
  // cyclic alias list cannot loop forever.
  const MAX_HOPS = 4;
  for (const source of aliasTarget.keys()) {
    if (thumbs.has(source)) continue;
    let current = source;
    for (let hop = 0; hop < MAX_HOPS; hop++) {
      const next = aliasTarget.get(current);
      if (next === undefined) break;
      current = next;
      const hit = thumbs.get(current);
      if (hit !== undefined) {
        thumbs.set(source, hit);
        break;
      }
    }
  }

  return thumbs;
}

/**
 * Fetch thumbnails for a list of page titles in a single API call.
 *
 * @param titles Page titles to look up (callers pass at most 50 per call).
 * @returns Map from lower-cased title to thumbnail URL. Keys include each
 *   page's own title (underscores as spaces) and every requested spelling the
 *   API normalised or redirected to that page. The map is empty when nothing
 *   matched, on a non-OK response, or once all five attempts are used up.
 */
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
  if (titles.length === 0) return new Map();

  const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;
  const pause = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

  for (let attempt = 0; attempt < 5; attempt++) {
    try {
      const res = await fetch(url, {
        headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
      });

      if (res.status === 429) {
        // Rate limited: exponential back-off (3s, 6s, 12s, ...) capped at 60s.
        const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
        console.log(` 429 — waiting ${wait / 1000}s...`);
        await pause(wait);
        continue;
      }

      if (!res.ok) {
        console.warn(` HTTP ${res.status} — skipping this batch`);
        return new Map();
      }

      const data = (await res.json()) as { query?: WikiImageQuery } | null;
      return indexThumbnails(data?.query);
    } catch (err) {
      // Network or parse failure: report it instead of hiding it, then retry.
      console.warn(" request failed — retrying in 2s:", err instanceof Error ? err.message : err);
      await pause(2000);
    }
  }

  return new Map();
}
|
|
|
|
/**
 * List the Wikipedia page titles worth trying for one disease, best first.
 *
 * Order: the disease name converted with `toPageTitle`, then the full
 * scientific name (e.g. "Phytophthora infestans"), then its first word, the
 * genus (e.g. "Phytophthora"). The scientific name is trimmed and its inner
 * whitespace collapsed before use, so stray spaces in the stored value cannot
 * hide the genus or produce an empty candidate. Scientific names and genera of
 * three characters or fewer are ignored.
 *
 * @param name Common disease name.
 * @param sciName Scientific name; pass an empty string when there is none.
 * @returns De-duplicated candidate titles; the first is always the converted
 *   disease name.
 */
function getTitleCandidates(name: string, sciName: string): string[] {
  const candidates = [toPageTitle(name)];

  const scientific = sciName.trim().replace(/\s+/g, " ");
  if (scientific.length > 3) {
    candidates.push(scientific);

    const genus = scientific.split(" ")[0];
    if (genus !== undefined && genus.length > 3) {
      candidates.push(genus);
    }
  }

  // A Set keeps first-seen order, so the priority order survives de-duplication.
  return [...new Set(candidates)];
}
|
|
|
|
/**
 * Look up thumbnails for one chunk of diseases.
 *
 * First queries every row's primary (first) candidate title in one call. Rows
 * whose primary title produced no image then have their remaining candidates
 * queried, split into requests of at most `BATCH_SIZE` titles so that no
 * request exceeds the per-query title limit.
 *
 * @param candidateLists One candidate-title list per row, best title first.
 * @returns Map from lower-cased title to thumbnail URL; results from the
 *   primary query take precedence over follow-up results for the same key.
 */
async function fetchChunkImages(candidateLists: readonly string[][]): Promise<Map<string, string>> {
  const primaryTitles = [
    ...new Set(candidateLists.map((list) => list[0] ?? "").filter((t) => t.length > 0)),
  ];
  const images =
    primaryTitles.length > 0 ? await batchFetchImages(primaryTitles) : new Map<string, string>();

  const altTitles = [
    ...new Set(
      candidateLists
        .filter((list) => !images.has((list[0] ?? "").toLowerCase()))
        .flatMap((list) => list.slice(1))
        .filter((t) => t.length > 0),
    ),
  ];

  for (let start = 0; start < altTitles.length; start += BATCH_SIZE) {
    // Extra follow-up requests are spaced out like regular batches.
    if (start > 0) await new Promise((resolve) => setTimeout(resolve, DELAY_MS));
    const extra = await batchFetchImages(altTitles.slice(start, start + BATCH_SIZE));
    for (const [title, url] of extra) {
      if (!images.has(title)) images.set(title, url);
    }
  }

  return images;
}

/**
 * Script entry point: find an image for every disease that has none.
 *
 * Selects the diseases whose `image_url` is NULL or empty, processes them in
 * chunks of `BATCH_SIZE`, and writes the URLs found back with batched UPDATE
 * statements (flushed once 100 are queued, and again after the last chunk).
 * Rows without a match are left untouched. Both database handles are closed
 * even when a step throws.
 *
 * @throws Error when `DATABASE_URL` is not set; database and network errors
 *   that are not handled further down also propagate.
 */
async function main() {
  console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set");
  }

  const FLUSH_THRESHOLD = 100;
  const db = getDb();
  let found = 0;
  let total = 0;

  try {
    const rows = await db
      .select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
      .from(diseases)
      .where(sql`(image_url IS NULL OR image_url = '')`);
    total = rows.length;

    console.log(`📋 ${rows.length} diseases need images\n`);

    const rawClient = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    try {
      let updates: { id: string; url: string }[] = [];

      // Write all queued URLs in one batch and empty the queue.
      const flush = async (): Promise<void> => {
        if (updates.length === 0) return;
        await rawClient.batch(
          updates.map((u) => ({
            sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
            args: [u.url, u.id],
          })),
          "write",
        );
        updates = [];
      };

      for (let i = 0; i < rows.length; i += BATCH_SIZE) {
        const chunk = rows.slice(i, i + BATCH_SIZE);

        // Candidate titles are computed once per row and reused below.
        const entries = chunk.map((row) => ({
          row,
          candidates: getTitleCandidates(row.name, row.sciName ?? ""),
        }));

        const images = await fetchChunkImages(entries.map((entry) => entry.candidates));

        for (const { row, candidates } of entries) {
          // Candidates are ordered best-first, so the first hit wins.
          let imgUrl: string | undefined;
          for (const title of candidates) {
            imgUrl = images.get(title.toLowerCase());
            if (imgUrl) break;
          }
          if (imgUrl) {
            updates.push({ id: row.id, url: imgUrl });
            found++;
          }
        }

        if (updates.length >= FLUSH_THRESHOLD) {
          await flush();
        }

        // Progress
        const done = Math.min(i + BATCH_SIZE, rows.length);
        const pct = ((done / rows.length) * 100).toFixed(1);
        process.stdout.write(` [${pct}%] ${done}/${rows.length} found=${found}\n`);

        // Rate limit: no pause is needed after the final chunk.
        if (i + BATCH_SIZE < rows.length) {
          await new Promise((resolve) => setTimeout(resolve, DELAY_MS));
        }
      }

      // Whatever is still queued after the last chunk.
      await flush();
    } finally {
      rawClient.close();
    }
  } finally {
    closeDb();
  }

  console.log(`\n✅ Done! Found images: ${found} / ${total}`);
}
|
|
|
|
/** Log an unrecoverable error and terminate the process with exit code 1. */
function reportFatal(error: unknown): never {
  console.error("❌ Fatal:", error);
  process.exit(1);
}

// Run the script; any rejection from main() ends the process as a failure.
main().catch(reportFatal);
|