ooooeee
This commit is contained in:
@@ -1,13 +1,12 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Fetch disease images from Wikipedia/Wikimedia Commons.
|
||||
* Fetch disease images from Wikipedia using batch page-title queries.
|
||||
*
|
||||
* For each disease in the database, searches Wikipedia for its page
|
||||
* and retrieves the main infobox image.
|
||||
* Strategy: Convert disease names to Wikipedia page titles, query 50
|
||||
* at a time with pageimages prop. Wikipedia resolves redirects automatically.
|
||||
* Covers 10K+ diseases in ~200 API calls (7 minutes).
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
|
||||
*
|
||||
* Rate-limited to at most 1 request per 350ms (MIN_DELAY_MS) to be respectful.
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
@@ -16,200 +15,205 @@ import { sql } from "drizzle-orm";
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
|
||||
// MediaWiki Action API endpoint for English Wikipedia (used by the per-term search and page-image lookups).
const WIKI_API = "https://en.wikipedia.org/w/api.php";
|
||||
// MediaWiki Action API endpoint for Wikimedia Commons (used by the Commons search and image lookups).
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
|
||||
// Minimum gap, in milliseconds, that rateLimit() keeps between consecutive API calls.
const MIN_DELAY_MS = 350; // Be respectful
|
||||
// Endpoint used by the batch title query; same URL as WIKI_API.
const API = "https://en.wikipedia.org/w/api.php";
|
||||
// Number of rows taken from the result set per batch iteration.
const BATCH_SIZE = 50; // Max titles per query
|
||||
// Pause, in milliseconds, inserted between consecutive batches.
const DELAY_MS = 2000; // Between batches
|
||||
|
||||
let lastCall = 0;
|
||||
|
||||
async function rateLimit() {
|
||||
const now = Date.now();
|
||||
const elapsed = now - lastCall;
|
||||
if (elapsed < MIN_DELAY_MS) {
|
||||
await new Promise((r) => setTimeout(r, MIN_DELAY_MS - elapsed));
|
||||
}
|
||||
lastCall = Date.now();
|
||||
/**
 * Convert a disease name to Wikipedia page title format.
 *
 * Leading/trailing whitespace is trimmed, internal whitespace runs collapse to
 * a single separator, every word is capitalised (first character upper-case,
 * remainder lower-case), the words are joined with underscores, and any
 * parentheses are removed.
 *
 * @param name - Human-readable disease name, e.g. `"late blight"`.
 * @returns The underscore-joined title, e.g. `"Late_Blight"`; `""` for empty input.
 */
function toPageTitle(name: string): string {
  return name
    .trim()
    .replace(/\s+/g, " ")
    .split(" ")
    .map((w) => w.charAt(0).toUpperCase() + w.slice(1).toLowerCase())
    .join("_")
    // Parentheses are stripped last, so a word written as "(potato)" is
    // emitted as "potato" (its first letter was "(" when capitalising).
    .replace(/[()]/g, "");
}
|
||||
|
||||
/** A single hit taken from a MediaWiki `list=search` response. */
interface WikiSearchResult {
  /** Title of the matching page. */
  title: string;
  /** Numeric id of the matching page. */
  pageid: number;
}
|
||||
/** Fetch thumbnails for up to 50 page titles in one call */
|
||||
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
|
||||
const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;
|
||||
|
||||
/**
 * Find the best-matching English Wikipedia page for a search term.
 *
 * Waits on `rateLimit()` and then issues one `list=search` query limited to a
 * single hit. The lookup is best-effort: a non-2xx response, a network
 * failure, or an unexpected payload is reported on stderr and yields `null`
 * instead of throwing.
 *
 * @param term - Free-text search term.
 * @returns Title and page id of the first hit, or `null` when nothing was found.
 */
async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Error pages are not guaranteed to be JSON; bail out before parsing.
      console.warn(`searchWikipedia: HTTP ${res.status} for "${term}"`);
      return null;
    }
    const data = (await res.json()) as { query?: { search?: WikiSearchResult[] } } | null;
    const top = data?.query?.search?.[0];
    if (top) {
      return { title: top.title, pageid: top.pageid };
    }
  } catch (err: unknown) {
    // Keep the scrape going, but leave a trace of what failed.
    console.warn(`searchWikipedia: lookup failed for "${term}":`, err);
  }
  return null;
}
|
||||
|
||||
async function getPageImage(title: string): Promise<string | null> {
|
||||
await rateLimit();
|
||||
const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
|
||||
try {
|
||||
const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
|
||||
const data = await res.json() as any;
|
||||
const pages = data?.query?.pages;
|
||||
if (pages) {
|
||||
const page = Object.values(pages)[0] as any;
|
||||
if (page?.thumbnail?.source) {
|
||||
return page.thumbnail.source;
|
||||
for (let attempt = 0; attempt < 5; attempt++) {
|
||||
try {
|
||||
const res = await fetch(url, {
|
||||
headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
|
||||
});
|
||||
if (res.status === 429) {
|
||||
const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
|
||||
console.log(` 429 — waiting ${wait / 1000}s...`);
|
||||
await new Promise((r) => setTimeout(r, wait));
|
||||
continue;
|
||||
}
|
||||
if (!res.ok) return new Map();
|
||||
const data = (await res.json()) as any;
|
||||
const pages = data?.query?.pages;
|
||||
const result = new Map<string, string>();
|
||||
|
||||
if (pages) {
|
||||
for (const [, page] of Object.entries(pages) as any) {
|
||||
if (page?.missing || page?.invalid) continue;
|
||||
const originalTitle = page.title.replace(/_/g, " ");
|
||||
const thumb = page?.thumbnail?.source;
|
||||
if (thumb) {
|
||||
result.set(originalTitle.toLowerCase(), thumb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply redirect resolution
|
||||
const normalized = data?.query?.normalized;
|
||||
if (normalized) {
|
||||
for (const n of normalized) {
|
||||
const from = n.from.toLowerCase();
|
||||
const to = n.to.toLowerCase();
|
||||
// If we have a result for the canonical name, also map the original
|
||||
if (result.has(to) && !result.has(from)) {
|
||||
result.set(from, result.get(to)!);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
} catch {
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
return null;
|
||||
return new Map();
|
||||
}
|
||||
|
||||
/**
 * Search Wikimedia Commons for an image matching a term.
 *
 * Waits on `rateLimit()`, runs one `list=search` query (up to 3 hits), and
 * asks `getCommonsImage` for the first two hits in order, returning the first
 * URL obtained. The lookup is best-effort: a non-2xx response, a network
 * failure, or an unexpected payload is reported on stderr and yields `null`
 * instead of throwing.
 *
 * @param term - Free-text search term.
 * @returns An image URL, or `null` when no hit produced one.
 */
async function searchCommons(term: string): Promise<string | null> {
  await rateLimit();
  // srnamespace=6 restricts the search to File: pages. getCommonsImage asks
  // for prop=imageinfo, which only file pages carry, whereas the search API
  // defaults to the main namespace (gallery pages).
  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&srnamespace=6&format=json&srlimit=3&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Error pages are not guaranteed to be JSON; bail out before parsing.
      console.warn(`searchCommons: HTTP ${res.status} for "${term}"`);
      return null;
    }
    const data = (await res.json()) as { query?: { search?: { title: string }[] } } | null;
    const hits = data?.query?.search ?? [];
    // Try to get a thumbnail for the best matches only.
    for (const hit of hits.slice(0, 2)) {
      const imgUrl = await getCommonsImage(hit.title);
      if (imgUrl) return imgUrl;
    }
  } catch (err: unknown) {
    // Keep the scrape going, but leave a trace of what failed.
    console.warn(`searchCommons: lookup failed for "${term}":`, err);
  }
  return null;
}
|
||||
/** Generate candidate page titles from disease name + scientific name */
|
||||
function getTitleCandidates(name: string, sciName: string): string[] {
|
||||
const candidates: string[] = [];
|
||||
candidates.push(toPageTitle(name));
|
||||
|
||||
async function getCommonsImage(title: string): Promise<string | null> {
|
||||
await rateLimit();
|
||||
const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
|
||||
try {
|
||||
const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
|
||||
const data = await res.json() as any;
|
||||
const pages = data?.query?.pages;
|
||||
if (pages) {
|
||||
const page = Object.values(pages)[0] as any;
|
||||
if (page?.imageinfo?.[0]?.thumburl) {
|
||||
return page.imageinfo[0].thumburl;
|
||||
}
|
||||
if (page?.imageinfo?.[0]?.url) {
|
||||
return page.imageinfo[0].url;
|
||||
}
|
||||
// Try scientific name
|
||||
if (sciName && sciName.length > 3) {
|
||||
// Full scientific name as page title (e.g., "Phytophthora infestans")
|
||||
candidates.push(sciName.trim());
|
||||
|
||||
// Genus alone (e.g., "Alternaria")
|
||||
const genus = sciName.split(/\s+/)[0];
|
||||
if (genus && genus.length > 3) {
|
||||
candidates.push(genus);
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
return null;
|
||||
|
||||
// Deduplicate
|
||||
return [...new Set(candidates)];
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log("🔍 Fetching disease images from Wikipedia\n");
|
||||
console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");
|
||||
const db = getDb();
|
||||
|
||||
const rows = await db
|
||||
.select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
|
||||
.from(diseases)
|
||||
.where(sql`(image_url IS NULL OR image_url = '')`);
|
||||
|
||||
console.log(`📋 ${rows.length} diseases need images\n`);
|
||||
|
||||
const rawClient = createClient({
|
||||
url: process.env.DATABASE_URL!,
|
||||
authToken: process.env.DATABASE_TOKEN!,
|
||||
});
|
||||
|
||||
// Get all diseases without images
|
||||
const rows = await db
|
||||
.select({
|
||||
id: diseases.id,
|
||||
name: diseases.name,
|
||||
sciName: diseases.scientificName,
|
||||
plantId: diseases.plantId,
|
||||
})
|
||||
.from(diseases)
|
||||
.where(sql`image_url IS NULL OR image_url = ''`);
|
||||
|
||||
console.log(`📋 ${rows.length} diseases missing images`);
|
||||
if (rows.length === 0) {
|
||||
console.log("✅ All diseases already have images!");
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
let found = 0;
|
||||
let skipped = 0;
|
||||
let batch: { sql: string; args: any[] }[] = [];
|
||||
let pending = 0;
|
||||
let updates: { id: string; url: string }[] = [];
|
||||
|
||||
const BATCH_SIZE = 50;
|
||||
let i = 0;
|
||||
for (let i = 0; i < rows.length; i += BATCH_SIZE) {
|
||||
const chunk = rows.slice(i, i + BATCH_SIZE);
|
||||
|
||||
for (const row of rows) {
|
||||
i++;
|
||||
// Build search terms: try scientific name + disease name, then disease name alone
|
||||
const searchTerms = [
|
||||
`${row.sciName || ""} ${row.name}`.trim(),
|
||||
row.name,
|
||||
`${row.name} (${row.sciName})`.trim(),
|
||||
].filter(Boolean);
|
||||
|
||||
let imageUrl: string | null = null;
|
||||
|
||||
for (const term of searchTerms) {
|
||||
if (term.length < 3) continue;
|
||||
// Try Wikipedia first
|
||||
const page = await searchWikipedia(term);
|
||||
if (page) {
|
||||
imageUrl = await getPageImage(page.title);
|
||||
if (imageUrl) break;
|
||||
// Collect all unique candidate titles for this batch
|
||||
const titleMap = new Map<string, { id: string; name: string; sciName: string }[]>();
|
||||
for (const r of chunk) {
|
||||
const candidates = getTitleCandidates(r.name, r.sciName || "");
|
||||
for (const t of candidates) {
|
||||
const key = t.toLowerCase();
|
||||
if (!titleMap.has(key)) titleMap.set(key, []);
|
||||
titleMap.get(key)!.push(r);
|
||||
}
|
||||
// Try Commons directly
|
||||
imageUrl = await searchCommons(term);
|
||||
if (imageUrl) break;
|
||||
}
|
||||
|
||||
if (imageUrl && !imageUrl.startsWith("https://")) {
|
||||
imageUrl = null;
|
||||
}
|
||||
// Try exact disease name titles (first candidate for each)
|
||||
const primaryTitles = chunk.map((r) => getTitleCandidates(r.name, r.sciName || "")[0]);
|
||||
const imageMap = await batchFetchImages(primaryTitles);
|
||||
|
||||
if (imageUrl) {
|
||||
batch.push({
|
||||
sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
|
||||
args: [imageUrl, row.id],
|
||||
});
|
||||
if (i % 100 === 0) {
|
||||
process.stdout.write(` 🔍 found ${found} so far...\n`);
|
||||
// For unmatched, try additional candidates
|
||||
const unmatched = chunk.filter(
|
||||
(r) => !imageMap.has(getTitleCandidates(r.name, r.sciName || "")[0].toLowerCase()),
|
||||
);
|
||||
let secondPassMap = new Map<string, string>();
|
||||
if (unmatched.length > 0) {
|
||||
const altTitles = unmatched
|
||||
.map((r) => getTitleCandidates(r.name, r.sciName || "").slice(1))
|
||||
.flat()
|
||||
.filter((t) => t.length > 0);
|
||||
if (altTitles.length > 0) {
|
||||
secondPassMap = await batchFetchImages([...new Set(altTitles)]);
|
||||
}
|
||||
found++;
|
||||
} else {
|
||||
skipped++;
|
||||
}
|
||||
|
||||
// Flush batch
|
||||
if (batch.length >= BATCH_SIZE) {
|
||||
// Collect results
|
||||
for (const r of chunk) {
|
||||
const candidates = getTitleCandidates(r.name, r.sciName || "");
|
||||
let imgUrl: string | undefined;
|
||||
for (const t of candidates) {
|
||||
imgUrl = imageMap.get(t.toLowerCase()) || secondPassMap.get(t.toLowerCase());
|
||||
if (imgUrl) break;
|
||||
}
|
||||
if (imgUrl) {
|
||||
updates.push({ id: r.id, url: imgUrl });
|
||||
found++;
|
||||
}
|
||||
pending++;
|
||||
}
|
||||
|
||||
// Flush updates to DB when we have enough
|
||||
if (updates.length >= 100 || (i + BATCH_SIZE >= rows.length && updates.length > 0)) {
|
||||
await rawClient.batch(
|
||||
batch.map((b) => ({ sql: b.sql, args: b.args })),
|
||||
updates.map((u) => ({
|
||||
sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
|
||||
args: [u.url, u.id],
|
||||
})),
|
||||
"write",
|
||||
);
|
||||
process.stdout.write(` 📦 flushed ${batch.length} updates (${i}/${rows.length})\n`);
|
||||
batch = [];
|
||||
updates = [];
|
||||
}
|
||||
|
||||
// Progress
|
||||
const pct = ((Math.min(i + BATCH_SIZE, rows.length) / rows.length) * 100).toFixed(1);
|
||||
process.stdout.write(
|
||||
` [${pct}%] ${Math.min(i + BATCH_SIZE, rows.length)}/${rows.length} found=${found}\n`,
|
||||
);
|
||||
|
||||
// Rate limit
|
||||
if (i + BATCH_SIZE < rows.length) {
|
||||
await new Promise((r) => setTimeout(r, DELAY_MS));
|
||||
}
|
||||
}
|
||||
|
||||
// Flush remaining
|
||||
if (batch.length > 0) {
|
||||
// Mark remaining as empty
|
||||
if (pending < rows.length) {
|
||||
const remaining = rows.slice(pending);
|
||||
await rawClient.batch(
|
||||
batch.map((b) => ({ sql: b.sql, args: b.args })),
|
||||
remaining.map((r) => ({
|
||||
sql: "UPDATE diseases SET image_url = '' WHERE id = ? AND (image_url IS NULL OR image_url = '')",
|
||||
args: [r.id],
|
||||
})),
|
||||
"write",
|
||||
);
|
||||
process.stdout.write(` 📦 final flush: ${batch.length} updates\n`);
|
||||
}
|
||||
|
||||
rawClient.close();
|
||||
closeDb();
|
||||
|
||||
console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
|
||||
console.log(`\n✅ Done! Found images: ${found} / ${rows.length}`);
|
||||
}
|
||||
|
||||
// Script entry point: run main() exactly once. Any rejection that escapes
// main() is reported on stderr and the process exits with a non-zero status.
main().catch((err) => {
  console.error("❌ Fatal:", err);
  process.exit(1);
});
|
||||
|
||||
Reference in New Issue
Block a user