#!/usr/bin/env node
/**
 * Fetch disease images from Wikipedia/Wikimedia Commons.
 *
 * For each disease in the database that has no image yet, searches Wikipedia
 * for a matching page and retrieves its lead image (`prop=pageimages`),
 * falling back to a Wikimedia Commons search when Wikipedia yields nothing.
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
 *
 * Rate-limited to 1 request per 350ms to be respectful.
 */
import "dotenv/config";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";

const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
const MIN_DELAY_MS = 350; // Be respectful

/** Timestamp (ms since epoch) at which the last rate-limited call was released. */
let lastCall = 0;

/**
 * Waits until at least `MIN_DELAY_MS` has passed since the previous call
 * returned, then records the current time as the new "last call".
 *
 * Every outgoing API request awaits this first, so requests issued one after
 * another are spaced at least `MIN_DELAY_MS` apart.
 */
async function rateLimit(): Promise<void> {
  const elapsed = Date.now() - lastCall;
  if (elapsed < MIN_DELAY_MS) {
    await new Promise<void>((resolve) => setTimeout(resolve, MIN_DELAY_MS - elapsed));
  }
  lastCall = Date.now();
}

/** Title and page id of a Wikipedia search hit. */
interface WikiSearchResult {
  title: string;
  pageid: number;
}

/** The part of a `list=search` response that this script reads. */
interface WikiSearchResponse {
  query?: { search?: WikiSearchResult[] };
}

/** The part of a `prop=pageimages` response that this script reads. */
interface WikiPageImagesResponse {
  query?: { pages?: Record<string, { thumbnail?: { source?: string } }> };
}

/**
 * Searches English Wikipedia for `term` and returns the top hit.
 *
 * @param term Free-text search query.
 * @returns The first search result, or `null` when there are no results,
 *   the HTTP status is not OK, or the request/parse fails.
 */
async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`Wikipedia search for "${term}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as WikiSearchResponse | null;
    const top = data?.query?.search?.[0];
    if (top) {
      return { title: top.title, pageid: top.pageid };
    }
  } catch (err) {
    // Best-effort lookup: report the failure and let the caller try another term.
    console.warn(`Wikipedia search for "${term}" failed:`, err);
  }
  return null;
}

/**
 * Fetches the thumbnail URL (requested at 400px) of a Wikipedia page's image.
 *
 * @param title Exact page title, e.g. as returned by `searchWikipedia`.
 * @returns The thumbnail URL of the first page in the response, or `null`
 *   when the page has no thumbnail, the HTTP status is not OK, or the
 *   request/parse fails.
 */
async function getPageImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`Wikipedia page image lookup for "${title}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as WikiPageImagesResponse | null;
    const pages = data?.query?.pages;
    // `pages` is keyed by page id; a single title was requested, so take the first entry.
    const page = pages ? Object.values(pages)[0] : undefined;
    return page?.thumbnail?.source || null;
  } catch (err) {
    // Best-effort lookup: report the failure and let the caller fall back.
    console.warn(`Wikipedia page image lookup for "${title}" failed:`, err);
  }
  return null;
}
/** The part of a Commons `list=search` response that this script reads. */
interface CommonsSearchResponse {
  query?: { search?: { title: string }[] };
}

/** The part of a Commons `prop=imageinfo` response that this script reads. */
interface CommonsImageInfoResponse {
  query?: {
    pages?: Record<string, { imageinfo?: { thumburl?: string; url?: string }[] }>;
  };
}

/** How many Commons search hits are requested and tried per search term. */
const COMMONS_MAX_CANDIDATES = 2;

/**
 * Searches Wikimedia Commons for files matching `term` and returns the image
 * URL of the first hit that has one.
 *
 * @param term Free-text search query.
 * @returns An image URL, or `null` when no candidate yields one, the HTTP
 *   status is not OK, or the request/parse fails.
 */
async function searchCommons(term: string): Promise<string | null> {
  await rateLimit();
  // srnamespace=6 restricts the search to the File: namespace. The search API
  // defaults to the main namespace, whose pages carry no imageinfo.
  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&srnamespace=6&format=json&srlimit=${COMMONS_MAX_CANDIDATES}&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`Commons search for "${term}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as CommonsSearchResponse | null;
    const candidates = data?.query?.search ?? [];
    // Try the best matches in order; each lookup is itself rate-limited.
    for (const candidate of candidates.slice(0, COMMONS_MAX_CANDIDATES)) {
      const imgUrl = await getCommonsImage(candidate.title);
      if (imgUrl) return imgUrl;
    }
  } catch (err) {
    // Best-effort lookup: report the failure and let the caller try another term.
    console.warn(`Commons search for "${term}" failed:`, err);
  }
  return null;
}

/**
 * Resolves a Commons page title to an image URL.
 *
 * @param title Commons page title (normally a `File:` page).
 * @returns The 400px-wide thumbnail URL when the response has one, otherwise
 *   the original file URL, otherwise `null` (also on a non-OK HTTP status or
 *   a request/parse failure).
 */
async function getCommonsImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`Commons image lookup for "${title}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as CommonsImageInfoResponse | null;
    const pages = data?.query?.pages;
    // `pages` is keyed by page id; a single title was requested, so take the first entry.
    const page = pages ? Object.values(pages)[0] : undefined;
    const info = page?.imageinfo?.[0];
    return info?.thumburl || info?.url || null;
  } catch (err) {
    // Best-effort lookup: report the failure and let the caller move on.
    console.warn(`Commons image lookup for "${title}" failed:`, err);
  }
  return null;
}

/**
 * Builds the ordered, de-duplicated list of search terms for one disease.
 *
 * With a scientific name the order is "<sci> <name>", "<name>",
 * "<name> (<sci>)"; without one only "<name>" is used, so a missing
 * scientific name never yields terms such as "Name (null)" or repeats the
 * plain-name search. Terms shorter than 3 characters are dropped.
 */
function buildSearchTerms(
  name: string | null | undefined,
  sciName: string | null | undefined,
): string[] {
  const plain = (name ?? "").trim();
  const sci = (sciName ?? "").trim();
  const candidates = sci ? [`${sci} ${plain}`, plain, `${plain} (${sci})`] : [plain];
  const unique = [...new Set(candidates.map((term) => term.trim()))];
  return unique.filter((term) => term.length >= 3);
}

/**
 * Finds an image for a disease by trying each search term against Wikipedia
 * first and Wikimedia Commons second, stopping at the first hit.
 *
 * @returns An `https://` image URL, or `null` when nothing usable was found.
 */
async function findImageUrl(searchTerms: readonly string[]): Promise<string | null> {
  for (const term of searchTerms) {
    // Try Wikipedia first
    const page = await searchWikipedia(term);
    const fromWikipedia = page ? await getPageImage(page.title) : null;
    // Try Commons directly
    const candidate = fromWikipedia ?? (await searchCommons(term));
    if (candidate) {
      // Only secure URLs are stored; anything else counts as "not found".
      return candidate.startsWith("https://") ? candidate : null;
    }
  }
  return null;
}

/**
 * Script entry point: loads every disease whose `image_url` is NULL or empty,
 * looks up an image for each, and writes the results back in batches.
 *
 * Database handles are closed on every exit path. Errors (including a missing
 * `DATABASE_URL`) reject the returned promise and are reported by the
 * top-level `catch` below.
 */
async function main(): Promise<void> {
  console.log("šŸ” Fetching disease images from Wikipedia\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set");
  }

  const db = getDb();
  const rawClient = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  try {
    // Get all diseases without images
    const rows = await db
      .select({
        id: diseases.id,
        name: diseases.name,
        sciName: diseases.scientificName,
        plantId: diseases.plantId,
      })
      .from(diseases)
      .where(sql`image_url IS NULL OR image_url = ''`);

    console.log(`šŸ“‹ ${rows.length} diseases missing images`);

    if (rows.length === 0) {
      console.log("āœ… All diseases already have images!");
      return;
    }

    const BATCH_SIZE = 50;
    const UPDATE_SQL = "UPDATE diseases SET image_url = ? WHERE id = ?";
    const pending: { id: (typeof rows)[number]["id"]; imageUrl: string }[] = [];
    let found = 0;
    let skipped = 0;

    // Writes all queued updates in one batch and logs `describe(count)`.
    const flush = async (describe: (count: number) => string): Promise<void> => {
      if (pending.length === 0) return;
      const updates = pending.splice(0);
      await rawClient.batch(
        updates.map((u) => ({ sql: UPDATE_SQL, args: [u.imageUrl, u.id] })),
        "write",
      );
      process.stdout.write(describe(updates.length));
    };

    for (const [index, row] of rows.entries()) {
      const i = index + 1;

      const imageUrl = await findImageUrl(buildSearchTerms(row.name, row.sciName));
      if (imageUrl) {
        pending.push({ id: row.id, imageUrl });
        found++;
      } else {
        skipped++;
      }

      // Report progress every 100 rows, whether or not this row had an image.
      if (i % 100 === 0) {
        process.stdout.write(` šŸ” found ${found} so far...\n`);
      }

      // Flush batch
      if (pending.length >= BATCH_SIZE) {
        await flush((n) => ` šŸ“¦ flushed ${n} updates (${i}/${rows.length})\n`);
      }
    }

    // Flush remaining
    await flush((n) => ` šŸ“¦ final flush: ${n} updates\n`);

    console.log(`\nāœ… Done! Found images: ${found} | Skipped: ${skipped}`);
  } finally {
    rawClient.close();
    closeDb();
  }
}

main().catch((err) => {
  console.error("āŒ Fatal:", err);
  process.exit(1);
});