Files
plant-disease-id/apps/web/scripts/scrape-disease-images.ts
2026-06-05 21:47:00 -04:00

216 lines
6.0 KiB
JavaScript

#!/usr/bin/env node
/**
* Fetch disease images from Wikipedia/Wikimedia Commons.
*
* For each disease in the database that has no image yet, searches Wikipedia
* for a matching page and retrieves its lead image, falling back to a
* Wikimedia Commons search when the article has none.
*
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
*
* Rate-limited to at most 1 request per 350ms to be respectful.
*/
import "dotenv/config";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
/** API endpoint used for Wikimedia Commons search and image-info queries. */
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
/** API endpoint used for English Wikipedia search and page-image queries. */
const WIKI_API = "https://en.wikipedia.org/w/api.php";
/** Minimum spacing, in milliseconds, enforced between consecutive API requests. */
const MIN_DELAY_MS = 350;

/** Epoch-millisecond timestamp stored by the most recent rateLimit() call (0 = none yet). */
let lastCall = 0;

/**
 * Throttle outgoing requests.
 *
 * Resolves once at least MIN_DELAY_MS has elapsed since the previous call
 * finished waiting, then records the current time for the next caller.
 */
async function rateLimit(): Promise<void> {
  const remaining = MIN_DELAY_MS - (Date.now() - lastCall);
  if (remaining > 0) {
    await new Promise((resolve) => setTimeout(resolve, remaining));
  }
  lastCall = Date.now();
}
/** The fields of a Wikipedia search hit that this script keeps. */
interface WikiSearchResult {
  /** Numeric page id as reported by the search response. */
  pageid: number;
  /** Page title; used afterwards to look up the page's image. */
  title: string;
}
/**
 * Find the top English Wikipedia search hit for a term.
 *
 * Waits on the shared rate limiter, then asks the search API for a single
 * result (`srlimit=1`).
 *
 * @param term - Free-text search query.
 * @returns Title and page id of the first hit, or `null` when there is no
 *   usable hit or the request fails. Never throws: failures are logged as
 *   warnings so a single bad lookup does not abort the whole run.
 */
async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Error responses (throttling, server errors) may not be JSON; report the
      // status instead of letting the JSON parse fail without explanation.
      console.warn(`⚠️ Wikipedia search for "${term}" failed: HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { search?: Array<{ title?: unknown; pageid?: unknown }> };
    } | null;
    const top = data?.query?.search?.[0];
    const title = top?.title;
    const pageid = top?.pageid;
    // Only hand back a result whose fields really have the declared types.
    if (typeof title === "string" && typeof pageid === "number") {
      return { title, pageid };
    }
  } catch (err) {
    console.warn(`⚠️ Wikipedia search for "${term}" failed:`, err);
  }
  return null;
}
/**
 * Fetch the thumbnail URL of a Wikipedia page's image.
 *
 * Waits on the shared rate limiter, then queries `prop=pageimages` with
 * `pithumbsize=400` for the given title and reads `thumbnail.source` from the
 * first page in the response.
 *
 * @param title - Exact Wikipedia page title.
 * @returns The thumbnail URL, or `null` when the page has no thumbnail or the
 *   request fails. Never throws: failures are logged as warnings.
 */
async function getPageImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Report the status rather than failing later on a non-JSON error body.
      console.warn(`⚠️ Wikipedia image lookup for "${title}" failed: HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { pages?: Record<string, { thumbnail?: { source?: unknown } } | null> };
    } | null;
    const pages = data?.query?.pages;
    if (!pages) return null;
    // A single title was requested, so only the first page entry matters.
    const source = Object.values(pages)[0]?.thumbnail?.source;
    if (typeof source === "string" && source) {
      return source;
    }
  } catch (err) {
    console.warn(`⚠️ Wikipedia image lookup for "${title}" failed:`, err);
  }
  return null;
}
/**
 * Search Wikimedia Commons for a file matching the term and return its image URL.
 *
 * The search is restricted to the File namespace (`srnamespace=6`). Without
 * that parameter the API searches namespace 0, whose pages carry no
 * `imageinfo`, so the follow-up lookup could never yield a URL.
 *
 * @param term - Free-text search query.
 * @returns The image URL of the first of the top two hits that resolves to
 *   one, or `null` when nothing matches or the request fails. Never throws:
 *   failures are logged as warnings.
 */
async function searchCommons(term: string): Promise<string | null> {
  await rateLimit();
  // Number of hits to request and try; each attempt costs one more API call.
  const maxCandidates = 2;
  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&srnamespace=6&format=json&srlimit=${maxCandidates}&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Report the status rather than failing later on a non-JSON error body.
      console.warn(`⚠️ Commons search for "${term}" failed: HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { search?: Array<{ title?: unknown } | null> };
    } | null;
    const hits = data?.query?.search;
    if (!Array.isArray(hits)) return null;
    // Try to get a thumbnail for the best matches, in ranking order.
    for (const hit of hits.slice(0, maxCandidates)) {
      const title = hit?.title;
      if (typeof title !== "string") continue;
      const imgUrl = await getCommonsImage(title);
      if (imgUrl) return imgUrl;
    }
  } catch (err) {
    console.warn(`⚠️ Commons search for "${term}" failed:`, err);
  }
  return null;
}
/**
 * Resolve a Wikimedia Commons page title to an image URL.
 *
 * Waits on the shared rate limiter, then queries `prop=imageinfo` with
 * `iiurlwidth=400`. The scaled `thumburl` is preferred; the original file
 * `url` is used as a fallback only when the reported MIME type is an image,
 * so audio or document files are never returned as pictures.
 *
 * @param title - Commons page title (normally a `File:` page).
 * @returns An image URL, or `null` when the page has no usable image info or
 *   the request fails. Never throws: failures are logged as warnings.
 */
async function getCommonsImage(title: string): Promise<string | null> {
  await rateLimit();
  // iiprop=url|mime, with the pipe percent-encoded.
  const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url%7Cmime&iiurlwidth=400&format=json&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Report the status rather than failing later on a non-JSON error body.
      console.warn(`⚠️ Commons image lookup for "${title}" failed: HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: {
        pages?: Record<
          string,
          { imageinfo?: Array<{ thumburl?: unknown; url?: unknown; mime?: unknown } | null> } | null
        >;
      };
    } | null;
    const pages = data?.query?.pages;
    if (!pages) return null;
    // A single title was requested, so only the first page entry matters.
    const info = Object.values(pages)[0]?.imageinfo?.[0];
    if (!info) return null;

    const { thumburl, url: fileUrl, mime } = info;
    if (typeof thumburl === "string" && thumburl) {
      return thumburl;
    }
    const isImage = typeof mime === "string" && mime.startsWith("image/");
    if (isImage && typeof fileUrl === "string" && fileUrl) {
      return fileUrl;
    }
  } catch (err) {
    console.warn(`⚠️ Commons image lookup for "${title}" failed:`, err);
  }
  return null;
}
/**
 * Build the ordered list of search queries to try for one disease.
 *
 * With both names present the order is: "<scientific> <name>", "<name>",
 * "<name> (<scientific>)". When one of them is missing, only the other is
 * used, so no query ever contains the literal text "null" and no query is
 * issued twice. Queries shorter than three characters are dropped.
 */
function buildSearchTerms(
  name: string | null | undefined,
  sciName: string | null | undefined,
): string[] {
  const disease = name?.trim() ?? "";
  const scientific = sciName?.trim() ?? "";
  const candidates =
    disease && scientific
      ? [`${scientific} ${disease}`, disease, `${disease} (${scientific})`]
      : [disease || scientific];
  return candidates.filter((term) => term.length >= 3);
}

/**
 * Try each query against Wikipedia first and Commons second, returning the
 * first https image URL found, or `null` when every query comes up empty.
 */
async function findDiseaseImage(terms: readonly string[]): Promise<string | null> {
  const isUsable = (url: string | null): url is string =>
    url !== null && url.startsWith("https://");

  for (const term of terms) {
    // Try Wikipedia first
    const page = await searchWikipedia(term);
    if (page) {
      const fromArticle = await getPageImage(page.title);
      if (isUsable(fromArticle)) return fromArticle;
    }
    // Try Commons directly
    const fromCommons = await searchCommons(term);
    if (isUsable(fromCommons)) return fromCommons;
  }
  return null;
}

/**
 * Script entry point: fill in `image_url` for every disease row that lacks one.
 *
 * Selects the rows whose `image_url` is NULL or empty, looks up an image for
 * each one, and writes the results back in batches of 50 UPDATE statements.
 * Both database handles are closed on every exit path, including errors.
 *
 * @throws Error when `DATABASE_URL` is not set; database and batch errors
 *   propagate to the caller.
 */
async function main(): Promise<void> {
  console.log("🔍 Fetching disease images from Wikipedia\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    // Fail with a clear message instead of an opaque error from the client.
    throw new Error("DATABASE_URL is not set; cannot open the database.");
  }

  const db = getDb();
  const rawClient = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  try {
    // Get all diseases without images
    const rows = await db
      .select({
        id: diseases.id,
        name: diseases.name,
        sciName: diseases.scientificName,
        plantId: diseases.plantId,
      })
      .from(diseases)
      .where(sql`image_url IS NULL OR image_url = ''`);
    console.log(`📋 ${rows.length} diseases missing images`);
    if (rows.length === 0) {
      console.log("✅ All diseases already have images!");
      return;
    }

    type PendingUpdate = { sql: string; args: [string, (typeof rows)[number]["id"]] };
    const BATCH_SIZE = 50;
    const PROGRESS_EVERY = 100;
    let pending: PendingUpdate[] = [];
    let found = 0;
    let skipped = 0;
    let processed = 0;

    for (const row of rows) {
      processed++;
      const imageUrl = await findDiseaseImage(buildSearchTerms(row.name, row.sciName));
      if (imageUrl) {
        pending.push({
          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
          args: [imageUrl, row.id],
        });
        found++;
      } else {
        skipped++;
      }

      // Report progress by rows processed, whether or not this row had an image.
      if (processed % PROGRESS_EVERY === 0) {
        process.stdout.write(` 🔍 found ${found} so far...\n`);
      }

      // Flush batch
      if (pending.length >= BATCH_SIZE) {
        await rawClient.batch(pending, "write");
        process.stdout.write(` 📦 flushed ${pending.length} updates (${processed}/${rows.length})\n`);
        pending = [];
      }
    }

    // Flush remaining
    if (pending.length > 0) {
      await rawClient.batch(pending, "write");
      process.stdout.write(` 📦 final flush: ${pending.length} updates\n`);
    }

    console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
  } finally {
    rawClient.close();
    closeDb();
  }
}
// Run the script; any error escaping main() is reported and the process exits with status 1.
main().catch((fatal) => {
  console.error("❌ Fatal:", fatal);
  process.exit(1);
});