#!/usr/bin/env node
/**
 * fill-disease-images.ts — Three-stage disease image pipeline
 *
 * For every disease without an imageUrl, tries:
 *   Stage 1 — Wikipedia search → pageimages
 *   Stage 2 — Wikimedia Commons search
 *   Stage 3 — Brave Image Search API (fallback, 1 req/sec, 2000/mo)
 *
 * Updates both diseases.json (seed) and the Turso DB.
 * Flags anything found only via Brave for human review.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-disease-images.ts
 */

import "dotenv/config";
import { readFileSync, writeFileSync, existsSync } from "fs";
import { resolve } from "path";
import { createClient } from "@libsql/client";
import { closeDb } from "../src/lib/db/index";

// ─── Types & Config ──────────────────────────────────────────────────────────

interface DiseaseSeed {
  id: string;
  plantId: string;
  name: string;
  scientificName: string;
  commonName?: string;
  [key: string]: unknown;
}

interface ImageResult {
  url: string;
  source: "wikipedia" | "commons" | "brave" | "missing";
  quality: "good" | "fallback" | "missing";
}

/** Cached lookup outcomes keyed by disease id (persisted in RESULTS_FILE). */
type ResultMap = Record<string, ImageResult>;

/** Subset of a MediaWiki page object read by the Wikipedia stage. */
interface WikiPage {
  index?: number;
  thumbnail?: { source?: string };
}

/** Subset of a MediaWiki page object read by the Commons stage. */
interface CommonsPage {
  imageinfo?: Array<{ thumburl?: string; url?: string }>;
}

const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
const RESULTS_FILE = resolve(__dirname, ".image-results.json");
const REPORT_FILE = resolve(__dirname, ".image-review-needed.md");

const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
const BRAVE_KEY = process.env.BRAVE_API_KEY ?? "";
const BRAVE_DELAY = 1100;
const MAX_BRAVE = 2000;
const UA = "PlantHealthKB/1.0 (plant-disease-id)";
const ORIGIN = "*";

/** Substrings that mark a Brave result as coming from a stock-photo site. */
const STOCK_MARKERS = ["dreamstime", "shutterstock", "alamy", "istock", "123rf"];

/** Number of successful Brave responses received during this run. */
let braveCount = 0;

// ─── Network helpers ─────────────────────────────────────────────────────────

/**
 * Fetch with timeout. Aborts after `ms` milliseconds.
 *
 * The abort timer is cleared as soon as `fetch` resolves, so the timeout
 * covers the request up to that point, not the later reading of the body.
 */
async function fetchWithTimeout(url: string, opts: RequestInit, ms = 15000): Promise<Response> {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), ms);
  try {
    return await fetch(url, { ...opts, signal: ctrl.signal });
  } finally {
    clearTimeout(timer);
  }
}

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

/**
 * Race `work` against a deadline, rejecting with Error("timeout") after `ms`.
 * The timer is always cleared so a finished lookup does not leave a pending
 * timer behind.
 */
async function withDeadline<T>(work: Promise<T>, ms: number): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error("timeout")), ms);
  });
  try {
    return await Promise.race([work, deadline]);
  } finally {
    clearTimeout(timer);
  }
}

// ─── Wikipedia Stage ─────────────────────────────────────────────────────────

/**
 * Return the thumbnail of the best-ranked page that has one, or null.
 *
 * `pages` is keyed by page id, so plain object iteration would visit pages in
 * ascending id order. Sorting by the generator's `index` field restores the
 * search ranking; pages without an index keep their relative order at the end.
 */
function pickWikiThumbnail(pages: Record<string, WikiPage | null | undefined>): string | null {
  const rankOf = (p: WikiPage | null | undefined): number => p?.index ?? Number.MAX_SAFE_INTEGER;
  const ranked = Object.values(pages).sort((a, b) => rankOf(a) - rankOf(b));
  for (const page of ranked) {
    const src = page?.thumbnail?.source;
    if (src) return src;
  }
  return null;
}

/**
 * Search Wikipedia and get thumbnails in ONE API call using generator=search.
 * Returns the thumbnail of the highest-ranked hit that has one, or null.
 *
 * Retries up to 3 times: HTTP 429 backs off exponentially (3s, 6s, 12s), a
 * thrown error (network failure, abort, bad JSON) waits 2s. Any other non-OK
 * status gives up immediately.
 */
async function wikiSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "5",
    prop: "pageimages",
    pithumbsize: "600",
    format: "json",
    origin: ORIGIN,
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${WIKI_API}?${params.toString()}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { pages?: Record<string, WikiPage> };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;
      return pickWikiThumbnail(pages);
    } catch {
      await delay(2000);
    }
  }
  return null;
}

/**
 * Try to find a Wikipedia image for a disease.
 * Uses generator=search which combines search + thumbnails in one call.
 */
async function wikiStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  // Quoted disease name + plant name (most specific query).
  return wikiSearchAndThumb(`"${d.name}" ${plantName}`);
}

// ─── Commons Stage ───────────────────────────────────────────────────────────

/**
 * Search the File: namespace (6) on Wikimedia Commons and return a 600px
 * thumbnail URL (or the original file URL) for the best-ranked hit that has
 * image info, or null.
 *
 * Same retry policy as the Wikipedia search. A non-OK status on the follow-up
 * imageinfo request gives up immediately.
 */
async function commonsSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: query,
    srnamespace: "6",
    srlimit: "5",
    format: "json",
    origin: ORIGIN,
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${COMMONS_API}?${params.toString()}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { search?: Array<{ pageid: number; title: string }> };
      };
      const hits = data?.query?.search ?? [];
      if (hits.length === 0) return null;

      // Batch-fetch imageinfo for all found page IDs
      const imgParams = new URLSearchParams({
        action: "query",
        pageids: hits.map((h) => h.pageid).join("|"),
        prop: "imageinfo",
        iiprop: "url",
        iiurlwidth: "600",
        format: "json",
        origin: ORIGIN,
      });
      const imgRes = await fetchWithTimeout(`${COMMONS_API}?${imgParams.toString()}`, {
        headers: { "User-Agent": UA },
      });
      if (!imgRes.ok) return null;

      const imgData = (await imgRes.json()) as {
        query?: { pages?: Record<string, CommonsPage> };
      };
      const imgPages = imgData?.query?.pages;
      if (!imgPages) return null;

      // Walk the hits in search order; the pages object itself is keyed by
      // page id and would otherwise be visited in ascending id order.
      for (const hit of hits) {
        const info = imgPages[String(hit.pageid)]?.imageinfo?.[0];
        const src = info?.thumburl || info?.url;
        if (src) return src;
      }
      return null;
    } catch {
      await delay(2000);
    }
  }
  return null;
}

/**
 * Try to find a Commons image for a disease. Searches by scientific name when
 * it names a single organism (no "spp." and no "/"), otherwise by common
 * disease name.
 */
async function commonsStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const sci = d.scientificName;
  const usable = Boolean(sci) && !sci.includes("spp.") && !sci.includes("/");
  const q = usable ? `${sci} ${plantName}` : `${d.name} ${plantName} disease`;
  return commonsSearchAndThumb(q);
}

// ─── Brave Stage ─────────────────────────────────────────────────────────────

/**
 * Try to find an image via the Brave Image Search API.
 *
 * Returns null without calling the API when no key is configured or the
 * per-run cap (MAX_BRAVE) has been reached. Prefers the first result whose
 * URL does not contain a stock-photo marker; otherwise falls back to the
 * first result.
 */
async function braveStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  if (!BRAVE_KEY || braveCount >= MAX_BRAVE) return null;

  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", `${d.name} ${plantName} plant disease symptom`);
  url.searchParams.set("count", "5");

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(url.toString(), {
        headers: { "X-Subscription-Token": BRAVE_KEY, Accept: "application/json" },
      });
      if (res.status === 429) {
        await delay(5000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      // Only OK responses are counted against the cap.
      braveCount++;

      const data = (await res.json()) as {
        results?: Array<{ url: string; thumbnail?: { src?: string } }>;
      };
      const results = data?.results ?? [];
      if (results.length === 0) return null;

      // Prefer non-stock thumbnails
      for (const r of results) {
        const src = r.thumbnail?.src ?? r.url;
        if (src && !STOCK_MARKERS.some((m) => src.includes(m))) return src;
      }
      const first = results[0];
      return first?.thumbnail?.src ?? first?.url ?? null;
    } catch {
      await delay(2000);
    }
  }
  return null;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/** Read and parse the seed file. The JSON is trusted to match DiseaseSeed[]. */
function loadDiseases(): DiseaseSeed[] {
  return JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
}

/**
 * Resolve a display name for a plant id, falling back to the id itself.
 *
 * NOTE(review): callers pass a plant id, but the lookup runs against the
 * disease list and matches on `id`, so it normally finds nothing and the raw
 * id is what ends up in the search queries. Presumably a plant list was
 * intended — confirm before changing, since it alters every query.
 */
function getPlantName(diseases: DiseaseSeed[], diseaseId: string): string {
  const plant = diseases.find((p) => p.id === diseaseId);
  return plant?.commonName ?? plant?.name ?? diseaseId;
}

/** Load the cached results, starting fresh if the file is absent or unusable. */
function loadResults(): ResultMap {
  if (!existsSync(RESULTS_FILE)) return {};
  try {
    const parsed: unknown = JSON.parse(readFileSync(RESULTS_FILE, "utf-8"));
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      return parsed as ResultMap;
    }
  } catch {
    /* fresh */
  }
  return {};
}

/** Checkpoint the results cache to disk. */
function saveResults(results: ResultMap): void {
  writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
}

/** Build the placeholder recorded when no image could be found. */
function missingResult(): ImageResult {
  return { url: "", source: "missing", quality: "missing" };
}

/** Print one progress line for stages 1 and 2. */
function logProgress(done: number, total: number, name: string, found: boolean): void {
  const pct = ((done / total) * 100).toFixed(0);
  process.stdout.write(
    ` [${pct}% ${done}/${total}] ${name.substring(0, 40).padEnd(42)} ${found ? "✅" : "⏭️"}\n`,
  );
}

// ─── Stages ──────────────────────────────────────────────────────────────────

/** Stage 1: Wikipedia lookups for every pending disease; checkpoints every 25. */
async function runWikipediaStage(
  pending: DiseaseSeed[],
  diseases: DiseaseSeed[],
  results: ResultMap,
): Promise<void> {
  const todo = pending.filter((d) => !results[d.id]);
  let found = 0;
  console.log("─── Wikipedia ───\n");
  for (const [i, d] of todo.entries()) {
    const url = await wikiStage(d, getPlantName(diseases, d.plantId));
    if (url) {
      results[d.id] = { url, source: "wikipedia", quality: "good" };
      found++;
    }
    logProgress(i + 1, todo.length, d.name, Boolean(url));
    if ((i + 1) % 25 === 0) saveResults(results);
  }
  saveResults(results);
  console.log(`\n → ${found}/${todo.length} found\n`);
}

/** Stage 2: Commons lookups for diseases still without a result; checkpoints every 10. */
async function runCommonsStage(
  pending: DiseaseSeed[],
  diseases: DiseaseSeed[],
  results: ResultMap,
): Promise<void> {
  const todo = pending.filter((d) => !results[d.id]);
  if (todo.length === 0) return;

  let found = 0;
  console.log("─── Wikimedia Commons ───\n");
  for (const [i, d] of todo.entries()) {
    let url: string | null = null;
    try {
      // Hard 25s ceiling per disease on top of the per-request timeouts.
      url = await withDeadline(commonsStage(d, getPlantName(diseases, d.plantId)), 25000);
    } catch {
      /* timeout */
    }
    if (url) {
      results[d.id] = { url, source: "commons", quality: "good" };
      found++;
    }
    logProgress(i + 1, todo.length, d.name, Boolean(url));
    if ((i + 1) % 10 === 0) saveResults(results);
  }
  saveResults(results);
  console.log(`\n → ${found}/${todo.length} found\n`);
}

/**
 * Stage 3: Brave lookups for whatever is left. Everything still unresolved
 * afterwards is recorded as "missing".
 *
 * Note: because the pending filter in main() skips any id already present in
 * the results cache, entries persisted as "missing" here are not retried on
 * later runs unless they are removed from the results file.
 */
async function runBraveStage(
  pending: DiseaseSeed[],
  diseases: DiseaseSeed[],
  results: ResultMap,
): Promise<void> {
  const todo = pending.filter((d) => !results[d.id]);
  if (todo.length === 0) return;

  if (!BRAVE_KEY) {
    console.log("─── Brave Image Search ─── → skipped (no key)\n");
    // Marked in memory for the report only; not written to the cache file.
    for (const d of todo) results[d.id] = missingResult();
    return;
  }

  let found = 0;
  let attempted = 0;
  console.log("─── Brave Image Search ───\n");
  for (const d of todo) {
    if (braveCount >= MAX_BRAVE) {
      results[d.id] = missingResult();
      continue;
    }
    const url = await braveStage(d, getPlantName(diseases, d.plantId));
    if (url) {
      results[d.id] = { url, source: "brave", quality: "fallback" };
      found++;
      process.stdout.write(` ✅ ${d.name}\n`);
    } else {
      results[d.id] = missingResult();
      process.stdout.write(` ❌ ${d.name}\n`);
    }
    // Checkpoint like the other stages so a crash does not discard
    // rate-limited Brave lookups.
    attempted++;
    if (attempted % 10 === 0) saveResults(results);
    await delay(BRAVE_DELAY);
  }
  saveResults(results);
  console.log(`\n → ${found}/${todo.length} found via Brave\n`);
}

// ─── Report ──────────────────────────────────────────────────────────────────

/** Write the human-review report and print the run summary. */
function writeReport(diseases: DiseaseSeed[], results: ResultMap): void {
  const countOf = (q: ImageResult["quality"]): number =>
    Object.values(results).filter((r) => r.quality === q).length;
  const good = countOf("good");
  const fallback = countOf("fallback");
  const missing = countOf("missing");

  // First occurrence wins, matching a linear find over the list.
  const byId = new Map<string, DiseaseSeed>();
  for (const d of diseases) {
    if (!byId.has(d.id)) byId.set(d.id, d);
  }

  const sections = [
    { icon: "⚠️", label: "Fallback (Brave)", quality: "fallback" },
    { icon: "🚫", label: "Missing", quality: "missing" },
  ] as const;

  let report = `# Disease Images — Human Review Needed\n\n`;
  report += `Generated: ${new Date().toISOString()}\n\n`;
  for (const section of sections) {
    const ids = Object.entries(results)
      .filter(([, r]) => r.quality === section.quality)
      .map(([id]) => id);
    if (ids.length === 0) continue;

    report += `## ${section.icon} ${section.label}\n\n`;
    for (const id of ids) {
      const d = byId.get(id);
      const r = results[id];
      report += `- **${d?.name ?? id}** (${d?.scientificName ?? ""}) on \`${d?.plantId ?? ""}\``;
      if (r?.url) report += `\n ${r.url}`;
      report += `\n\n`;
    }
  }
  // Nothing to review when no entry is a fallback or missing. Comparing
  // `good` with the disease count would fail whenever some diseases already
  // had an imageUrl and therefore never entered the results cache.
  if (fallback === 0 && missing === 0) report += `## ✅ All images found!\n`;

  writeFileSync(REPORT_FILE, report, "utf-8");
  console.log(`📝 Review report: ${REPORT_FILE}`);
  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 Total: ${diseases.length} Good: ${good} Fallback: ${fallback} Missing: ${missing}`);
  console.log(` Brave calls: ${braveCount}`);
  console.log(`${"═".repeat(50)}\n`);
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Run the pipeline: load the seed and cached results, run the three stages
 * for diseases that have neither an imageUrl nor a cached result, apply the
 * results to diseases.json and the DB, then write the review report.
 */
async function main(): Promise<void> {
  try {
    console.log("\n🔍 Plant Disease Image Filler\n");
    const diseases = loadDiseases();
    console.log(`📋 ${diseases.length} diseases loaded\n`);

    // Load existing results
    const results = loadResults();

    const pending = diseases.filter((d) => {
      const hasImage = typeof d.imageUrl === "string" && d.imageUrl.length > 0;
      return !hasImage && !results[d.id];
    });
    if (pending.length === 0) {
      console.log("✅ All done\n");
      await applyResults(diseases, results);
      return;
    }
    console.log(`⏳ ${pending.length} need images\n`);

    await runWikipediaStage(pending, diseases, results);
    await runCommonsStage(pending, diseases, results);
    await runBraveStage(pending, diseases, results);

    await applyResults(diseases, results);
    writeReport(diseases, results);
  } finally {
    // Runs on every exit path, including the early "All done" return.
    closeDb();
  }
}

// ─── Apply results to JSON + DB ──────────────────────────────────────────────

/**
 * Write every cached result that has a URL and belongs to a known disease
 * into diseases.json (imageUrl + imageQuality) and the DB (image_url).
 *
 * Note: a disease's existing imageUrl is replaced whenever the cache holds a
 * URL for its id. DB failures are logged, not thrown.
 */
async function applyResults(diseases: DiseaseSeed[], results: ResultMap): Promise<void> {
  const knownIds = new Set(diseases.map((d) => d.id));
  const urlMap = new Map(
    Object.entries(results).filter(
      ([id, r]) => typeof r?.url === "string" && r.url.length > 0 && knownIds.has(id),
    ),
  );
  if (urlMap.size === 0) {
    console.log("⏭️ No images to apply");
    return;
  }

  // JSON
  let n = 0;
  const updated = diseases.map((d) => {
    const img = urlMap.get(d.id);
    if (!img) return d;
    n++;
    return { ...d, imageUrl: img.url, imageQuality: img.quality };
  });
  writeFileSync(DISEASES_JSON, JSON.stringify(updated, null, 2) + "\n");
  console.log(`✅ diseases.json: ${n} images`);

  // DB
  const dbUrl = process.env.DATABASE_URL;
  const dbToken = process.env.DATABASE_TOKEN;
  if (!dbUrl || !dbToken) {
    console.log(" ⏭️ DB: no DATABASE_URL/TOKEN");
    return;
  }
  try {
    const raw = createClient({ url: dbUrl, authToken: dbToken });
    const entries = Array.from(urlMap.entries());
    try {
      // Batches of 50 parameterized updates per round trip.
      for (let i = 0; i < entries.length; i += 50) {
        await raw.batch(
          entries.slice(i, i + 50).map(([id, img]) => ({
            sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
            args: [img.url, id],
          })),
          "write",
        );
      }
    } finally {
      // Release the client even when a batch fails.
      raw.close();
    }
    console.log(`✅ Turso DB: ${entries.length} rows`);
  } catch (err) {
    console.log(` ⚠️ DB: ${err instanceof Error ? err.message : String(err)}`);
  }
}

main().catch((err) => {
  console.error("\n❌", err);
  process.exit(1);
});