Files
plant-disease-id/scripts/fill-disease-images.ts
2026-06-08 16:42:04 -04:00

441 lines
16 KiB
JavaScript

#!/usr/bin/env node
/**
* fill-disease-images.ts — Three-stage disease image pipeline
*
* For every disease without an imageUrl, tries:
* Stage 1 — Wikipedia search → pageimages
* Stage 2 — Wikimedia Commons search
* Stage 3 — Brave Image Search API (fallback, 1 req/sec, 2000/mo)
*
* Updates both diseases.json (seed) and the Turso DB.
* Flags anything found only via Brave for human review.
*
* Usage: cd apps/web && npx tsx scripts/fill-disease-images.ts
*/
import "dotenv/config";
import { readFileSync, writeFileSync, existsSync } from "fs";
import { resolve } from "path";
import { createClient } from "@libsql/client";
import { closeDb } from "../src/lib/db/index";
// ─── Types & Config ──────────────────────────────────────────────────────────
/**
 * One disease entry as stored in the diseases.json seed file.
 *
 * Only the fields this script reads are declared explicitly; the index
 * signature lets any other property of the seed entry (for example the
 * `imageUrl` that main() checks) pass through untouched as `unknown`.
 */
interface DiseaseSeed {
// Key used for the results map and for the `WHERE id = ?` DB update.
id: string;
// Identifier of the host plant; passed to getPlantName() and printed in the review report.
plantId: string;
// Display name of the disease; used to build the search queries for all three stages.
name: string;
// Used for the Commons query unless it is empty or contains "spp." or "/".
scientificName: string;
// Optional; getPlantName() prefers it over `name` when present.
commonName?: string;
[key: string]: unknown;
}
/**
 * Outcome of the image lookup for one disease, persisted in the results file.
 *
 * As written by main(): "wikipedia" and "commons" hits are stored with
 * quality "good", "brave" hits with quality "fallback", and entries for which
 * nothing was found use source "missing", quality "missing" and an empty url.
 */
interface ImageResult {
// Image URL, or "" when no image was found.
url: string;
// Which stage produced the URL.
source: "wikipedia" | "commons" | "brave" | "missing";
// "fallback" and "missing" entries are listed in the human-review report.
quality: "good" | "fallback" | "missing";
}
// Seed file that is read by loadDiseases() and rewritten by applyResults().
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
// Checkpoint of per-disease lookup results, written next to this script.
const RESULTS_FILE = resolve(__dirname, ".image-results.json");
// Markdown report listing fallback/missing images for human review.
const REPORT_FILE = resolve(__dirname, ".image-review-needed.md");
// MediaWiki Action API endpoints for English Wikipedia and Wikimedia Commons.
const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
// Brave API token; when empty the Brave stage is skipped.
const BRAVE_KEY = process.env.BRAVE_API_KEY ?? "";
// Pause in milliseconds after each Brave lookup (the file header cites a 1 req/sec limit).
const BRAVE_DELAY = 1100;
// Upper bound on counted Brave calls. Note: braveCount starts at 0 on every
// run, so this caps a single run, not calendar-month usage.
const MAX_BRAVE = 2000;
// User-Agent header sent with Wikipedia and Commons requests.
const UA = "PlantHealthKB/1.0 (plant-disease-id)";
// Value sent as the `origin` query parameter on MediaWiki API requests.
const ORIGIN = "*";
// Number of Brave requests that returned an OK response during this run.
let braveCount = 0;
// ─── Wikipedia Stage ─────────────────────────────────────────────────────────
/**
 * Search Wikipedia and fetch page thumbnails in ONE API call
 * (generator=search combined with prop=pageimages).
 *
 * The API returns `query.pages` as an object keyed by page ID, so iterating it
 * directly visits pages in ascending page-ID order rather than by relevance.
 * Each generated page carries an `index` field with its search rank; pages are
 * sorted by it so the thumbnail of the best-ranked hit wins.
 *
 * HTTP 429 is retried with exponential backoff (3 s, 6 s, 12 s); network
 * errors and timeouts are retried after 2 s; at most 3 attempts are made.
 *
 * @param query Search string passed as `gsrsearch`.
 * @returns Thumbnail URL of the highest-ranked page that has one, or null when
 *   nothing was found, the response was not OK, or every attempt failed.
 */
async function wikiSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "5",
    prop: "pageimages",
    pithumbsize: "600",
    format: "json",
    origin: ORIGIN,
  });
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      const data = (await res.json()) as {
        query?: {
          pages?: Record<string, { index?: number; thumbnail?: { source: string } }>;
        };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;
      // Restore search-rank order; pages without an index sort last
      // (Array.prototype.sort is stable, so their relative order is kept).
      const ranked = Object.values(pages).sort(
        (a, b) =>
          (a?.index ?? Number.MAX_SAFE_INTEGER) - (b?.index ?? Number.MAX_SAFE_INTEGER),
      );
      for (const page of ranked) {
        const src = page?.thumbnail?.source;
        if (src) return src;
      }
      return null;
    } catch {
      // Network failure, timeout/abort, or unparsable body: wait and retry.
      await delay(2000);
    }
  }
  return null;
}
/**
 * Wikipedia stage: look up a thumbnail for a single disease.
 *
 * Runs one search whose query is the disease name wrapped in double quotes
 * followed by the plant name, and returns whatever wikiSearchAndThumb yields.
 *
 * @param d Disease whose `name` is searched for.
 * @param plantName Host-plant label appended to the query.
 * @returns A thumbnail URL, or null when the search produced none.
 */
async function wikiStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const searchText = `"${d.name}" ${plantName}`;
  const thumbnail = await wikiSearchAndThumb(searchText);
  return thumbnail;
}
// ─── Commons Stage ───────────────────────────────────────────────────────────
/**
 * fetch() wrapper that aborts the request if the fetch promise has not settled
 * within `ms` milliseconds.
 *
 * The abort timer is cleared as soon as fetch settles (success or failure), so
 * it does not cover anything the caller does with the response afterwards.
 * Any `signal` supplied in `opts` is replaced by the internal abort signal.
 *
 * @param url Request URL.
 * @param opts Options forwarded to fetch.
 * @param ms Timeout in milliseconds (default 15000).
 * @returns The Response that fetch resolved with.
 */
async function fetchWithTimeout(url: string, opts: RequestInit, ms = 15000): Promise<Response> {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => {
    controller.abort();
  }, ms);
  const request: RequestInit = { ...opts, signal: controller.signal };
  try {
    return await fetch(url, request);
  } finally {
    clearTimeout(abortTimer);
  }
}
/**
 * Search Wikimedia Commons (File namespace, up to 5 hits) and return an image
 * URL for the best-ranked hit that has image info.
 *
 * Two requests are made: a `list=search` query, then one batched
 * `prop=imageinfo` query for all hit page IDs. The second response is keyed by
 * page ID, so the hits are walked in their original search order and looked up
 * by ID; iterating the response object directly would pick the lowest page ID
 * instead of the most relevant file.
 *
 * HTTP 429 on the search request is retried with exponential backoff (3 s,
 * 6 s, 12 s); thrown errors (network, timeout, bad JSON) are retried after
 * 2 s; at most 3 attempts are made. A non-OK imageinfo response returns null
 * without retrying.
 *
 * @param query Search string passed as `srsearch`.
 * @returns The 600px-wide thumbnail URL when the API provides one, otherwise
 *   the original file URL, or null when nothing usable was found.
 */
async function commonsSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: query,
    srnamespace: "6",
    srlimit: "5",
    format: "json",
    origin: ORIGIN,
  });
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${COMMONS_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      const data = (await res.json()) as {
        query?: { search?: Array<{ pageid: number; title: string }> };
      };
      const hits = data?.query?.search ?? [];
      if (hits.length === 0) return null;
      // Batch-fetch imageinfo for all found page IDs
      const pageids = hits.map((h) => h.pageid).join("|");
      const imgParams = new URLSearchParams({
        action: "query",
        pageids,
        prop: "imageinfo",
        iiprop: "url",
        iiurlwidth: "600",
        format: "json",
        origin: ORIGIN,
      });
      const imgRes = await fetchWithTimeout(`${COMMONS_API}?${imgParams}`, {
        headers: { "User-Agent": UA },
      });
      if (!imgRes.ok) return null;
      const imgData = (await imgRes.json()) as {
        query?: {
          pages?: Record<string, { imageinfo?: Array<{ thumburl?: string; url?: string }> }>;
        };
      };
      const imgPages = imgData?.query?.pages;
      if (!imgPages) return null;
      // Walk the hits in search-rank order rather than the response's key order.
      for (const hit of hits) {
        const info = imgPages[String(hit.pageid)]?.imageinfo?.[0];
        if (info?.thumburl) return info.thumburl;
        if (info?.url) return info.url;
      }
      return null;
    } catch {
      // Network failure, timeout/abort, or unparsable body: wait and retry.
      await delay(2000);
    }
  }
  return null;
}
/**
 * Wikimedia Commons stage: look up an image for a single disease.
 *
 * The query is "<scientificName> <plantName>" when the scientific name is
 * non-empty and contains neither "spp." nor "/"; otherwise it falls back to
 * "<name> <plantName> disease".
 *
 * @param d Disease to search for.
 * @param plantName Host-plant label added to the query.
 * @returns An image URL, or null when the search produced none.
 */
async function commonsStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const sci = d.scientificName;
  const sciIsSpecific = Boolean(sci) && !sci.includes("spp.") && !sci.includes("/");
  const searchText = sciIsSpecific
    ? `${sci} ${plantName}`
    : `${d.name} ${plantName} disease`;
  const found = await commonsSearchAndThumb(searchText);
  return found ?? null;
}
// ─── Brave Stage ─────────────────────────────────────────────────────────────
/**
 * Brave Image Search stage (last-resort fallback) for a single disease.
 *
 * Returns null immediately when no API key is configured or the per-run call
 * budget (MAX_BRAVE) is used up. Only requests that come back with an OK
 * status increment `braveCount`.
 *
 * Among up to 5 results, the first one that does not look like a stock-photo
 * site is preferred. The check is applied to both the chosen image URL and the
 * result's own `url`. Checking the thumbnail alone is not enough:
 * NOTE(review): Brave's `thumbnail.src` appears to be a Brave-proxied URL that
 * does not contain the source domain in plain text — confirm against the API
 * reference. If every result looks like stock, the first result is returned.
 *
 * HTTP 429 is retried with exponential backoff (5 s, 10 s, 20 s); thrown
 * errors are retried after 2 s; at most 3 attempts are made.
 *
 * @param d Disease to search for.
 * @param plantName Host-plant label added to the query.
 * @returns An image URL, or null when nothing was found or the stage is disabled.
 */
async function braveStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  if (!BRAVE_KEY || braveCount >= MAX_BRAVE) return null;
  const stockMarkers = ["dreamstime", "shutterstock", "alamy", "istock", "123rf"];
  const looksLikeStock = (...candidates: Array<string | undefined>): boolean =>
    candidates.some((candidate) => {
      if (!candidate) return false;
      const lowered = candidate.toLowerCase();
      return stockMarkers.some((marker) => lowered.includes(marker));
    });
  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", `${d.name} ${plantName} plant disease symptom`);
  url.searchParams.set("count", "5");
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(url.toString(), {
        headers: { "X-Subscription-Token": BRAVE_KEY, Accept: "application/json" },
      });
      if (res.status === 429) {
        await delay(5000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      braveCount++;
      const data = (await res.json()) as {
        results?: Array<{ url: string; thumbnail?: { src?: string } }>;
      };
      const results = data?.results ?? [];
      if (results.length === 0) return null;
      // Prefer non-stock thumbnails
      for (const r of results) {
        const src = r.thumbnail?.src ?? r.url;
        if (src && !looksLikeStock(src, r.url)) return src;
      }
      // Everything looked like stock: fall back to the top result.
      const first = results[0];
      return first?.thumbnail?.src ?? first?.url ?? null;
    } catch {
      // Network failure, timeout/abort, or unparsable body: wait and retry.
      await delay(2000);
    }
  }
  return null;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Promise-based sleep.
 *
 * @param ms Delay passed to setTimeout, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Read and parse the disease seed file (DISEASES_JSON) synchronously.
 *
 * The parsed JSON is cast to DiseaseSeed[] without validation; a missing file
 * or malformed JSON throws.
 *
 * @returns The parsed seed entries.
 */
function loadDiseases(): DiseaseSeed[] {
  const rawJson = readFileSync(DISEASES_JSON, "utf-8");
  const parsed = JSON.parse(rawJson) as DiseaseSeed[];
  return parsed;
}
/**
 * Resolve a display name for the given ID by looking it up in `diseases`.
 *
 * Returns the first matching entry's `commonName`, else its `name`, else the
 * ID itself; when no entry matches, the ID is returned unchanged.
 *
 * NOTE(review): main() passes a disease's `plantId` here, but the lookup runs
 * against the disease list (matching on `id`), not a plant list. Unless a
 * disease ID happens to equal a plant ID, this presumably just echoes the
 * plant ID back — confirm whether a plant dataset should be searched instead.
 *
 * @param diseases Entries to search.
 * @param diseaseId ID to match against each entry's `id`.
 * @returns The resolved label or the ID as fallback.
 */
function getPlantName(diseases: DiseaseSeed[], diseaseId: string): string {
  for (const entry of diseases) {
    if (entry.id !== diseaseId) continue;
    return entry.commonName ?? entry.name ?? diseaseId;
  }
  return diseaseId;
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Entry point: fill in missing disease images in three stages.
 *
 * 1. Loads the seed diseases and any checkpointed results from RESULTS_FILE.
 * 2. For every disease without an `imageUrl` and without a stored result, tries
 *    Wikipedia, then Wikimedia Commons (each lookup capped at 25 s), then Brave
 *    (only with an API key, paced by BRAVE_DELAY and capped by MAX_BRAVE).
 *    Results are checkpointed to RESULTS_FILE periodically during every stage.
 * 3. Applies all found URLs to diseases.json and the DB via applyResults().
 * 4. Writes a Markdown review report for fallback/missing images and prints
 *    summary counts.
 *
 * The DB handle is closed with closeDb() on both the early "nothing pending"
 * exit and the normal exit.
 *
 * NOTE(review): diseases skipped because the Brave budget ran out are stored as
 * "missing" and written to RESULTS_FILE, so later runs will not retry them
 * unless the results file is edited — confirm this is intended.
 */
async function main() {
  console.log("\n🔍 Plant Disease Image Filler\n");
  const diseases = loadDiseases();
  console.log(`📋 ${diseases.length} diseases loaded\n`);
  // Load existing results
  let results: Record<string, ImageResult> = {};
  if (existsSync(RESULTS_FILE)) {
    try { results = JSON.parse(readFileSync(RESULTS_FILE, "utf-8")); } catch { /* fresh */ }
  }
  // Single place that checkpoints the results map to disk.
  const saveResults = (): void => {
    writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
  };
  const pending = diseases.filter((d) => {
    if ((d.imageUrl as string)?.length) return false;
    return !results[d.id];
  });
  if (pending.length === 0) {
    console.log("✅ All done\n");
    await applyResults(diseases, results);
    // Mirror the normal exit path so the shared DB handle is released here too.
    closeDb();
    return;
  }
  console.log(`${pending.length} need images\n`);
  // ── Stage 1: Wikipedia ──────────────────────────────────────────────
  const s1 = pending.filter((d) => !results[d.id]);
  let s1ok = 0;
  console.log("─── Wikipedia ───\n");
  for (let i = 0; i < s1.length; i++) {
    const d = s1[i];
    const plantName = getPlantName(diseases, d.plantId);
    const url = await wikiStage(d, plantName);
    if (url) {
      results[d.id] = { url, source: "wikipedia", quality: "good" };
      s1ok++;
    }
    const pct = ((i + 1) / s1.length * 100).toFixed(0);
    process.stdout.write(` [${pct}% ${i + 1}/${s1.length}] ${d.name.substring(0, 40).padEnd(42)} ${url ? "✅" : "⏭️"}\n`);
    if ((i + 1) % 25 === 0) saveResults();
  }
  saveResults();
  console.log(`\n${s1ok}/${s1.length} found\n`);
  // ── Stage 2: Commons ─────────────────────────────────────────────────
  const s2 = pending.filter((d) => !results[d.id]);
  let s2ok = 0;
  if (s2.length > 0) {
    console.log("─── Wikimedia Commons ───\n");
    for (let i = 0; i < s2.length; i++) {
      const d = s2[i];
      const plantName = getPlantName(diseases, d.plantId);
      let url: string | null = null;
      let raceTimer: ReturnType<typeof setTimeout> | undefined;
      try {
        url = await Promise.race([
          commonsStage(d, plantName),
          new Promise<null>((_, reject) => {
            raceTimer = setTimeout(() => reject(new Error("timeout")), 25000);
          }),
        ]);
      } catch { /* timeout */ } finally {
        // Without this, every lookup leaves a live 25 s timer behind, which
        // piles up and keeps the process alive after the work is finished.
        clearTimeout(raceTimer);
      }
      if (url) {
        results[d.id] = { url, source: "commons", quality: "good" };
        s2ok++;
      }
      const pct = ((i + 1) / s2.length * 100).toFixed(0);
      process.stdout.write(` [${pct}% ${i + 1}/${s2.length}] ${d.name.substring(0, 40).padEnd(42)} ${url ? "✅" : "⏭️"}\n`);
      if ((i + 1) % 10 === 0) saveResults();
    }
    saveResults();
    console.log(`\n${s2ok}/${s2.length} found\n`);
  }
  // ── Stage 3: Brave ───────────────────────────────────────────────────
  const s3 = pending.filter((d) => !results[d.id]);
  let s3ok = 0;
  if (s3.length > 0 && BRAVE_KEY) {
    console.log("─── Brave Image Search ───\n");
    let braveLookups = 0;
    for (const d of s3) {
      if (braveCount >= MAX_BRAVE) {
        results[d.id] = { url: "", source: "missing", quality: "missing" };
        continue;
      }
      const plantName = getPlantName(diseases, d.plantId);
      const url = await braveStage(d, plantName);
      if (url) {
        results[d.id] = { url, source: "brave", quality: "fallback" };
        s3ok++;
        process.stdout.write(`${d.name}\n`);
      } else {
        results[d.id] = { url: "", source: "missing", quality: "missing" };
        process.stdout.write(`${d.name}\n`);
      }
      await delay(BRAVE_DELAY);
      // Checkpoint like the other stages: this is the slowest, quota-limited
      // stage, so losing its progress on a crash is the most expensive.
      braveLookups++;
      if (braveLookups % 10 === 0) saveResults();
    }
    saveResults();
    console.log(`\n${s3ok}/${s3.length} found via Brave\n`);
  } else if (s3.length > 0) {
    console.log("─── Brave Image Search ─── → skipped (no key)\n");
    // Not persisted to RESULTS_FILE here, so a later run with a key retries them.
    for (const d of s3) results[d.id] = { url: "", source: "missing", quality: "missing" };
  }
  // ── Apply ───────────────────────────────────────────────────────────
  await applyResults(diseases, results);
  // ── Report ──────────────────────────────────────────────────────────
  const good = Object.values(results).filter((r) => r.quality === "good").length;
  const fallback = Object.values(results).filter((r) => r.quality === "fallback").length;
  const missing = Object.values(results).filter((r) => r.quality === "missing").length;
  let report = `# Disease Images — Human Review Needed\n\n`;
  report += `Generated: ${new Date().toISOString()}\n\n`;
  for (const [label, ids, type] of [
    ["Fallback (Brave)", Object.entries(results).filter(([, r]) => r.quality === "fallback").map(([id]) => id), "fallback"],
    ["Missing", Object.entries(results).filter(([, r]) => r.quality === "missing").map(([id]) => id), "missing"],
  ] as const) {
    if (ids.length === 0) continue;
    report += `## ${type === "fallback" ? "⚠️" : "🚫"} ${label}\n\n`;
    for (const id of ids) {
      const d = diseases.find((x) => x.id === id);
      const r = results[id];
      report += `- **${d?.name ?? id}** (${d?.scientificName ?? ""}) on \`${d?.plantId ?? ""}\``;
      if (r?.url) report += `\n ${r.url}`;
      report += `\n\n`;
    }
  }
  if (good === diseases.length) report += `## ✅ All images found!\n`;
  writeFileSync(REPORT_FILE, report, "utf-8");
  console.log(`📝 Review report: ${REPORT_FILE}`);
  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 Total: ${diseases.length} Good: ${good} Fallback: ${fallback} Missing: ${missing}`);
  console.log(` Brave calls: ${braveCount}`);
  console.log(`${"═".repeat(50)}\n`);
  closeDb();
}
// ─── Apply results to JSON + DB ──────────────────────────────────────────────
/**
 * Write the found image URLs into the seed JSON file and the database.
 *
 * Only results with a non-empty `url` whose ID exists in `diseases` are
 * applied. diseases.json receives both `imageUrl` and `imageQuality`; the DB
 * update sets `image_url` only, in batches of 50 parameterized statements.
 *
 * The DB step is best-effort: it is skipped when DATABASE_URL or
 * DATABASE_TOKEN is unset, and any DB error is logged rather than thrown. The
 * client is closed even when a batch fails.
 *
 * @param diseases Seed entries currently loaded from diseases.json.
 * @param results Lookup results keyed by disease ID.
 */
async function applyResults(diseases: DiseaseSeed[], results: Record<string, ImageResult>) {
  // Set lookup instead of scanning the whole disease list for every result.
  const knownIds = new Set(diseases.map((d) => d.id));
  const urlMap = new Map(
    Object.entries(results).filter(
      // The results file is parsed without validation, so tolerate entries
      // that lack a string `url` instead of throwing on `.length`.
      ([id, r]) => typeof r?.url === "string" && r.url.length > 0 && knownIds.has(id),
    ),
  );
  if (urlMap.size === 0) return console.log("⏭️ No images to apply");
  // JSON
  let n = 0;
  const updated = diseases.map((d) => {
    const img = urlMap.get(d.id);
    if (img) { n++; return { ...d, imageUrl: img.url, imageQuality: img.quality }; }
    return d;
  });
  writeFileSync(DISEASES_JSON, JSON.stringify(updated, null, 2) + "\n");
  console.log(`✅ diseases.json: ${n} images`);
  // DB
  try {
    const dbUrl = process.env.DATABASE_URL;
    const dbToken = process.env.DATABASE_TOKEN;
    if (!dbUrl || !dbToken) return console.log(" ⏭️ DB: no DATABASE_URL/TOKEN");
    const raw = createClient({ url: dbUrl, authToken: dbToken });
    const entries = Array.from(urlMap.entries());
    try {
      for (let i = 0; i < entries.length; i += 50) {
        await raw.batch(
          entries.slice(i, i + 50).map(([id, img]) => ({
            sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
            args: [img.url, id],
          })),
          "write",
        );
      }
    } finally {
      // Release the connection even when a batch throws.
      raw.close();
    }
    console.log(`✅ Turso DB: ${entries.length} rows`);
  } catch (err) {
    console.log(` ⚠️ DB: ${err instanceof Error ? err.message : err}`);
  }
}
// Run the pipeline; any rejection from main() is logged and the process exits with status 1.
main().catch((fatal) => {
  console.error("\n❌", fatal);
  process.exit(1);
});