#!/usr/bin/env node
/**
 * fill-disease-images.ts — Three-stage disease image pipeline
 *
 * For every disease without an imageUrl, tries:
 *   Stage 1 — Wikipedia search → pageimages
 *   Stage 2 — Wikimedia Commons search
 *   Stage 3 — Brave Image Search API (fallback, 1 req/sec, 2000/mo)
 *
 * Updates both diseases.json (seed) and the Turso DB.
 * Flags anything found only via Brave for human review.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-disease-images.ts
 */
|
|
|
|
import "dotenv/config";
|
|
import { readFileSync, writeFileSync, existsSync } from "fs";
|
|
import { resolve } from "path";
|
|
import { createClient } from "@libsql/client";
|
|
import { closeDb } from "../src/lib/db/index";
|
|
|
|
// ─── Types & Config ──────────────────────────────────────────────────────────
|
|
|
|
/**
 * One record of the diseases.json seed file.
 *
 * Only the fields this script touches are declared explicitly; every other
 * property is carried through unchanged via the index signature.
 */
interface DiseaseSeed {
  /** Disease identifier; used as the key of the results checkpoint and in the DB update. */
  id: string;
  /** Identifier of the host plant this disease belongs to. */
  plantId: string;
  /** Display name of the disease; used to build the search queries. */
  name: string;
  /** Scientific name of the pathogen; preferred for the Commons query when usable. */
  scientificName: string;
  commonName?: string;
  /** Image URL. A non-empty value means the disease is skipped by the pipeline. */
  imageUrl?: string;
  /** Quality label written next to imageUrl when results are applied to the seed. */
  imageQuality?: "good" | "fallback" | "missing";
  /** Any additional seed fields, preserved as-is. */
  [key: string]: unknown;
}
|
|
|
|
/** Outcome of the image search for one disease, as stored in the results checkpoint file. */
interface ImageResult {
  /** Image URL; the empty string when no stage produced one. */
  url: string;
  /** Pipeline stage that produced the URL, or "missing" when none did. */
  source: "wikipedia" | "commons" | "brave" | "missing";
  /** "good" for Wikipedia/Commons hits, "fallback" for Brave hits (flagged for review), "missing" otherwise. */
  quality: "good" | "fallback" | "missing";
}
|
|
|
|
/** Seed file that is read at start-up and rewritten when results are applied. */
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
/** Checkpoint of per-disease search results, so an interrupted run can resume. */
const RESULTS_FILE = resolve(__dirname, ".image-results.json");
/** Markdown report listing fallback and missing images for human review. */
const REPORT_FILE = resolve(__dirname, ".image-review-needed.md");

const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
/** Brave API key; the Brave stage is skipped when this is empty. */
const BRAVE_KEY = process.env.BRAVE_API_KEY ?? "";
/** Pause between Brave requests in milliseconds (the plan allows 1 request per second). */
const BRAVE_DELAY = 1100;
/**
 * Upper bound on Brave requests (the plan allows 2000 per month).
 * NOTE(review): braveCount starts at 0 on every run, so this caps a single run,
 * not the calendar month.
 */
const MAX_BRAVE = 2000;
/** User-Agent sent to the Wikimedia APIs. */
const UA = "PlantHealthKB/1.0 (plant-disease-id)";
/** Value of the `origin` parameter sent to the MediaWiki APIs. */
const ORIGIN = "*";

/** Number of successful Brave responses received during this run. */
let braveCount = 0;
|
|
|
|
// ─── Wikipedia Stage ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Search Wikipedia and fetch thumbnails in ONE API call (generator=search
 * combined with prop=pageimages).
 *
 * The `pages` object of the response is keyed by page id, so iterating it
 * directly yields results in page-id order, not relevance order. Pages are
 * therefore sorted by the `index` rank the search generator attaches to each
 * page before the first available thumbnail is picked.
 *
 * HTTP 429 is retried with exponential backoff (3s, 6s, 12s) and thrown
 * errors (network failure, timeout, invalid JSON) are retried after 2s, for
 * at most three attempts in total.
 *
 * @param query Full-text search query.
 * @returns Thumbnail URL of the best-ranked page that has one, or null when
 *          nothing was found, the API answered with a non-OK status, or all
 *          attempts failed.
 */
async function wikiSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "5",
    prop: "pageimages",
    pithumbsize: "600",
    format: "json",
    origin: ORIGIN,
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${WIKI_API}?${params.toString()}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: {
          pages?: Record<string, { index?: number; thumbnail?: { source?: string } }>;
        };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;

      // Pages without a rank (not expected from generator=search) sort last.
      const unranked = Number.MAX_SAFE_INTEGER;
      const ranked = Object.values(pages).sort(
        (a, b) => (a?.index ?? unranked) - (b?.index ?? unranked),
      );
      for (const page of ranked) {
        const src = page?.thumbnail?.source;
        if (src) return src;
      }
      return null;
    } catch {
      // Best-effort: network errors and timeouts are retried, never surfaced.
      await delay(2000);
    }
  }
  return null;
}
|
|
|
|
/**
 * Stage 1: look for a Wikipedia thumbnail for a disease.
 *
 * Issues a single search for the disease name as an exact phrase followed by
 * the plant name. Double quotes inside the disease name are removed first so
 * they cannot terminate the phrase early and unbalance the query.
 *
 * @param d Disease to search for.
 * @param plantName Host plant name appended to the query.
 * @returns Thumbnail URL, or null when nothing was found.
 */
async function wikiStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const phrase = d.name.replace(/"/g, " ").replace(/\s+/g, " ").trim();
  return wikiSearchAndThumb(`"${phrase}" ${plantName}`);
}
|
|
|
|
// ─── Commons Stage ───────────────────────────────────────────────────────────
|
|
|
|
/**
 * fetch() with a timeout: the request is aborted after `ms` milliseconds.
 *
 * A signal supplied by the caller in `opts.signal` is honoured as well:
 * aborting it aborts the request (previously it was silently replaced by the
 * internal timeout signal).
 *
 * The timer is cleared as soon as fetch() settles, so the timeout covers the
 * time until the response headers arrive, not the later reading of the body.
 *
 * @param url Request URL.
 * @param opts Standard fetch options.
 * @param ms Timeout in milliseconds (default 15000).
 * @returns The fetch Response.
 * @throws Whatever fetch() rejects with, including the abort error on timeout.
 */
async function fetchWithTimeout(url: string, opts: RequestInit, ms = 15000): Promise<Response> {
  const ctrl = new AbortController();
  const upstream = opts.signal ?? undefined;
  const forwardAbort = (): void => ctrl.abort();

  if (upstream) {
    if (upstream.aborted) ctrl.abort();
    else upstream.addEventListener("abort", forwardAbort, { once: true });
  }

  const timer = setTimeout(() => ctrl.abort(), ms);
  try {
    // `return await` keeps the finally block from running before fetch settles.
    return await fetch(url, { ...opts, signal: ctrl.signal });
  } finally {
    clearTimeout(timer);
    upstream?.removeEventListener("abort", forwardAbort);
  }
}
|
|
|
|
/**
 * Search Wikimedia Commons (File namespace) and return an image URL for the
 * best-ranked hit.
 *
 * Two requests are made: a full-text search limited to namespace 6, then one
 * batched imageinfo query for all hits. The imageinfo response is keyed by
 * page id, so hits are walked in their original search order and looked up
 * by id; iterating the response object directly would return results in
 * page-id order instead of relevance order.
 *
 * Files whose reported MIME type is not an image (audio, video, PDF, ...) are
 * skipped. HTTP 429 on either request is retried with exponential backoff and
 * thrown errors are retried after 2s, for at most three attempts in total.
 *
 * @param query Full-text search query.
 * @returns The 600px-wide thumbnail URL (or the original file URL when no
 *          thumbnail is offered) of the first usable hit, or null.
 */
async function commonsSearchAndThumb(query: string): Promise<string | null> {
  const searchParams = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: query,
    srnamespace: "6",
    srlimit: "5",
    format: "json",
    origin: ORIGIN,
  });
  const requestInit = { headers: { "User-Agent": UA } };

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${COMMONS_API}?${searchParams.toString()}`, requestInit);
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { search?: Array<{ pageid: number; title: string }> };
      };
      const hits = data?.query?.search ?? [];
      if (hits.length === 0) return null;

      // Batch-fetch imageinfo for all found page IDs.
      const imgParams = new URLSearchParams({
        action: "query",
        pageids: hits.map((h) => h.pageid).join("|"),
        prop: "imageinfo",
        iiprop: "url|mime",
        iiurlwidth: "600",
        format: "json",
        origin: ORIGIN,
      });
      const imgRes = await fetchWithTimeout(`${COMMONS_API}?${imgParams.toString()}`, requestInit);
      if (imgRes.status === 429) {
        // Rate-limited on the second request: back off and redo the attempt.
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!imgRes.ok) return null;

      const imgData = (await imgRes.json()) as {
        query?: {
          pages?: Record<
            string,
            { imageinfo?: Array<{ url?: string; thumburl?: string; mime?: string }> }
          >;
        };
      };
      const pages = imgData?.query?.pages;
      if (!pages) return null;

      for (const hit of hits) {
        const info = pages[String(hit.pageid)]?.imageinfo?.[0];
        if (!info) continue;
        if (info.mime !== undefined && !info.mime.startsWith("image/")) continue;
        const src = info.thumburl || info.url;
        if (src) return src;
      }
      return null;
    } catch {
      // Best-effort: network errors and timeouts are retried, never surfaced.
      await delay(2000);
    }
  }
  return null;
}
|
|
|
|
/**
 * Stage 2: look for a Wikimedia Commons image for a disease.
 *
 * The query uses the scientific name plus the plant name when the scientific
 * name is specific, i.e. non-blank and containing neither "spp." nor "/".
 * Otherwise it falls back to "<disease name> <plant name> disease".
 *
 * @param d Disease to search for.
 * @param plantName Host plant name appended to the query.
 * @returns Image URL, or null when nothing was found.
 */
async function commonsStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const scientific = (d.scientificName ?? "").trim();
  const isSpecific =
    scientific.length > 0 && !scientific.includes("spp.") && !scientific.includes("/");
  const query = isSpecific ? `${scientific} ${plantName}` : `${d.name} ${plantName} disease`;
  return commonsSearchAndThumb(query);
}
|
|
|
|
// ─── Brave Stage ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Stage 3: look for an image through the Brave Image Search API.
 *
 * Does nothing (returns null) when no API key is configured or the request
 * budget MAX_BRAVE is used up. braveCount is incremented once per successful
 * (2xx) response.
 *
 * Results hosted on stock-photo sites are avoided when an alternative exists.
 * The stock check looks at the page URL, the source domain, the original
 * image URL and the thumbnail URL, because the thumbnail alone may be a proxy
 * URL that does not contain the hosting site's name.
 * NOTE(review): field meanings (`url` = page URL, `properties.url` = original
 * image URL, `source` = domain) follow Brave's published response reference;
 * confirm against the API version in use.
 *
 * HTTP 429 is retried with exponential backoff (5s, 10s, 20s) and thrown
 * errors are retried after 2s, for at most three attempts in total.
 *
 * @param d Disease to search for.
 * @param plantName Host plant name added to the query.
 * @returns Image URL (thumbnail preferred), or null when nothing was found.
 */
async function braveStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  if (!BRAVE_KEY || braveCount >= MAX_BRAVE) return null;

  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", `${d.name} ${plantName} plant disease symptom`);
  url.searchParams.set("count", "5");

  type BraveHit = {
    url?: string;
    source?: string;
    thumbnail?: { src?: string };
    properties?: { url?: string };
  };
  const stockSites = ["dreamstime", "shutterstock", "alamy", "istock", "123rf"];
  const imageOf = (hit: BraveHit): string | undefined =>
    hit.thumbnail?.src ?? hit.properties?.url ?? hit.url;
  const isStock = (hit: BraveHit): boolean => {
    const haystack = [hit.url, hit.source, hit.properties?.url, hit.thumbnail?.src]
      .filter((part): part is string => typeof part === "string")
      .join(" ")
      .toLowerCase();
    return stockSites.some((site) => haystack.includes(site));
  };

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(url.toString(), {
        headers: { "X-Subscription-Token": BRAVE_KEY, Accept: "application/json" },
      });
      if (res.status === 429) {
        await delay(5000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      braveCount++;

      const data = (await res.json()) as { results?: BraveHit[] };
      const results = data?.results ?? [];
      if (results.length === 0) return null;

      // Prefer non-stock images; fall back to the first result with any URL.
      const preferred = results.find((hit) => Boolean(imageOf(hit)) && !isStock(hit));
      const chosen = preferred ?? results.find((hit) => Boolean(imageOf(hit)));
      return (chosen && imageOf(chosen)) || null;
    } catch {
      // Best-effort: network errors and timeouts are retried, never surfaced.
      await delay(2000);
    }
  }
  return null;
}
|
|
|
|
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Sleep helper.
 *
 * @param ms Time to wait in milliseconds.
 * @returns A promise that resolves (with no value) after `ms` milliseconds.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Read and parse the disease seed file (DISEASES_JSON).
 *
 * @returns The parsed list of diseases.
 * @throws If the file cannot be read, is not valid JSON, or does not contain
 *         a top-level array (instead of failing later with an obscure error).
 */
function loadDiseases(): DiseaseSeed[] {
  const parsed: unknown = JSON.parse(readFileSync(DISEASES_JSON, "utf-8"));
  if (!Array.isArray(parsed)) {
    throw new Error(`${DISEASES_JSON} must contain a JSON array of diseases`);
  }
  return parsed as DiseaseSeed[];
}
|
|
|
|
/**
 * Resolve a human-readable plant name for use in search queries.
 *
 * Looks for an entry in `diseases` whose `id` equals `diseaseId` and returns
 * its commonName, or else its name. When no entry matches (or the match has
 * no usable name) the identifier itself is returned with "-" and "_" turned
 * into spaces, so a slug such as "sweet-potato" becomes "sweet potato".
 *
 * NOTE(review): main() passes a disease's `plantId` here, but the list being
 * searched contains diseases, so a match only happens when some disease id
 * equals that plant id. In practice the slug fallback is probably what gets
 * used; a lookup against the plant data would be the proper fix — confirm.
 *
 * @param diseases List to search by `id`.
 * @param diseaseId Identifier to look up (callers pass a plant id).
 * @returns A non-empty name whenever `diseaseId` is non-empty.
 */
function getPlantName(diseases: DiseaseSeed[], diseaseId: string): string {
  const match = diseases.find((entry) => entry.id === diseaseId);
  const label = match?.commonName ?? match?.name;
  if (label) return label;

  const humanized = diseaseId.replace(/[-_]+/g, " ").trim();
  return humanized.length > 0 ? humanized : diseaseId;
}
|
|
|
|
// ─── Main ────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Entry point: find images for every disease that has none and record them.
 *
 * Flow:
 *  1. Load the seed file and the results checkpoint of a previous run.
 *  2. Run the pending diseases through Wikipedia, then Wikimedia Commons, then
 *     Brave; each stage only sees what the earlier stages left unresolved.
 *     The checkpoint is rewritten periodically so an interrupted run resumes.
 *  3. Apply the found URLs to diseases.json and the database.
 *  4. Write a Markdown report of fallback/missing images and print a summary.
 *
 * Notes on behaviour:
 *  - closeDb() runs on every exit path (normal, early return, error).
 *  - Diseases that Brave never got to search (no API key, or the request
 *    budget ran out) are reported as missing for this run but are not written
 *    to the checkpoint, so a later run retries them.
 *  - The per-disease Commons deadline timer is cleared once the race settles,
 *    so it cannot keep the process alive after the work is done.
 */
async function main(): Promise<void> {
  console.log("\n🔍 Plant Disease Image Filler\n");

  try {
    const diseases = loadDiseases();
    console.log(`📋 ${diseases.length} diseases loaded\n`);

    // Load existing results; an unreadable or malformed checkpoint means a fresh start.
    let results: Record<string, ImageResult> = {};
    if (existsSync(RESULTS_FILE)) {
      try {
        const parsed: unknown = JSON.parse(readFileSync(RESULTS_FILE, "utf-8"));
        if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
          results = parsed as Record<string, ImageResult>;
        }
      } catch {
        /* fresh */
      }
    }

    const checkpoint = (): void => {
      writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
    };
    const notFound = (): ImageResult => ({ url: "", source: "missing", quality: "missing" });
    const logProgress = (done: number, total: number, name: string, found: boolean): void => {
      const pct = ((done / total) * 100).toFixed(0);
      process.stdout.write(
        ` [${pct}% ${done}/${total}] ${name.substring(0, 40).padEnd(42)} ${found ? "✅" : "⏭️"}\n`,
      );
    };

    const pending = diseases.filter((d) => {
      const existing = d.imageUrl;
      if (typeof existing === "string" && existing.length > 0) return false;
      return !results[d.id];
    });

    if (pending.length === 0) {
      console.log("✅ All done\n");
      await applyResults(diseases, results);
      return;
    }

    console.log(`⏳ ${pending.length} need images\n`);

    // ── Stage 1: Wikipedia ──────────────────────────────────────────────
    const s1 = pending;
    let s1ok = 0;
    console.log("─── Wikipedia ───\n");

    for (const [i, d] of s1.entries()) {
      const plantName = getPlantName(diseases, d.plantId);
      const url = await wikiStage(d, plantName);
      if (url) {
        results[d.id] = { url, source: "wikipedia", quality: "good" };
        s1ok++;
      }
      logProgress(i + 1, s1.length, d.name, Boolean(url));
      if ((i + 1) % 25 === 0) checkpoint();
    }

    checkpoint();
    console.log(`\n → ${s1ok}/${s1.length} found\n`);

    // ── Stage 2: Commons ─────────────────────────────────────────────────
    const s2 = pending.filter((d) => !results[d.id]);
    let s2ok = 0;

    if (s2.length > 0) {
      console.log("─── Wikimedia Commons ───\n");
      for (const [i, d] of s2.entries()) {
        const plantName = getPlantName(diseases, d.plantId);
        let url: string | null = null;
        let deadline: ReturnType<typeof setTimeout> | undefined;
        try {
          // Hard 25s cap per disease on top of the per-request timeouts.
          url = await Promise.race([
            commonsStage(d, plantName),
            new Promise<null>((_, reject) => {
              deadline = setTimeout(() => reject(new Error("timeout")), 25000);
            }),
          ]);
        } catch {
          /* timeout */
        } finally {
          clearTimeout(deadline);
        }
        if (url) {
          results[d.id] = { url, source: "commons", quality: "good" };
          s2ok++;
        }
        logProgress(i + 1, s2.length, d.name, Boolean(url));
        if ((i + 1) % 10 === 0) checkpoint();
      }
      checkpoint();
      console.log(`\n → ${s2ok}/${s2.length} found\n`);
    }

    // ── Stage 3: Brave ───────────────────────────────────────────────────
    const s3 = pending.filter((d) => !results[d.id]);
    let s3ok = 0;

    if (s3.length > 0 && BRAVE_KEY) {
      console.log("─── Brave Image Search ───\n");
      let attempted = 0;
      for (const d of s3) {
        if (braveCount >= MAX_BRAVE) break;
        const plantName = getPlantName(diseases, d.plantId);
        const url = await braveStage(d, plantName);
        if (url) {
          results[d.id] = { url, source: "brave", quality: "fallback" };
          s3ok++;
          process.stdout.write(` ✅ ${d.name}\n`);
        } else {
          results[d.id] = notFound();
          process.stdout.write(` ❌ ${d.name}\n`);
        }
        attempted++;
        if (attempted % 10 === 0) checkpoint();
        await delay(BRAVE_DELAY);
      }
      checkpoint();
      // Budget exhausted before these were searched: count them as missing in
      // this run's report only (after the checkpoint) so the next run retries.
      for (const d of s3) {
        if (!results[d.id]) results[d.id] = notFound();
      }
      console.log(`\n → ${s3ok}/${s3.length} found via Brave\n`);
    } else if (s3.length > 0) {
      console.log("─── Brave Image Search ─── → skipped (no key)\n");
      // In-memory only, for the report; nothing was searched.
      for (const d of s3) results[d.id] = notFound();
    }

    // ── Apply ───────────────────────────────────────────────────────────
    await applyResults(diseases, results);

    // ── Report ──────────────────────────────────────────────────────────
    const countBy = (quality: ImageResult["quality"]): number =>
      Object.values(results).filter((r) => r.quality === quality).length;
    const good = countBy("good");
    const fallback = countBy("fallback");
    const missing = countBy("missing");

    const byId = new Map(diseases.map((d) => [d.id, d] as const));
    const sections: Array<{ heading: string; quality: ImageResult["quality"] }> = [
      { heading: "⚠️ Fallback (Brave)", quality: "fallback" },
      { heading: "🚫 Missing", quality: "missing" },
    ];

    let report = `# Disease Images — Human Review Needed\n\n`;
    report += `Generated: ${new Date().toISOString()}\n\n`;

    for (const { heading, quality } of sections) {
      const flagged = Object.entries(results).filter(([, r]) => r.quality === quality);
      if (flagged.length === 0) continue;
      report += `## ${heading}\n\n`;
      for (const [id, r] of flagged) {
        const d = byId.get(id);
        report += `- **${d?.name ?? id}** (${d?.scientificName ?? ""}) on \`${d?.plantId ?? ""}\``;
        if (r?.url) report += `\n ${r.url}`;
        report += `\n\n`;
      }
    }

    // Nothing needs review when no result is a fallback or missing.
    if (fallback === 0 && missing === 0) report += `## ✅ All images found!\n`;
    writeFileSync(REPORT_FILE, report, "utf-8");
    console.log(`📝 Review report: ${REPORT_FILE}`);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 Total: ${diseases.length} Good: ${good} Fallback: ${fallback} Missing: ${missing}`);
    console.log(` Brave calls: ${braveCount}`);
    console.log(`${"═".repeat(50)}\n`);
  } finally {
    closeDb();
  }
}
|
|
|
|
// ─── Apply results to JSON + DB ──────────────────────────────────────────────
|
|
|
|
/**
 * Write found image URLs to the seed file and to the database.
 *
 * Only results with a non-empty URL whose id exists in `diseases` are
 * applied. The seed file gets `imageUrl` and `imageQuality`; the database
 * gets `image_url`, updated in batches of 50 statements.
 *
 * The database step is best-effort: it is skipped when DATABASE_URL or
 * DATABASE_TOKEN is unset, and any database error is logged instead of
 * thrown. The client is closed even when a batch fails.
 *
 * @param diseases Full disease list as loaded from the seed file.
 * @param results Search results keyed by disease id.
 */
async function applyResults(
  diseases: DiseaseSeed[],
  results: Record<string, ImageResult>,
): Promise<void> {
  const knownIds = new Set(diseases.map((d) => d.id));
  const urlMap = new Map(
    Object.entries(results).filter(
      ([id, r]) => typeof r?.url === "string" && r.url.length > 0 && knownIds.has(id),
    ),
  );
  if (urlMap.size === 0) {
    console.log("⏭️ No images to apply");
    return;
  }

  // JSON
  let n = 0;
  const updated = diseases.map((d) => {
    const img = urlMap.get(d.id);
    if (!img) return d;
    n++;
    return { ...d, imageUrl: img.url, imageQuality: img.quality };
  });
  writeFileSync(DISEASES_JSON, JSON.stringify(updated, null, 2) + "\n");
  console.log(`✅ diseases.json: ${n} images`);

  // DB
  const dbUrl = process.env.DATABASE_URL;
  const dbToken = process.env.DATABASE_TOKEN;
  if (!dbUrl || !dbToken) {
    console.log(" ⏭️ DB: no DATABASE_URL/TOKEN");
    return;
  }

  try {
    const raw = createClient({ url: dbUrl, authToken: dbToken });
    const entries = Array.from(urlMap.entries());
    try {
      for (let i = 0; i < entries.length; i += 50) {
        await raw.batch(
          entries.slice(i, i + 50).map(([id, img]) => ({
            sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
            args: [img.url, id],
          })),
          "write",
        );
      }
    } finally {
      // Release the connection whether or not every batch succeeded.
      raw.close();
    }
    console.log(`✅ Turso DB: ${entries.length} rows`);
  } catch (err) {
    console.log(` ⚠️ DB: ${err instanceof Error ? err.message : err}`);
  }
}
|
|
|
|
// Run the pipeline; if it rejects, print the error and exit with status 1.
main().catch((err) => { console.error("\n❌", err); process.exit(1); });
|