ooooeee
This commit is contained in:
440
apps/web/scripts/fill-disease-images.ts
Normal file
440
apps/web/scripts/fill-disease-images.ts
Normal file
@@ -0,0 +1,440 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-disease-images.ts — Three-stage disease image pipeline
|
||||
*
|
||||
* For every disease without an imageUrl, tries:
|
||||
* Stage 1 — Wikipedia search → pageimages
|
||||
* Stage 2 — Wikimedia Commons search
|
||||
* Stage 3 — Brave Image Search API (fallback, 1 req/sec, 2000/mo)
|
||||
*
|
||||
* Updates both diseases.json (seed) and the Turso DB.
|
||||
* Flags anything found only via Brave for human review.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-disease-images.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync, writeFileSync, existsSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { closeDb } from "../src/lib/db/index";
|
||||
|
||||
// ─── Types & Config ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One record of the diseases.json seed file, as far as this script reads it.
 *
 * Any additional seed fields (for example `imageUrl`) are carried through
 * untouched via the index signature and must be narrowed before use.
 */
interface DiseaseSeed {
  /** Key under which lookup results for this disease are stored. */
  id: string;
  /** Identifier of the host plant; passed to the plant-name lookup. */
  plantId: string;
  /** Display name of the disease; used to build the search queries. */
  name: string;
  /** Scientific name; used for the Wikimedia Commons query when it is specific enough. */
  scientificName: string;
  /** Optional common name. */
  commonName?: string;
  /** Every other seed field, passed through as-is. */
  [key: string]: unknown;
}
|
||||
|
||||
/**
 * Outcome of the image lookup for a single disease, as cached in the results
 * file and applied to the seed and database.
 */
interface ImageResult {
  /** Image URL; the empty string when nothing was found. */
  url: string;
  /** Stage that produced the URL, or "missing" when every stage came up empty. */
  source: "wikipedia" | "commons" | "brave" | "missing";
  /**
   * "good" for Wikipedia/Commons hits, "fallback" for Brave hits (these are
   * listed for human review), "missing" when there is no image.
   */
  quality: "good" | "fallback" | "missing";
}
|
||||
|
||||
// MediaWiki API endpoints used by stages 1 and 2.
const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";

// Sent with every MediaWiki request: User-Agent header and `origin` query parameter.
const UA = "PlantHealthKB/1.0 (plant-disease-id)";
const ORIGIN = "*";

// Brave Image Search settings. Without a key, stage 3 is skipped.
const BRAVE_KEY = process.env.BRAVE_API_KEY ?? "";
// Pause in milliseconds after each Brave lookup.
const BRAVE_DELAY = 1100;
// Upper bound on successful Brave calls within one run of this script.
const MAX_BRAVE = 2000;

// Files read and written by this script, resolved relative to the script directory.
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
const RESULTS_FILE = resolve(__dirname, ".image-results.json");
const REPORT_FILE = resolve(__dirname, ".image-review-needed.md");

// Number of successful Brave calls made so far in this run.
let braveCount = 0;
|
||||
|
||||
// ─── Wikipedia Stage ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Search Wikipedia and get thumbnails in ONE API call using generator=search.
 *
 * The API returns the pages as an object keyed by page ID, so plain object
 * iteration visits them in ascending page-ID order rather than by relevance.
 * Each generated page carries an `index` field with its search rank; pages
 * are sorted on it so the thumbnail of the best-ranked hit wins.
 *
 * HTTP 429 responses are retried with exponential back-off and network
 * errors/timeouts after a fixed pause, for at most three attempts in total.
 *
 * @param query - Search string passed as `gsrsearch`.
 * @returns The thumbnail URL of the best-ranked page that has one, or null
 *   when nothing was found, the request failed, or all attempts were used up.
 */
async function wikiSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "5",
    prop: "pageimages",
    pithumbsize: "600",
    format: "json",
    origin: ORIGIN,
  });
  const requestUrl = `${WIKI_API}?${params}`;

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(requestUrl, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        // Rate limited: wait 3 s, 6 s, 12 s before the next attempt.
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: {
          pages?: Record<string, { index?: number; thumbnail?: { source?: string } }>;
        };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;

      // Restore search ranking; pages without an index sort last.
      const ranked = Object.values(pages).sort(
        (a, b) => (a.index ?? Number.MAX_SAFE_INTEGER) - (b.index ?? Number.MAX_SAFE_INTEGER),
      );
      for (const page of ranked) {
        const src = page.thumbnail?.source;
        if (src) return src;
      }
      return null;
    } catch {
      // Network error, timeout or unparsable body: pause, then retry.
      await delay(2000);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Stage 1: look for a Wikipedia thumbnail for a disease.
 *
 * Builds a single query from the disease name wrapped in double quotes,
 * followed by the plant name, and delegates to wikiSearchAndThumb.
 *
 * @param d - Disease to find an image for.
 * @param plantName - Name of the host plant, appended to the query.
 * @returns Whatever wikiSearchAndThumb yields for the query (URL or null).
 */
async function wikiStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const query = `"${d.name}" ${plantName}`;
  return wikiSearchAndThumb(query);
}
|
||||
|
||||
// ─── Commons Stage ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * `fetch` with a deadline.
 *
 * An AbortController is wired into the request and triggered after `ms`
 * milliseconds. The timer is cleared as soon as `fetch` settles, so it only
 * covers the time until the Response object is available.
 *
 * @param url - Address to request.
 * @param opts - Regular fetch options; any `signal` in here is replaced.
 * @param ms - Deadline in milliseconds (default 15000).
 * @returns The Response produced by `fetch`.
 */
async function fetchWithTimeout(url: string, opts: RequestInit, ms = 15000): Promise<Response> {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), ms);
  try {
    return await fetch(url, { ...opts, signal: controller.signal });
  } finally {
    clearTimeout(abortTimer);
  }
}
|
||||
|
||||
/**
 * Search Wikimedia Commons for files (namespace 6) and return an image URL
 * for the best-ranked hit.
 *
 * Two requests are made: a `list=search` query, then one batched `imageinfo`
 * query for all found page IDs. The imageinfo response is an object keyed by
 * page ID, which loses the search ranking, so the hits are walked in their
 * original order and looked up in that object.
 *
 * HTTP 429 on the search request is retried with exponential back-off and
 * network errors/timeouts after a fixed pause, for at most three attempts.
 *
 * @param query - Search string passed as `srsearch`.
 * @returns The 600px-wide thumbnail URL (or, failing that, the original file
 *   URL) of the best-ranked hit that has image info; null when nothing was
 *   found or a request failed.
 */
async function commonsSearchAndThumb(query: string): Promise<string | null> {
  const searchParams = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: query,
    srnamespace: "6",
    srlimit: "5",
    format: "json",
    origin: ORIGIN,
  });
  const headers = { "User-Agent": UA };

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${COMMONS_API}?${searchParams}`, { headers });
      if (res.status === 429) {
        // Rate limited: wait 3 s, 6 s, 12 s before the next attempt.
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { search?: Array<{ pageid: number; title: string }> };
      };
      const hits = data?.query?.search ?? [];
      if (hits.length === 0) return null;

      // Batch-fetch imageinfo for all found page IDs
      const imgParams = new URLSearchParams({
        action: "query",
        pageids: hits.map((h) => h.pageid).join("|"),
        prop: "imageinfo",
        iiprop: "url",
        iiurlwidth: "600",
        format: "json",
        origin: ORIGIN,
      });
      const imgRes = await fetchWithTimeout(`${COMMONS_API}?${imgParams}`, { headers });
      if (!imgRes.ok) return null;

      const imgData = (await imgRes.json()) as {
        query?: {
          pages?: Record<string, { imageinfo?: Array<{ thumburl?: string; url?: string }> }>;
        };
      };
      const imgPages = imgData?.query?.pages;
      if (!imgPages) return null;

      // Walk the hits in search-rank order instead of page-ID order.
      for (const hit of hits) {
        const info = imgPages[String(hit.pageid)]?.imageinfo?.[0];
        const src = info?.thumburl || info?.url;
        if (src) return src;
      }
      return null;
    } catch {
      // Network error, timeout or unparsable body: pause, then retry.
      await delay(2000);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Stage 2: look for an image on Wikimedia Commons.
 *
 * The query is "<scientific name> <plant>" when the scientific name is
 * present and contains neither "spp." nor "/"; otherwise it falls back to
 * "<disease name> <plant> disease".
 *
 * @param d - Disease to find an image for.
 * @param plantName - Name of the host plant, appended to the query.
 * @returns The image URL found by commonsSearchAndThumb, or null.
 */
async function commonsStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const sci = d.scientificName;
  const sciIsSpecific = Boolean(sci) && !sci.includes("spp.") && !sci.includes("/");
  const q = sciIsSpecific ? `${sci} ${plantName}` : `${d.name} ${plantName} disease`;

  const found = await commonsSearchAndThumb(q);
  return found ?? null;
}
|
||||
|
||||
// ─── Brave Stage ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Stage 3: look for an image through the Brave Image Search API.
 *
 * Returns null immediately when no API key is configured or the per-run call
 * budget (MAX_BRAVE) is used up. Every successful response increments
 * `braveCount`. HTTP 429 is retried with exponential back-off and network
 * errors/timeouts after a fixed pause, for at most three attempts.
 *
 * Results whose page URL, source, image URL or thumbnail URL mention a known
 * stock-photo site are skipped when a non-stock alternative exists.
 * NOTE(review): per the Brave response schema, `url` appears to be the page
 * the image was found on and `properties.url` the image itself, while
 * `thumbnail.src` is a Brave-served proxy URL — confirm against the API docs.
 * That is why the filter looks beyond the thumbnail URL and why
 * `properties.url` is tried before `url` when there is no thumbnail.
 *
 * @param d - Disease to find an image for.
 * @param plantName - Name of the host plant, added to the query.
 * @returns An image URL, or null when nothing usable was found.
 */
async function braveStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  if (!BRAVE_KEY || braveCount >= MAX_BRAVE) return null;

  type BraveImage = {
    url?: string;
    source?: string;
    thumbnail?: { src?: string };
    properties?: { url?: string };
  };
  const stockMarkers = ["dreamstime", "shutterstock", "alamy", "istock", "123rf"];
  // Best available image URL for one result; `url` stays as a last resort.
  const imageOf = (r: BraveImage): string | undefined =>
    r.thumbnail?.src ?? r.properties?.url ?? r.url;
  // True when any URL-like field of the result points at a stock-photo site.
  const isStock = (r: BraveImage): boolean =>
    [r.url, r.source, r.properties?.url, r.thumbnail?.src].some(
      (field) =>
        typeof field === "string" &&
        stockMarkers.some((marker) => field.toLowerCase().includes(marker)),
    );

  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", `${d.name} ${plantName} plant disease symptom`);
  url.searchParams.set("count", "5");

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(url.toString(), {
        headers: { "X-Subscription-Token": BRAVE_KEY, Accept: "application/json" },
      });
      if (res.status === 429) {
        // Rate limited: wait 5 s, 10 s, 20 s before the next attempt.
        await delay(5000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      braveCount++;

      const data = (await res.json()) as { results?: BraveImage[] };
      const results = data?.results ?? [];
      if (results.length === 0) return null;

      // Prefer non-stock thumbnails
      for (const r of results) {
        const src = imageOf(r);
        if (src && !isStock(r)) return src;
      }
      // Only stock imagery came back: take the first result that has any image.
      return results.map(imageOf).find((src) => Boolean(src)) ?? null;
    } catch {
      // Network error, timeout or unparsable body: pause, then retry.
      await delay(2000);
    }
  }
  return null;
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Promise-based sleep.
 *
 * @param ms - Time to wait in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Read and parse the diseases.json seed file.
 *
 * The parsed value is checked to be an array so that a malformed seed fails
 * here with a clear message instead of surfacing later as an unrelated
 * TypeError. Individual records are not validated.
 *
 * @returns The seed records.
 * @throws Error when the file cannot be read, is not valid JSON, or its
 *   top-level value is not an array.
 */
function loadDiseases(): DiseaseSeed[] {
  const parsed: unknown = JSON.parse(readFileSync(DISEASES_JSON, "utf-8"));
  if (!Array.isArray(parsed)) {
    throw new Error(`${DISEASES_JSON} must contain a JSON array of diseases`);
  }
  return parsed as DiseaseSeed[];
}
|
||||
|
||||
/**
 * Resolve a display name for an id by scanning the given list.
 *
 * The first entry whose `id` equals `diseaseId` supplies the name: its
 * `commonName` if set, otherwise its `name`. When no entry matches, the id
 * itself is returned.
 *
 * NOTE(review): the callers in this file pass a disease's `plantId` here,
 * while the list being searched is the disease list. Unless plant ids
 * coincide with disease ids, this always falls through to returning the raw
 * id — confirm whether a plant list was meant to be searched instead.
 *
 * @param diseases - Entries to search.
 * @param diseaseId - Id to look up; also the fallback return value.
 * @returns The resolved name, or `diseaseId` when nothing matches.
 */
function getPlantName(diseases: DiseaseSeed[], diseaseId: string): string {
  for (const entry of diseases) {
    if (entry.id === diseaseId) {
      return entry.commonName ?? entry.name ?? diseaseId;
    }
  }
  return diseaseId;
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point: fill in missing disease images in three stages and report.
 *
 * 1. Loads the seed and the cached lookup results from a previous run.
 * 2. For every disease that has neither an `imageUrl` nor a cached result,
 *    tries Wikipedia, then Wikimedia Commons, then Brave (if a key is set).
 *    Results are checkpointed to RESULTS_FILE during every stage so an
 *    interrupted run can resume without repeating lookups.
 * 3. Applies the results to diseases.json and the database, writes the
 *    human-review report and prints a summary.
 *
 * `closeDb()` runs on every exit path, including the early "All done" return
 * and failures.
 */
async function main(): Promise<void> {
  console.log("\n🔍 Plant Disease Image Filler\n");

  try {
    const diseases = loadDiseases();
    console.log(`📋 ${diseases.length} diseases loaded\n`);

    // Load existing results
    let results: Record<string, ImageResult> = {};
    if (existsSync(RESULTS_FILE)) {
      try {
        const cached: unknown = JSON.parse(readFileSync(RESULTS_FILE, "utf-8"));
        // Ignore a cache file that parses but is not a plain object (e.g. `null`).
        if (cached !== null && typeof cached === "object" && !Array.isArray(cached)) {
          results = cached as Record<string, ImageResult>;
        }
      } catch { /* fresh */ }
    }

    const saveResults = (): void => {
      writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
    };
    const logProgress = (done: number, total: number, d: DiseaseSeed, found: boolean): void => {
      const pct = (done / total * 100).toFixed(0);
      process.stdout.write(` [${pct}% ${done}/${total}] ${d.name.substring(0, 40).padEnd(42)} ${found ? "✅" : "⏭️"}\n`);
    };

    const pending = diseases.filter((d) => {
      if ((d.imageUrl as string)?.length) return false;
      return !results[d.id];
    });

    if (pending.length === 0) {
      console.log("✅ All done\n");
      await applyResults(diseases, results);
      return;
    }

    console.log(`⏳ ${pending.length} need images\n`);

    // ── Stage 1: Wikipedia ──────────────────────────────────────────────
    // `pending` already excludes everything that has a cached result.
    const s1 = pending;
    let s1ok = 0;
    console.log("─── Wikipedia ───\n");

    for (let i = 0; i < s1.length; i++) {
      const d = s1[i];
      const plantName = getPlantName(diseases, d.plantId);
      const url = await wikiStage(d, plantName);
      if (url) {
        results[d.id] = { url, source: "wikipedia", quality: "good" };
        s1ok++;
      }
      logProgress(i + 1, s1.length, d, Boolean(url));
      if ((i + 1) % 25 === 0) saveResults();
    }

    saveResults();
    console.log(`\n → ${s1ok}/${s1.length} found\n`);

    // ── Stage 2: Commons ─────────────────────────────────────────────────
    const s2 = pending.filter((d) => !results[d.id]);
    let s2ok = 0;

    if (s2.length > 0) {
      console.log("─── Wikimedia Commons ───\n");
      for (let i = 0; i < s2.length; i++) {
        const d = s2[i];
        const plantName = getPlantName(diseases, d.plantId);
        let url: string | null = null;
        let watchdog: ReturnType<typeof setTimeout> | undefined;
        try {
          url = await Promise.race([
            commonsStage(d, plantName),
            new Promise<null>((_, reject) => {
              watchdog = setTimeout(() => reject(new Error("timeout")), 25000);
            }),
          ]);
        } catch {
          /* timeout */
        } finally {
          // Without this every lookup would leave a 25 s timer behind that
          // keeps the process alive after the work is finished.
          clearTimeout(watchdog);
        }
        if (url) {
          results[d.id] = { url, source: "commons", quality: "good" };
          s2ok++;
        }
        logProgress(i + 1, s2.length, d, Boolean(url));

        if ((i + 1) % 10 === 0) saveResults();
      }
      saveResults();
      console.log(`\n → ${s2ok}/${s2.length} found\n`);
    }

    // ── Stage 3: Brave ───────────────────────────────────────────────────
    const s3 = pending.filter((d) => !results[d.id]);
    let s3ok = 0;

    if (s3.length > 0 && BRAVE_KEY) {
      console.log("─── Brave Image Search ───\n");
      let processed = 0;
      for (const d of s3) {
        if (braveCount >= MAX_BRAVE) {
          results[d.id] = { url: "", source: "missing", quality: "missing" };
          continue;
        }
        const plantName = getPlantName(diseases, d.plantId);
        const url = await braveStage(d, plantName);
        if (url) {
          results[d.id] = { url, source: "brave", quality: "fallback" };
          s3ok++;
          process.stdout.write(` ✅ ${d.name}\n`);
        } else {
          results[d.id] = { url: "", source: "missing", quality: "missing" };
          process.stdout.write(` ❌ ${d.name}\n`);
        }
        // Checkpoint like the other stages: Brave calls are metered, so a
        // crash must not throw away what has already been paid for.
        processed++;
        if (processed % 10 === 0) saveResults();
        await delay(BRAVE_DELAY);
      }
      saveResults();
      console.log(`\n → ${s3ok}/${s3.length} found via Brave\n`);
    } else if (s3.length > 0) {
      console.log("─── Brave Image Search ─── → skipped (no key)\n");
      // Not written to the cache file, so these are retried once a key is set.
      for (const d of s3) results[d.id] = { url: "", source: "missing", quality: "missing" };
    }

    // ── Apply ───────────────────────────────────────────────────────────
    await applyResults(diseases, results);

    // ── Report ──────────────────────────────────────────────────────────
    const good = Object.values(results).filter((r) => r.quality === "good").length;
    const fallback = Object.values(results).filter((r) => r.quality === "fallback").length;
    const missing = Object.values(results).filter((r) => r.quality === "missing").length;

    let report = `# Disease Images — Human Review Needed\n\n`;
    report += `Generated: ${new Date().toISOString()}\n\n`;

    for (const [label, ids, type] of [
      ["Fallback (Brave)", Object.entries(results).filter(([, r]) => r.quality === "fallback").map(([id]) => id), "fallback"],
      ["Missing", Object.entries(results).filter(([, r]) => r.quality === "missing").map(([id]) => id), "missing"],
    ] as const) {
      if (ids.length === 0) continue;
      report += `## ${type === "fallback" ? "⚠️" : "🚫"} ${label}\n\n`;
      for (const id of ids) {
        const d = diseases.find((x) => x.id === id);
        const r = results[id];
        report += `- **${d?.name ?? id}** (${d?.scientificName ?? ""}) on \`${d?.plantId ?? ""}\``;
        if (r?.url) report += `\n ${r.url}`;
        report += `\n\n`;
      }
    }

    // Diseases that already had an image never enter `results`, so comparing
    // `good` with the total would never match; "nothing left to review" is
    // the condition that matters here.
    if (fallback === 0 && missing === 0) report += `## ✅ All images found!\n`;
    writeFileSync(REPORT_FILE, report, "utf-8");
    console.log(`📝 Review report: ${REPORT_FILE}`);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 Total: ${diseases.length} Good: ${good} Fallback: ${fallback} Missing: ${missing}`);
    console.log(` Brave calls: ${braveCount}`);
    console.log(`${"═".repeat(50)}\n`);
  } finally {
    closeDb();
  }
}
|
||||
|
||||
// ─── Apply results to JSON + DB ──────────────────────────────────────────────
|
||||
|
||||
/**
 * Write the collected image URLs into diseases.json and the database.
 *
 * Only results with a non-empty URL whose id belongs to a known disease are
 * applied. The seed file is rewritten with `imageUrl` and `imageQuality` set
 * on the matching records. The database gets `image_url` updated in batches
 * of 50 statements, provided DATABASE_URL and DATABASE_TOKEN are set.
 * Database failures are logged and not rethrown; the client is closed
 * whether or not the update succeeded.
 *
 * @param diseases - All seed records; the records to rewrite.
 * @param results - Lookup results keyed by disease id.
 */
async function applyResults(diseases: DiseaseSeed[], results: Record<string, ImageResult>) {
  // Set lookup instead of scanning the disease list once per result.
  const knownIds = new Set(diseases.map((d) => d.id));
  const urlMap = new Map(
    Object.entries(results).filter(([id, r]) => r.url.length > 0 && knownIds.has(id)),
  );
  if (urlMap.size === 0) {
    console.log("⏭️ No images to apply");
    return;
  }

  // JSON
  let n = 0;
  const updated = diseases.map((d) => {
    const img = urlMap.get(d.id);
    if (!img) return d;
    n++;
    return { ...d, imageUrl: img.url, imageQuality: img.quality };
  });
  writeFileSync(DISEASES_JSON, JSON.stringify(updated, null, 2) + "\n");
  console.log(`✅ diseases.json: ${n} images`);

  // DB
  const dbUrl = process.env.DATABASE_URL;
  const dbToken = process.env.DATABASE_TOKEN;
  if (!dbUrl || !dbToken) {
    console.log(" ⏭️ DB: no DATABASE_URL/TOKEN");
    return;
  }

  let raw: ReturnType<typeof createClient> | undefined;
  try {
    raw = createClient({ url: dbUrl, authToken: dbToken });
    const entries = Array.from(urlMap.entries());
    for (let i = 0; i < entries.length; i += 50) {
      await raw.batch(
        entries.slice(i, i + 50).map(([id, img]) => ({
          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
          args: [img.url, id],
        })),
        "write",
      );
    }
    console.log(`✅ Turso DB: ${entries.length} rows`);
  } catch (err) {
    console.log(` ⚠️ DB: ${err instanceof Error ? err.message : err}`);
  } finally {
    // Release the connection even when a batch failed.
    raw?.close();
  }
}
|
||||
|
||||
// Run the script; any uncaught failure is printed and turned into exit code 1.
main().catch((err) => {
  console.error("\n❌", err);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user