Files
plant-disease-id/scripts/fill-ddg-images.ts
2026-06-08 16:42:04 -04:00

269 lines
8.5 KiB
JavaScript

#!/usr/bin/env node
/**
* fill-ddg-images.ts — DuckDuckGo Image Search for remaining disease images.
*
* No API key needed. Searches DuckDuckGo Images API for each disease
* without an image and updates the Turso DB.
*
* Prioritizes by severity (critical → high → moderate → low).
* Throttled by a fixed delay (POLITE_DELAY, roughly 1 request/sec) to be polite to DuckDuckGo.
* Resumable via state file (scripts/.ddg-progress.json).
*
* Usage:
* cd apps/web && npx tsx scripts/fill-ddg-images.ts
*/
import { readFileSync, renameSync, writeFileSync } from "fs";
import { resolve } from "path";
// Seed process.env with the DB credentials from .env.development.
// Variables that already hold a non-empty value in the environment win.
const envPath = resolve(__dirname, "../.env.development");
try {
  readFileSync(envPath, "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    // Ignore blank lines and `#` comments.
    .filter((entry) => entry !== "" && !entry.startsWith("#"))
    .forEach((entry) => {
      const sep = entry.indexOf("=");
      // A usable entry needs a non-empty name in front of the first "=".
      if (sep <= 0) return;
      const name = entry.slice(0, sep).trim();
      if (process.env[name]) return;
      process.env[name] = entry.slice(sep + 1).trim();
    });
} catch {}
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
// DuckDuckGo
import { imageSearch } from "@mudbill/duckduckgo-images-api";
/**
 * The columns of a `diseases` row that main() selects for every disease
 * still lacking an image.
 */
interface DiseaseRow {
// Primary key; used in the UPDATE's WHERE clause and recorded in the progress file.
id: string;
// Display name; the basis of every search query.
name: string;
// Used in one fallback query; the query builder falls back to `name` when this is empty.
scientificName: string;
// Processing priority: critical, high, moderate, low. Any other value sorts last.
severity: string;
// Hyphens are replaced with spaces and words title-cased to form the plant name
// used in search queries (presumably a slug such as "sweet-corn"; verify against the schema).
plantId: string;
}
// ─── Config ──────────────────────────────────────────────────────────────────
// Pause, in milliseconds, that the main loop awaits to throttle traffic to DuckDuckGo.
const POLITE_DELAY = 800; // ms between calls
// Number of found image URLs accumulated before they are written to the DB in one batch.
const DB_FLUSH_BATCH = 50;
// Progress file that makes the script resumable; lives next to this script.
const STATE_FILE = resolve(__dirname, ".ddg-progress.json");
/** Shape of the JSON document stored in STATE_FILE between runs. */
interface RunState {
// IDs of every disease already attempted, whether or not an image was found.
processedIds: string[];
// Running count of images found, carried over from previous runs.
totalFound: number;
}
/**
 * Read the resume state written by a previous run.
 *
 * The parsed JSON is validated before it is trusted: `processedIds` must be
 * an array of strings, and a missing or non-numeric `totalFound` is treated
 * as 0. A file that exists but does not match that shape is ignored with a
 * warning, so a damaged progress file cannot corrupt the run.
 *
 * @param stateFile Path of the progress file; defaults to STATE_FILE.
 * @returns The saved state, or `null` when there is no usable state file.
 */
function loadState(stateFile: string = STATE_FILE): RunState | null {
  let raw: string;
  try {
    raw = readFileSync(stateFile, "utf-8");
  } catch {
    // No progress file yet (first run) or it is unreadable: start fresh.
    return null;
  }

  try {
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed === "object" && parsed !== null) {
      const { processedIds, totalFound } = parsed as Record<string, unknown>;
      if (
        Array.isArray(processedIds) &&
        processedIds.every((id): id is string => typeof id === "string")
      ) {
        return {
          processedIds,
          totalFound:
            typeof totalFound === "number" && Number.isFinite(totalFound) ? totalFound : 0,
        };
      }
    }
  } catch {
    // Invalid JSON: handled by the warning below.
  }

  console.warn(`⚠️  Ignoring malformed state file: ${stateFile}`);
  return null;
}
/**
 * Persist the resume state as pretty-printed JSON.
 *
 * The document is written to a sibling temp file and then renamed over the
 * target, so an interruption mid-write cannot leave a truncated progress file.
 *
 * @param processedIds IDs of every disease attempted so far.
 * @param totalFound Running count of images found.
 * @param stateFile Path of the progress file; defaults to STATE_FILE.
 */
function saveState(
  processedIds: string[],
  totalFound: number,
  stateFile: string = STATE_FILE,
): void {
  const tmpFile = `${stateFile}.tmp`;
  writeFileSync(tmpFile, JSON.stringify({ processedIds, totalFound }, null, 2), "utf-8");
  renameSync(tmpFile, stateFile);
}
// ─── DuckDuckGo Search ───────────────────────────────────────────────────────
/**
 * Run one DuckDuckGo image search and pick a single image URL from it.
 *
 * The first hit whose image URL is not on a known stock-photo site wins;
 * if every hit is stock, the first hit's image (or thumbnail) is used.
 *
 * @param query Free-text search phrase.
 * @returns An image URL, or `null` when there are no hits or the search fails.
 */
async function searchImage(query: string): Promise<string | null> {
  try {
    const hits = await imageSearch({ query, safe: true, iterations: 1, retries: 2 });
    if (!hits || hits.length === 0) return null;

    // Favour anything that is not hosted by a stock-photo site.
    const stockSite = /(dreamstime|shutterstock|alamy|istock|123rf)/i;
    const preferred = hits.find((hit) => hit.image && !stockSite.test(hit.image));
    if (preferred) return preferred.image;

    const first = hits[0];
    return first.image || first.thumbnail || null;
  } catch {
    // A block or timeout from DuckDuckGo is treated like "no result".
    return null;
  }
}
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Find an image for every disease that has none and store its URL in the DB.
 *
 * Flow:
 *  1. Load the resume state and select all diseases with an empty `image_url`.
 *  2. Order them by severity and drop the ones already attempted.
 *  3. For each disease, try up to five progressively looser search queries,
 *     pausing POLITE_DELAY after every request.
 *  4. Checkpoint periodically: write the found URLs to the DB first, then
 *     save the progress file. The progress file therefore never lists a
 *     disease whose found URL has not been stored yet.
 *  5. Print a summary and, if anything is still missing, write a Markdown
 *     report next to this script.
 *
 * @throws Error when DATABASE_URL is not configured; DB and file-system
 *         errors propagate to the caller.
 */
async function main(): Promise<void> {
  console.log("\n🦆 DuckDuckGo Disease Image Filler\n");
  const db = getDb();

  // Load state (copied so the loaded object is never mutated).
  const state = loadState();
  const processedIds: string[] = [...(state?.processedIds ?? [])];
  const processedSet = new Set(processedIds);
  const totalFoundPrev = state?.totalFound ?? 0;

  // Get all diseases that still need images
  const allDiseases = (await db
    .select({
      id: diseases.id,
      name: diseases.name,
      scientificName: diseases.scientificName,
      severity: diseases.severity,
      plantId: diseases.plantId,
    })
    .from(diseases)
    .where(sql`(image_url IS NULL OR image_url = '')`)
    .all()) as DiseaseRow[];
  console.log(`📋 ${allDiseases.length} diseases need images\n`);
  if (allDiseases.length === 0) {
    console.log("✅ All diseases already have images!\n");
    closeDb();
    return;
  }

  // Sort by severity: critical > high > moderate > low (unknown values last)
  const severityOrder: Record<string, number> = { critical: 0, high: 1, moderate: 2, low: 3 };
  allDiseases.sort((a, b) => (severityOrder[a.severity] ?? 99) - (severityOrder[b.severity] ?? 99));

  // Filter out already-processed
  const pending = allDiseases.filter((d) => !processedSet.has(d.id));
  const remaining = (severity: string): number =>
    pending.filter((d) => d.severity === severity).length;
  console.log(
    `📊 Remaining: critical=${remaining("critical")}, ` +
      `high=${remaining("high")}, ` +
      `moderate=${remaining("moderate")}, ` +
      `low=${remaining("low")}\n`,
  );
  if (pending.length === 0) {
    console.log("✅ All remaining diseases already attempted\n");
    closeDb();
    return;
  }

  // Fail with a clear message instead of handing `undefined` to the client.
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set (environment or .env.development)");
  }
  const raw = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  const STATE_SAVE_INTERVAL = 50; // diseases between checkpoints
  let found = totalFoundPrev;
  let updates: Array<{ id: string; url: string }> = [];

  const sleep = (ms: number) => new Promise<void>((done) => setTimeout(done, ms));

  // Write pending URLs to the DB, then record progress. The order matters:
  // saving progress first could mark a disease as done while its URL is lost.
  const checkpoint = async (): Promise<void> => {
    if (updates.length > 0) {
      await raw.batch(
        updates.map((u) => ({
          sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
          args: [u.url, u.id],
        })),
        "write",
      );
      console.log(` → Flushed ${updates.length} to DB`);
      updates = [];
    }
    saveState(processedIds, found);
  };

  try {
    for (let i = 0; i < pending.length; i++) {
      const d = pending[i];
      const sev = d.severity.padEnd(8);

      // Build search queries — "[disease] on [plant]" phrasing for better specificity,
      // falling back to looser phrasings.
      const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
      const queries = [
        `${d.name} on ${plantName} plant disease`,
        `${d.scientificName || d.name} on ${plantName} disease`,
        `${d.name} plant disease ${plantName}`,
        `${d.name} plant`,
        `${d.name} symptom`,
      ];
      process.stdout.write(
        ` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
      );

      // Try queries in order until we get a result
      let url: string | null = null;
      for (const q of queries) {
        url = await searchImage(q);
        // Be polite — pause after every request, including fallback queries.
        await sleep(POLITE_DELAY);
        if (url) break;
      }

      // NOTE(review): searchImage returns null for both "no results" and
      // request failures, so a blocked run marks diseases as attempted.
      processedIds.push(d.id);
      if (url) {
        updates.push({ id: d.id, url });
        found++;
        console.log("✅");
      } else {
        console.log("❌");
      }

      if (updates.length >= DB_FLUSH_BATCH || (i + 1) % STATE_SAVE_INTERVAL === 0) {
        await checkpoint();
      }
    }

    // Final flush + state save
    await checkpoint();
  } finally {
    // Release the raw client even when a DB write or state save throws.
    raw.close();
  }

  // Final report
  const finalList = await db
    .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
    .from(diseases)
    .all();
  const w = finalList.filter((d) => d.imageUrl);
  const wo = finalList.filter((d) => !d.imageUrl);
  console.log(`\n${"═".repeat(50)}`);
  console.log(`🦆 DUCKDUCKGO SEARCH COMPLETE`);
  console.log(`${"═".repeat(50)}`);
  console.log(` Processed: ${pending.length}`);
  console.log(` Found this run: ${found - totalFoundPrev}`);
  console.log(` Total with images: ${w.length}/${finalList.length}`);
  console.log(` Still missing: ${wo.length}`);
  if (wo.length > 0) {
    const reportPath = resolve(__dirname, ".ddg-image-review-needed.md");
    let report = "# Disease Images - Still Missing (DDG)\n\n";
    report += `Generated: ${new Date().toISOString()}\n\n`;
    report += `## Summary\n\n`;
    report += `- Total: ${finalList.length}\n`;
    report += `- With images: ${w.length}\n`;
    report += `- Still missing: ${wo.length}\n\n`;
    report += `## Missing Diseases\n\n`;
    for (const d of wo) {
      report += `- ${d.name} (\`${d.id}\`)\n`;
    }
    writeFileSync(reportPath, report, "utf-8");
    console.log(`\n📝 Missing report: ${reportPath}`);
  } else {
    console.log("\n✅ ALL diseases now have images!");
  }
  closeDb();
  console.log();
}
// Start the run; any rejection from main() is printed and turned into exit code 1.
const onFatal = (reason: unknown): never => {
  console.error("\n❌ Fatal:", reason);
  process.exit(1);
};
main().catch(onFatal);