#!/usr/bin/env node
/**
 * fill-ddg-images.ts — DuckDuckGo Image Search for remaining disease images.
 *
 * No API key needed. Searches DuckDuckGo Images for each disease
 * without an image and updates the Turso DB.
 *
 * Prioritizes by severity (critical → high → moderate → low).
 * Waits POLITE_DELAY ms after every search request to be polite to DuckDuckGo.
 * Resumable via state file (scripts/.ddg-progress.json).
 *
 * Usage:
 *   cd apps/web && npx tsx scripts/fill-ddg-images.ts
 */

// NOTE: import declarations are hoisted, so the imported modules are evaluated
// before the .env loader below runs. The imports are kept in their original
// relative order.
import { readFileSync, renameSync, writeFileSync } from "fs";
import { resolve } from "path";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
// DuckDuckGo
import { imageSearch } from "@mudbill/duckduckgo-images-api";

// ─── Env ─────────────────────────────────────────────────────────────────────

/** Removes one pair of matching surrounding quotes (`"…"` or `'…'`), if present. */
function stripQuotes(value: string): string {
  if (value.length >= 2) {
    const first = value[0];
    if ((first === '"' || first === "'") && value[value.length - 1] === first) {
      return value.slice(1, -1);
    }
  }
  return value;
}

// Load .env.development for DB creds. Variables already present in the
// environment take precedence over the file.
const envPath = resolve(__dirname, "../.env.development");
try {
  for (const line of readFileSync(envPath, "utf-8").split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    const eqIdx = trimmed.indexOf("=");
    if (eqIdx <= 0) continue;
    const key = trimmed.slice(0, eqIdx).trim();
    const val = stripQuotes(trimmed.slice(eqIdx + 1).trim());
    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // A missing env file is fine (creds may come from the real environment);
  // anything else is worth surfacing but is still not fatal.
  const code = (err as NodeJS.ErrnoException | undefined)?.code;
  if (code !== "ENOENT") {
    console.warn(`⚠️ Could not read ${envPath}:`, err);
  }
}

// ─── Types ───────────────────────────────────────────────────────────────────

interface DiseaseRow {
  id: string;
  name: string;
  scientificName: string;
  severity: string;
  plantId: string;
}

/** Persisted progress so an interrupted run can resume where it stopped. */
interface RunState {
  processedIds: string[];
  totalFound: number;
}

/** A found image waiting to be written to the DB. */
interface ImageUpdate {
  id: string;
  url: string;
}

type RawClient = ReturnType<typeof createClient>;

// ─── Config ──────────────────────────────────────────────────────────────────

const POLITE_DELAY = 800; // ms to wait after every search request
const DB_FLUSH_BATCH = 50; // flush once this many found images are pending
const CHECKPOINT_INTERVAL = 50; // persist progress every N processed diseases
const STATE_FILE = resolve(__dirname, ".ddg-progress.json");

/** Severity labels in processing order; unknown severities sort last. */
const SEVERITY_PRIORITY = ["critical", "high", "moderate", "low"] as const;
const SEVERITY_RANK = new Map<string, number>(
  SEVERITY_PRIORITY.map((label, rank): [string, number] => [label, rank]),
);
const UNKNOWN_SEVERITY_RANK = 99;

/** Hosts whose images are skipped when a non-stock alternative exists. */
const STOCK_SITE_PATTERN = /(dreamstime|shutterstock|alamy|istock|123rf)/i;

// ─── State ───────────────────────────────────────────────────────────────────

/**
 * Reads the progress file.
 *
 * @returns The saved state, or `null` when the file is missing, unreadable,
 *   or not a JSON object. Missing or malformed fields fall back to an empty
 *   id list / zero so a partially valid file is still usable.
 */
function loadState(): RunState | null {
  try {
    const parsed: unknown = JSON.parse(readFileSync(STATE_FILE, "utf-8"));
    if (typeof parsed !== "object" || parsed === null) return null;
    const { processedIds, totalFound } = parsed as Partial<Record<keyof RunState, unknown>>;
    return {
      processedIds: Array.isArray(processedIds)
        ? processedIds.filter((id): id is string => typeof id === "string")
        : [],
      totalFound: typeof totalFound === "number" && Number.isFinite(totalFound) ? totalFound : 0,
    };
  } catch {
    return null;
  }
}

/**
 * Persists progress. Writes to a temp file and renames it over the state file
 * so an interrupted write cannot leave a truncated state file behind.
 */
function saveState(processedIds: string[], totalFound: number): void {
  const tmpFile = `${STATE_FILE}.tmp`;
  writeFileSync(tmpFile, JSON.stringify({ processedIds, totalFound }, null, 2), "utf-8");
  renameSync(tmpFile, STATE_FILE);
}

// ─── DuckDuckGo Search ───────────────────────────────────────────────────────

/**
 * Runs one DuckDuckGo image search.
 *
 * @param query Search phrase.
 * @returns URL of the first non-stock image; otherwise the first result's
 *   image or thumbnail; `null` when there are no results or the search throws.
 *
 * NOTE(review): a thrown search (block / timeout) is indistinguishable from
 * "no results" for the caller, so such diseases are still recorded as
 * processed — confirm that is intended.
 */
async function searchImage(query: string): Promise<string | null> {
  try {
    const results = await imageSearch({ query, safe: true, iterations: 1, retries: 2 });
    if (!results || results.length === 0) return null;

    // Prefer non-stock images
    for (const r of results) {
      if (r.image && !STOCK_SITE_PATTERN.test(r.image)) {
        return r.image;
      }
    }
    return results[0].image || results[0].thumbnail || null;
  } catch {
    // DuckDuckGo may block or timeout; silently skip
    return null;
  }
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => setTimeout(done, ms));
}

/** Writes all pending image URLs to the DB in one batch; no-op when empty. */
async function flushUpdates(client: RawClient, updates: ImageUpdate[]): Promise<void> {
  if (updates.length === 0) return;
  await client.batch(
    updates.map((u) => ({
      sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(` → Flushed ${updates.length} to DB`);
}

/** Builds the fallback search phrases for a disease, most specific first. */
function buildQueries(d: DiseaseRow): string[] {
  // "[disease] on [plant]" phrasing for better specificity
  const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
  return [
    `${d.name} on ${plantName} plant disease`,
    `${d.scientificName || d.name} on ${plantName} disease`,
    `${d.name} plant disease ${plantName}`,
    `${d.name} plant`,
    `${d.name} symptom`,
  ];
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Finds images for every disease that lacks one, writes them to the DB,
 * checkpoints progress, and prints / writes a final report.
 *
 * @throws Error when DATABASE_URL is not configured, or when a DB call fails.
 */
async function main(): Promise<void> {
  console.log("\n🦆 DuckDuckGo Disease Image Filler\n");

  const db = getDb();
  try {
    // Load state
    const state = loadState();
    const processedIds: string[] = state?.processedIds ?? [];
    const processedSet = new Set<string>(processedIds);
    const totalFoundPrev = state?.totalFound ?? 0;

    // Get all diseases that still need images
    const allDiseases = (await db
      .select({
        id: diseases.id,
        name: diseases.name,
        scientificName: diseases.scientificName,
        severity: diseases.severity,
        plantId: diseases.plantId,
      })
      .from(diseases)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all()) as DiseaseRow[];

    console.log(`📋 ${allDiseases.length} diseases need images\n`);

    if (allDiseases.length === 0) {
      console.log("✅ All diseases already have images!\n");
      return;
    }

    // Sort by severity: critical > high > moderate > low
    allDiseases.sort(
      (a, b) =>
        (SEVERITY_RANK.get(a.severity) ?? UNKNOWN_SEVERITY_RANK) -
        (SEVERITY_RANK.get(b.severity) ?? UNKNOWN_SEVERITY_RANK),
    );

    // Filter out already-processed
    const pending = allDiseases.filter((d) => !processedSet.has(d.id));

    // Single pass over pending instead of one filter per severity.
    const remaining = new Map<string, number>();
    for (const d of pending) {
      remaining.set(d.severity, (remaining.get(d.severity) ?? 0) + 1);
    }
    const summary = SEVERITY_PRIORITY.map((s) => `${s}=${remaining.get(s) ?? 0}`).join(", ");
    console.log(`📊 Remaining: ${summary}\n`);

    if (pending.length === 0) {
      console.log("✅ All remaining diseases already attempted\n");
      return;
    }

    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error(`DATABASE_URL is not set (checked the environment and ${envPath})`);
    }
    const raw = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    let found = totalFoundPrev;
    let updates: ImageUpdate[] = [];

    try {
      for (const [i, d] of pending.entries()) {
        const sev = d.severity.padEnd(8);

        process.stdout.write(
          ` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
        );

        // Try queries in order until we get a result
        let url: string | null = null;
        for (const q of buildQueries(d)) {
          url = await searchImage(q);
          // Be polite — pause after every request, including fallback queries
          await sleep(POLITE_DELAY);
          if (url) break;
        }

        processedIds.push(d.id);
        if (url) {
          updates.push({ id: d.id, url });
          found++;
          console.log("✅");
        } else {
          console.log("❌");
        }

        // Flush before every checkpoint so the state file never marks a
        // disease as processed while its found URL is still only in memory.
        const atCheckpoint = (i + 1) % CHECKPOINT_INTERVAL === 0;
        if (updates.length >= DB_FLUSH_BATCH || atCheckpoint) {
          await flushUpdates(raw, updates);
          updates = [];
        }
        if (atCheckpoint) {
          saveState(processedIds, found);
        }
      }

      // Final flush, then final checkpoint
      await flushUpdates(raw, updates);
      updates = [];
      saveState(processedIds, found);
    } finally {
      raw.close();
    }

    // Final report
    const finalList = await db
      .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
      .from(diseases)
      .all();

    const w = finalList.filter((d) => d.imageUrl);
    const wo = finalList.filter((d) => !d.imageUrl);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`🦆 DUCKDUCKGO SEARCH COMPLETE`);
    console.log(`${"═".repeat(50)}`);
    console.log(` Processed: ${pending.length}`);
    console.log(` Found this run: ${found - totalFoundPrev}`);
    console.log(` Total with images: ${w.length}/${finalList.length}`);
    console.log(` Still missing: ${wo.length}`);

    if (wo.length > 0) {
      const reportPath = resolve(__dirname, ".ddg-image-review-needed.md");
      let report = "# Disease Images - Still Missing (DDG)\n\n";
      report += `Generated: ${new Date().toISOString()}\n\n`;
      report += `## Summary\n\n`;
      report += `- Total: ${finalList.length}\n`;
      report += `- With images: ${w.length}\n`;
      report += `- Still missing: ${wo.length}\n\n`;
      report += `## Missing Diseases\n\n`;
      for (const d of wo) {
        report += `- ${d.name} (\`${d.id}\`)\n`;
      }
      writeFileSync(reportPath, report, "utf-8");
      console.log(`\n📝 Missing report: ${reportPath}`);
    } else {
      console.log("\n✅ ALL diseases now have images!");
    }
  } finally {
    closeDb();
  }

  console.log();
}

main().catch((err: unknown) => {
  console.error("\n❌ Fatal:", err);
  process.exit(1);
});