#!/usr/bin/env node
/**
 * fill-ddg-images.ts — DuckDuckGo Image Search for remaining disease images.
 *
 * No API key needed. Searches the DuckDuckGo Images API for each disease
 * without an image and updates the Turso DB.
 *
 * Prioritizes by severity (critical → high → moderate → low).
 * Runs at 1 request/sec to be polite to DuckDuckGo.
 * Resumable via state file (scripts/.ddg-progress.json).
 *
 * Usage:
 *   cd apps/web && npx tsx scripts/fill-ddg-images.ts
 */
|
|
|
|
import { readFileSync, writeFileSync } from "fs";
|
|
import { resolve } from "path";
|
|
|
|
// Load .env.development for DB creds
|
|
const envPath = resolve(__dirname, "../.env.development");
|
|
try {
|
|
const env = readFileSync(envPath, "utf-8");
|
|
for (const line of env.split("\n")) {
|
|
const trimmed = line.trim();
|
|
if (trimmed && !trimmed.startsWith("#")) {
|
|
const eqIdx = trimmed.indexOf("=");
|
|
if (eqIdx > 0) {
|
|
const key = trimmed.slice(0, eqIdx).trim();
|
|
const val = trimmed.slice(eqIdx + 1).trim();
|
|
if (!process.env[key]) process.env[key] = val;
|
|
}
|
|
}
|
|
}
|
|
} catch {}
|
|
|
|
import { getDb, closeDb } from "../src/lib/db/index";
|
|
import { diseases } from "../src/lib/db/schema";
|
|
import { createClient } from "@libsql/client";
|
|
import { sql } from "drizzle-orm";
|
|
|
|
// DuckDuckGo
|
|
import { imageSearch } from "@mudbill/duckduckgo-images-api";
|
|
|
|
interface DiseaseRow {
|
|
id: string;
|
|
name: string;
|
|
scientificName: string;
|
|
severity: string;
|
|
plantId: string;
|
|
}
|
|
|
|
// ─── Config ──────────────────────────────────────────────────────────────────
|
|
|
|
const POLITE_DELAY = 1100; // ms between calls
|
|
const DB_FLUSH_BATCH = 50;
|
|
const STATE_FILE = resolve(__dirname, ".ddg-progress.json");
|
|
|
|
interface RunState {
|
|
processedIds: string[];
|
|
totalFound: number;
|
|
}
|
|
|
|
function loadState(): RunState | null {
|
|
try {
|
|
return JSON.parse(readFileSync(STATE_FILE, "utf-8"));
|
|
} catch {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
function saveState(processedIds: string[], totalFound: number) {
|
|
writeFileSync(STATE_FILE, JSON.stringify({ processedIds, totalFound }, null, 2), "utf-8");
|
|
}
|
|
|
|
// ─── DuckDuckGo Search ───────────────────────────────────────────────────────
|
|
|
|
async function searchImage(query: string): Promise<string | null> {
|
|
try {
|
|
const results = await imageSearch({ query, safe: true, iterations: 1, retries: 2 });
|
|
if (!results || results.length === 0) return null;
|
|
|
|
// Prefer non-stock images
|
|
for (const r of results) {
|
|
if (r.image && !/(dreamstime|shutterstock|alamy|istock|123rf)/i.test(r.image)) {
|
|
return r.image;
|
|
}
|
|
}
|
|
return results[0].image || results[0].thumbnail || null;
|
|
} catch {
|
|
// DuckDuckGo may block or timeout; silently skip
|
|
return null;
|
|
}
|
|
}
|
|
|
|
// ─── Main ────────────────────────────────────────────────────────────────────
|
|
|
|
async function main() {
|
|
console.log("\n🦆 DuckDuckGo Disease Image Filler\n");
|
|
|
|
const db = getDb();
|
|
|
|
// Load state
|
|
const state = loadState();
|
|
const processedSet = new Set(state?.processedIds || []);
|
|
const totalFoundPrev = state?.totalFound ?? 0;
|
|
|
|
// Get all diseases that still need images
|
|
const allDiseases = (await db
|
|
.select({
|
|
id: diseases.id,
|
|
name: diseases.name,
|
|
scientificName: diseases.scientificName,
|
|
severity: diseases.severity,
|
|
plantId: diseases.plantId,
|
|
})
|
|
.from(diseases)
|
|
.where(sql`(image_url IS NULL OR image_url = '')`)
|
|
.all()) as DiseaseRow[];
|
|
|
|
console.log(`📋 ${allDiseases.length} diseases need images\n`);
|
|
|
|
if (allDiseases.length === 0) {
|
|
console.log("✅ All diseases already have images!\n");
|
|
closeDb();
|
|
return;
|
|
}
|
|
|
|
// Sort by severity: critical > high > moderate > low
|
|
const severityOrder: Record<string, number> = { critical: 0, high: 1, moderate: 2, low: 3 };
|
|
allDiseases.sort((a, b) => (severityOrder[a.severity] ?? 99) - (severityOrder[b.severity] ?? 99));
|
|
|
|
// Filter out already-processed
|
|
const pending = allDiseases.filter((d) => !processedSet.has(d.id));
|
|
|
|
console.log(
|
|
`📊 Remaining: critical=${allDiseases.filter((d) => d.severity === "critical" && !processedSet.has(d.id)).length}, ` +
|
|
`high=${allDiseases.filter((d) => d.severity === "high" && !processedSet.has(d.id)).length}, ` +
|
|
`moderate=${allDiseases.filter((d) => d.severity === "moderate" && !processedSet.has(d.id)).length}, ` +
|
|
`low=${allDiseases.filter((d) => d.severity === "low" && !processedSet.has(d.id)).length}\n`,
|
|
);
|
|
|
|
if (pending.length === 0) {
|
|
console.log("✅ All remaining diseases already attempted\n");
|
|
closeDb();
|
|
return;
|
|
}
|
|
|
|
const raw = createClient({
|
|
url: process.env.DATABASE_URL!,
|
|
authToken: process.env.DATABASE_TOKEN!,
|
|
});
|
|
|
|
const processedIds: string[] = state?.processedIds ?? [];
|
|
let found = totalFoundPrev;
|
|
let updates: Array<{ id: string; url: string }> = [];
|
|
|
|
for (let i = 0; i < pending.length; i++) {
|
|
const d = pending[i];
|
|
const sev = d.severity.padEnd(8);
|
|
|
|
// Build search query — "[disease] on [plant]" phrasing for better specificity
|
|
const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
|
|
const query1 = `${d.name} on ${plantName} plant disease`;
|
|
const query2 = `${d.scientificName || d.name} on ${plantName} disease`;
|
|
const query3 = `${d.name} plant disease ${plantName}`;
|
|
|
|
process.stdout.write(
|
|
` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
|
|
);
|
|
|
|
// Try queries in order until we get a result
|
|
let url: string | null = null;
|
|
for (const q of [query1, query2, query3]) {
|
|
url = await searchImage(q);
|
|
if (url) break;
|
|
}
|
|
|
|
if (url) {
|
|
updates.push({ id: d.id, url });
|
|
found++;
|
|
processedIds.push(d.id);
|
|
console.log("✅");
|
|
} else {
|
|
processedIds.push(d.id);
|
|
console.log("❌");
|
|
}
|
|
|
|
// Flush to DB in batches
|
|
if (updates.length >= DB_FLUSH_BATCH) {
|
|
await raw.batch(
|
|
updates.map((u) => ({
|
|
sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
|
|
args: [u.url, u.id],
|
|
})),
|
|
"write",
|
|
);
|
|
console.log(` → Flushed ${updates.length} to DB`);
|
|
updates = [];
|
|
}
|
|
|
|
// Save state every 50
|
|
if ((i + 1) % 50 === 0) {
|
|
saveState(processedIds, found);
|
|
}
|
|
|
|
// Be polite — 1 req/sec
|
|
await new Promise((r) => setTimeout(r, POLITE_DELAY));
|
|
}
|
|
|
|
// Final flush
|
|
if (updates.length > 0) {
|
|
await raw.batch(
|
|
updates.map((u) => ({
|
|
sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
|
|
args: [u.url, u.id],
|
|
})),
|
|
"write",
|
|
);
|
|
console.log(` → Flushed ${updates.length} to DB`);
|
|
}
|
|
|
|
saveState(processedIds, found);
|
|
raw.close();
|
|
|
|
// Final report
|
|
const finalList = await db
|
|
.select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
|
|
.from(diseases)
|
|
.all();
|
|
const w = finalList.filter((d) => d.imageUrl);
|
|
const wo = finalList.filter((d) => !d.imageUrl);
|
|
|
|
console.log(`\n${"═".repeat(50)}`);
|
|
console.log(`🦆 DUCKDUCKGO SEARCH COMPLETE`);
|
|
console.log(`${"═".repeat(50)}`);
|
|
console.log(` Processed: ${pending.length}`);
|
|
console.log(` Found this run: ${found - totalFoundPrev}`);
|
|
console.log(` Total with images: ${w.length}/${finalList.length}`);
|
|
console.log(` Still missing: ${wo.length}`);
|
|
|
|
if (wo.length > 0) {
|
|
const reportPath = resolve(__dirname, ".ddg-image-review-needed.md");
|
|
let report = "# Disease Images - Still Missing (DDG)\n\n";
|
|
report += `Generated: ${new Date().toISOString()}\n\n`;
|
|
report += `## Summary\n\n`;
|
|
report += `- Total: ${finalList.length}\n`;
|
|
report += `- With images: ${w.length}\n`;
|
|
report += `- Still missing: ${wo.length}\n\n`;
|
|
report += `## Missing Diseases\n\n`;
|
|
for (const d of wo) {
|
|
report += `- ${d.name} (\`${d.id}\`)\n`;
|
|
}
|
|
writeFileSync(reportPath, report, "utf-8");
|
|
console.log(`\n📝 Missing report: ${reportPath}`);
|
|
} else {
|
|
console.log("\n✅ ALL diseases now have images!");
|
|
}
|
|
|
|
closeDb();
|
|
console.log();
|
|
}
|
|
|
|
main().catch((err) => {
|
|
console.error("\n❌ Fatal:", err);
|
|
process.exit(1);
|
|
});
|