beepboop
This commit is contained in:
266
apps/web/scripts/fill-ddg-images.ts
Normal file
266
apps/web/scripts/fill-ddg-images.ts
Normal file
@@ -0,0 +1,266 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-ddg-images.ts — DuckDuckGo Image Search for remaining disease images.
|
||||
*
|
||||
* No API key needed. Searches DuckDuckGo Images API for each disease
|
||||
* without an image and updates the Turso DB.
|
||||
*
|
||||
* Prioritizes by severity (critical → high → moderate → low).
|
||||
* Runs at 1 request/sec to be polite to DuckDuckGo.
|
||||
* Resumable via state file (scripts/.ddg-progress.json).
|
||||
*
|
||||
* Usage:
|
||||
* cd apps/web && npx tsx scripts/fill-ddg-images.ts
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Load .env.development for DB creds.
//
// Minimal KEY=VALUE parser: blank lines and lines starting with "#" are
// skipped, and variables that are already set to a non-empty value in the
// real environment are never overridden.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    const eqIdx = trimmed.indexOf("=");
    // eqIdx === 0 means an empty key; -1 means there is no assignment at all.
    if (eqIdx <= 0) continue;

    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();

    // Strip one pair of matching surrounding quotes so that KEY="value" and
    // KEY='value' yield `value` rather than a string that still contains the
    // quote characters (which would corrupt a DB URL or token).
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // A missing file is expected (the variables may come from the real
  // environment), but any other read failure should not be hidden.
  const isMissingFile = err instanceof Error && (err as NodeJS.ErrnoException).code === "ENOENT";
  if (!isMissingFile) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`⚠ Could not read ${envPath}: ${reason}`);
  }
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// DuckDuckGo
|
||||
import { imageSearch } from "@mudbill/duckduckgo-images-api";
|
||||
|
||||
/**
 * One disease record as selected by `main()` when looking for rows that
 * still lack an image.
 */
interface DiseaseRow {
  /** Primary key; used as the resume-state key and in the UPDATE's WHERE clause. */
  id: string;
  /** Dash-separated plant identifier; turned into a title-cased plant name for the search query. */
  plantId: string;
  /** Common disease name; used in the search queries and the progress output. */
  name: string;
  /** Scientific name; the query falls back to `name` when this is empty. */
  scientificName: string;
  /** Priority bucket; "critical", "high", "moderate" and "low" are ordered explicitly, anything else sorts last. */
  severity: string;
}
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Pause between DuckDuckGo calls, in milliseconds (slightly over one second). */
const POLITE_DELAY = 1_100;

/** Number of queued image updates that triggers a batched write to the DB. */
const DB_FLUSH_BATCH = 50;

/** Resume checkpoint, stored next to this script. */
const STATE_FILE = resolve(__dirname, ".ddg-progress.json");
|
||||
|
||||
/** Resume checkpoint persisted to the state file between runs. */
interface RunState {
  /** Running count of diseases for which an image URL was found, carried across runs. */
  totalFound: number;
  /** Ids of every disease already attempted, whether or not an image was found. */
  processedIds: string[];
}
|
||||
|
||||
/**
 * Reads the resume checkpoint left by a previous run.
 *
 * The file content is untrusted (it may be truncated or hand-edited), so the
 * parsed value is normalised instead of being cast blindly: a non-array
 * `processedIds` becomes an empty list, non-string ids are dropped, and a
 * non-numeric `totalFound` becomes 0.
 *
 * @returns The saved state, or `null` when the file is missing, unreadable,
 *          not valid JSON, or does not contain a JSON object.
 */
function loadState(): RunState | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(STATE_FILE, "utf-8"));
  } catch {
    // No usable checkpoint — the caller starts from scratch.
    return null;
  }

  if (typeof parsed !== "object" || parsed === null) return null;

  const { processedIds, totalFound } = parsed as Partial<Record<keyof RunState, unknown>>;
  return {
    processedIds: Array.isArray(processedIds)
      ? processedIds.filter((id): id is string => typeof id === "string")
      : [],
    totalFound: typeof totalFound === "number" && Number.isFinite(totalFound) ? totalFound : 0,
  };
}
|
||||
|
||||
/**
 * Writes the resume checkpoint to the state file as pretty-printed JSON,
 * replacing whatever was there before.
 *
 * @param processedIds Ids of all diseases attempted so far.
 * @param totalFound   Running count of diseases for which an image was found.
 */
function saveState(processedIds: string[], totalFound: number) {
  const snapshot = { processedIds, totalFound };
  const serialized = JSON.stringify(snapshot, null, 2);
  writeFileSync(STATE_FILE, serialized, "utf-8");
}
|
||||
|
||||
// ─── DuckDuckGo Search ───────────────────────────────────────────────────────
|
||||
|
||||
/** Matches image URLs that mention one of the known stock-photo sites. */
const STOCK_IMAGE_HOSTS = /(dreamstime|shutterstock|alamy|istock|123rf)/i;

/**
 * Runs one DuckDuckGo image search and picks a single image URL.
 *
 * The first result whose image URL does not match a known stock-photo site
 * wins; if every result is a stock image, the first result's image (or its
 * thumbnail) is used instead.
 *
 * @param query Free-text search query.
 * @returns The chosen image URL, or `null` when there are no results or the
 *          search failed. Failures are logged as warnings, never thrown.
 */
async function searchImage(query: string): Promise<string | null> {
  try {
    const results = await imageSearch({ query, safe: true, iterations: 1, retries: 2 });
    if (!results || results.length === 0) return null;

    // Prefer non-stock images
    const preferred = results.find((r) => r.image && !STOCK_IMAGE_HOSTS.test(r.image));
    if (preferred) return preferred.image;

    return results[0].image || results[0].thumbnail || null;
  } catch (err: unknown) {
    // DuckDuckGo may block or time out. Keep the best-effort contract (return
    // null) but say so: a silent null is indistinguishable from "no results",
    // which would hide the fact that the run is being rate-limited.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`\n  ⚠ DuckDuckGo search failed for "${query}": ${reason}`);
    return null;
  }
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/** Writes the queued image URLs to the diseases table in a single write batch; no-op when the queue is empty. */
async function flushImageUpdates(
  client: ReturnType<typeof createClient>,
  queued: ReadonlyArray<{ id: string; url: string }>,
): Promise<void> {
  if (queued.length === 0) return;
  await client.batch(
    queued.map((u) => ({
      sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(` → Flushed ${queued.length} to DB`);
}

/**
 * Script entry point: finds an image for every disease that has none.
 *
 * Steps:
 *  1. Load the resume checkpoint and select all diseases without an image.
 *  2. Order them by severity and drop the ones already attempted.
 *  3. For each remaining disease try up to three search phrasings, pausing
 *     POLITE_DELAY after every request.
 *  4. Every DB_FLUSH_BATCH diseases (and once at the end) write the found URLs
 *     to the DB and only then persist the checkpoint, so the state file never
 *     claims a disease is done while its URL is still only in memory.
 *  5. Print a summary and, if anything is still missing, write a markdown
 *     report next to this script.
 *
 * Both DB handles are released even when the run fails part-way.
 *
 * @throws Error when DATABASE_URL is not set, or when a DB operation fails.
 */
async function main() {
  console.log("\n🦆 DuckDuckGo Disease Image Filler\n");

  const db = getDb();
  try {
    // Load state
    const state = loadState();
    const processedIds: string[] = [...(state?.processedIds ?? [])];
    const processedSet = new Set(processedIds);
    const totalFoundPrev = state?.totalFound ?? 0;

    // Get all diseases that still need images
    const allDiseases = (await db
      .select({
        id: diseases.id,
        name: diseases.name,
        scientificName: diseases.scientificName,
        severity: diseases.severity,
        plantId: diseases.plantId,
      })
      .from(diseases)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all()) as DiseaseRow[];

    console.log(`📋 ${allDiseases.length} diseases need images\n`);

    if (allDiseases.length === 0) {
      console.log("✅ All diseases already have images!\n");
      return;
    }

    // Sort by severity: critical > high > moderate > low (unknown values last)
    const severityOrder: Record<string, number> = { critical: 0, high: 1, moderate: 2, low: 3 };
    allDiseases.sort((a, b) => (severityOrder[a.severity] ?? 99) - (severityOrder[b.severity] ?? 99));

    // Filter out already-processed
    const pending = allDiseases.filter((d) => !processedSet.has(d.id));

    // Single pass over the pending rows instead of one filter per severity.
    const remaining = new Map<string, number>();
    for (const d of pending) {
      remaining.set(d.severity, (remaining.get(d.severity) ?? 0) + 1);
    }
    console.log(
      `📊 Remaining: critical=${remaining.get("critical") ?? 0}, ` +
        `high=${remaining.get("high") ?? 0}, ` +
        `moderate=${remaining.get("moderate") ?? 0}, ` +
        `low=${remaining.get("low") ?? 0}\n`,
    );

    if (pending.length === 0) {
      console.log("✅ All remaining diseases already attempted\n");
      return;
    }

    // Fail with a clear message instead of handing `undefined` to the client.
    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error("DATABASE_URL is not set (expected in the environment or in .env.development)");
    }
    const raw = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    let found = totalFoundPrev;
    let updates: Array<{ id: string; url: string }> = [];

    // Order matters: the DB write must succeed before the checkpoint records
    // the corresponding ids as processed.
    const checkpoint = async (): Promise<void> => {
      await flushImageUpdates(raw, updates);
      updates = [];
      saveState(processedIds, found);
    };

    try {
      for (const [i, d] of pending.entries()) {
        const sev = d.severity.padEnd(8);

        // Build search query — "[disease] on [plant]" phrasing for better specificity
        const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
        const queries = [
          `${d.name} on ${plantName} plant disease`,
          `${d.scientificName || d.name} on ${plantName} disease`,
          `${d.name} plant disease ${plantName}`,
        ];

        process.stdout.write(
          ` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
        );

        // Try queries in order until we get a result
        let url: string | null = null;
        for (const q of queries) {
          url = await searchImage(q);
          // Be polite — pause after every request, including fallback queries,
          // so the overall rate stays at about one request per second.
          await sleep(POLITE_DELAY);
          if (url) break;
        }

        processedIds.push(d.id);
        if (url) {
          updates.push({ id: d.id, url });
          found++;
          console.log("✅");
        } else {
          console.log("❌");
        }

        // Flush to DB and save state in batches
        if ((i + 1) % DB_FLUSH_BATCH === 0) {
          await checkpoint();
        }
      }

      // Final flush + state save
      await checkpoint();
    } finally {
      raw.close();
    }

    // Final report
    const finalList = await db
      .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
      .from(diseases)
      .all();
    const w = finalList.filter((d) => d.imageUrl);
    const wo = finalList.filter((d) => !d.imageUrl);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`🦆 DUCKDUCKGO SEARCH COMPLETE`);
    console.log(`${"═".repeat(50)}`);
    console.log(` Processed: ${pending.length}`);
    console.log(` Found this run: ${found - totalFoundPrev}`);
    console.log(` Total with images: ${w.length}/${finalList.length}`);
    console.log(` Still missing: ${wo.length}`);

    if (wo.length > 0) {
      const reportPath = resolve(__dirname, ".ddg-image-review-needed.md");
      const report = [
        "# Disease Images - Still Missing (DDG)",
        "",
        `Generated: ${new Date().toISOString()}`,
        "",
        "## Summary",
        "",
        `- Total: ${finalList.length}`,
        `- With images: ${w.length}`,
        `- Still missing: ${wo.length}`,
        "",
        "## Missing Diseases",
        "",
        ...wo.map((d) => `- ${d.name} (\`${d.id}\`)`),
        "", // trailing newline after the last list item
      ].join("\n");
      writeFileSync(reportPath, report, "utf-8");
      console.log(`\n📝 Missing report: ${reportPath}`);
    } else {
      console.log("\n✅ ALL diseases now have images!");
    }

    console.log();
  } finally {
    closeDb();
  }
}
|
||||
|
||||
/** Last-resort handler: prints the failure and ends the process with exit code 1. */
function reportFatal(err: unknown): never {
  console.error("\n❌ Fatal:", err);
  process.exit(1);
}

// Kick off the script; any rejection from main() is routed to the handler above.
main().catch(reportFatal);
|
||||
Reference in New Issue
Block a user