Files
plant-disease-id/apps/web/scripts/fill-brave-images-v2.ts
2026-06-06 15:09:46 -04:00

415 lines
13 KiB
JavaScript

#!/usr/bin/env node
/**
* fill-brave-images-v2.ts — Brave Image Search for remaining disease images.
*
* Prioritizes by severity (critical → high → moderate → low).
* Runs at 1 request/sec (Brave free tier rate limit).
* Updates Turso DB directly with found images.
* When the current key is exhausted, rotates to the next supplied key.
* Falls back to duckduckgo-images-api when all keys are spent.
*
* Usage:
* cd apps/web && npx tsx scripts/fill-brave-images-v2.ts
*
* Pass additional API keys as args:
* npx tsx scripts/fill-brave-images-v2.ts KEY2 KEY3
*/
import { readFileSync, writeFileSync } from "fs";
import { resolve } from "path";
// Load env

/**
 * Parses one line of a dotenv-style file.
 *
 * Blank lines, `#` comments and lines without a non-empty key before the first
 * `=` yield `null`. Whitespace around the key and value is trimmed, and one
 * pair of matching surrounding quotes (`"…"` or `'…'`) is removed from the
 * value so that `KEY="value"` does not leak the quotes into `process.env`.
 *
 * @param line A single raw line from the env file.
 * @returns `[key, value]`, or `null` when the line carries no assignment.
 */
function parseEnvLine(line: string): [string, string] | null {
  const trimmed = line.trim();
  if (!trimmed || trimmed.startsWith("#")) return null;

  const eqIdx = trimmed.indexOf("=");
  if (eqIdx <= 0) return null;

  const key = trimmed.slice(0, eqIdx).trim();
  let value = trimmed.slice(eqIdx + 1).trim();

  const quote = value[0];
  if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
    value = value.slice(1, -1);
  }
  return [key, value];
}

/**
 * Copies assignments from an env file into `process.env` without overriding
 * variables that already hold a non-empty value.
 *
 * @param relativePath Path of the env file, relative to this script's directory.
 * @param onlyKeys When given, only these variable names are imported.
 */
function loadEnvFile(relativePath: string, onlyKeys?: readonly string[]): void {
  let content: string;
  try {
    content = readFileSync(resolve(__dirname, relativePath), "utf-8");
  } catch {
    // Best-effort: a missing or unreadable env file is not an error.
    return;
  }

  for (const line of content.split("\n")) {
    const entry = parseEnvLine(line);
    if (!entry) continue;
    const [key, value] = entry;
    if (onlyKeys && !onlyKeys.includes(key)) continue;
    if (!process.env[key]) process.env[key] = value;
  }
}

loadEnvFile("../.env.development");
// Also try .env.local for BRAVE_API_KEY
loadEnvFile("../.env.local", ["BRAVE_API_KEY"]);
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
/** Columns that main() selects for each disease row still lacking an image. */
interface DiseaseRow {
// Row identifier; used in the WHERE clause of the image_url UPDATE and in the progress file.
id: string;
// Common name; first term of the search query and shown in the progress line.
name: string;
// Scientific name; included in the search query.
scientificName: string;
// Severity label; main() orders work by critical, high, moderate, low.
severity: string;
// Plant identifier; hyphens are replaced with spaces and words capitalised to form a query term.
plantId: string;
}
// ─── Config ──────────────────────────────────────────────────────────────────
// Pause applied after each lookup while Brave is in use; slightly over one second.
const BRAVE_DELAY = 1100; // ms between calls (1 req/sec)
// Number of found image URLs buffered before they are written to the DB in one batch.
const DB_FLUSH_BATCH = 50;
// Call count at which main() stops using the current key and moves to the next one.
const MAX_PER_KEY = 1800; // Leave 200 buffer of the 2000/mo limit
// Progress file written by saveState() and read back by loadState().
const STATE_FILE = resolve(__dirname, ".brave-progress.json");
// Mutable run state shared by main(), braveImageSearch() and saveState().
// Index into braveKeys of the key currently in use.
let currentKeyIndex = 0;
// Candidate API keys; filled in by main() from BRAVE_API_KEY and the CLI arguments.
let braveKeys: string[] = [];
// Brave responses with an OK status counted against the current key.
let callsThisKey = 0;
// Found-image total restored from the progress file and written back by saveState().
let totalFound = 0;
// totalSkipped tracking removed — not needed for v2
// ─── State persistence ───────────────────────────────────────────────────────
/** Shape of the JSON progress file that lets a later run resume where this one stopped. */
interface RunState {
// Ids of diseases already attempted, whether or not an image was found.
processedIds: string[];
// Index into the key list of the key in use when the state was saved.
currentKeyIndex: number;
// Calls counted against that key when the state was saved.
callsThisKey: number;
// Value of the totalFound counter when the state was saved.
totalFound: number;
}
/**
 * Reads the progress of a previous run from STATE_FILE.
 *
 * The file content is validated before it is trusted: a truncated or
 * hand-edited file must not reach the caller typed as a RunState, because the
 * caller reads `processedIds.length` and restores the counters from it.
 *
 * @returns The saved state, or `null` when there is no readable progress file
 *   or its content does not have the expected shape (a warning is printed in
 *   the latter case).
 */
function loadState(): RunState | null {
  let raw: string;
  try {
    raw = readFileSync(STATE_FILE, "utf-8");
  } catch {
    // No readable progress file: treat this as a fresh run.
    return null;
  }

  const isCount = (value: unknown): value is number =>
    typeof value === "number" && Number.isInteger(value) && value >= 0;

  try {
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed === "object" && parsed !== null) {
      const saved = parsed as Partial<Record<keyof RunState, unknown>>;
      const ids = saved.processedIds;
      const keyIndex = saved.currentKeyIndex;
      const calls = saved.callsThisKey;
      const found = saved.totalFound;
      if (
        Array.isArray(ids) &&
        ids.every((id): id is string => typeof id === "string") &&
        isCount(keyIndex) &&
        isCount(calls) &&
        isCount(found)
      ) {
        return {
          processedIds: ids,
          currentKeyIndex: keyIndex,
          callsThisKey: calls,
          totalFound: found,
        };
      }
    }
  } catch {
    // Invalid JSON: reported by the warning below.
  }

  console.warn(`⚠️ Ignoring invalid progress file ${STATE_FILE}; starting fresh.`);
  return null;
}
/**
 * Writes the current progress to STATE_FILE as pretty-printed JSON.
 *
 * The snapshot combines the given ids with the module-level key index, call
 * counter and found counter as they stand at the time of the call.
 *
 * @param processedIds Ids of every disease attempted so far.
 */
function saveState(processedIds: string[]) {
  const snapshot = { processedIds, currentKeyIndex, callsThisKey, totalFound };
  const serialized = JSON.stringify(snapshot, null, 2);
  writeFileSync(STATE_FILE, serialized, "utf-8");
}
// ─── Brave API ───────────────────────────────────────────────────────────────
/**
 * Looks up one image for `query` through the Brave image search endpoint,
 * using the key at `braveKeys[currentKeyIndex]`.
 *
 * Network failures and 5xx responses are retried (up to three attempts with a
 * two-second pause in between) so that a transient outage does not make the
 * caller record the item as "no image found".
 *
 * @param query Free-text search terms.
 * @returns An image URL; the sentinel string `"RATE_LIMITED"` when the API
 *   answers 429 for the current key; or `null` when there is no key, the
 *   request fails, or the search has no results.
 */
async function braveImageSearch(query: string): Promise<string | null> {
  const key = braveKeys[currentKeyIndex];
  if (!key) return null;

  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", query);
  url.searchParams.set("count", "3");

  const stockSites = /(dreamstime|shutterstock|alamy|istock|123rf)/i;
  const maxAttempts = 3;
  const retryDelayMs = 2000;
  const pause = () => new Promise<void>((done) => setTimeout(done, retryDelayMs));

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const isLastAttempt = attempt === maxAttempts;
    try {
      const res = await fetch(url.toString(), {
        headers: { "X-Subscription-Token": key, Accept: "application/json" },
      });
      if (res.status === 429) {
        console.log("\n [RATE LIMITED] Key " + (currentKeyIndex + 1) + " exhausted!");
        return "RATE_LIMITED";
      }
      if (res.status >= 500 && !isLastAttempt) {
        // Server-side failure: worth another try before giving up on this query.
        await pause();
        continue;
      }
      if (!res.ok) return null;

      // Only responses with an OK status are counted against the key.
      callsThisKey++;
      const data = (await res.json()) as {
        results?: Array<{ url: string; thumbnail?: { src?: string } }>;
      };
      const results = data?.results ?? [];
      const first = results[0];
      if (!first) return null;

      // Prefer non-stock images. Both the result URL and the thumbnail URL are
      // checked, because the thumbnail alone need not reveal the source site.
      // NOTE(review): thumbnails appear to be served from a Brave host — confirm.
      for (const r of results) {
        const src = r.thumbnail?.src ?? r.url;
        if (src && !stockSites.test(src) && !stockSites.test(r.url ?? "")) {
          return src;
        }
      }
      // Every result looks like a stock site: fall back to the top hit.
      return first.thumbnail?.src ?? first.url;
    } catch {
      // Network or JSON error: back off, but do not wait after the final attempt.
      if (!isLastAttempt) await pause();
    }
  }
  return null;
}
// ─── DuckDuckGo fallback ────────────────────────────────────────────────────
/**
 * Looks up one image for `query` through the optional `duckduckgo-images-api`
 * package.
 *
 * @param query Free-text search terms.
 * @returns The first result image that is not from a listed stock-photo site,
 *   otherwise the first result's image, or `null` when the package cannot be
 *   imported, the search yields nothing, or anything throws along the way.
 */
async function ddgFallbackSearch(query: string): Promise<string | null> {
  const stockSitePattern = /(dreamstime|shutterstock|alamy|istock|123rf)/i;
  try {
    // The package is optional; a failed import resolves to null instead of throwing.
    const ddgModule = await import("duckduckgo-images-api").catch(() => null);
    if (!ddgModule) return null;

    const hits = await ddgModule.image_search({ query, moderate: true });
    if (!hits || !(hits.length > 0)) return null;

    for (const hit of hits) {
      const isStock = stockSitePattern.test(hit.image);
      if (hit.image && !isStock) return hit.image;
    }
    // Only stock-site results: settle for the top hit.
    return hits[0].image || null;
  } catch {
    // Any failure in the fallback is treated as "no image".
    return null;
  }
}
// ─── Main ────────────────────────────────────────────────────────────────────

/** Sort rank of each known severity; lower ranks are searched first. */
const SEVERITY_RANK = new Map<string, number>([
  ["critical", 0],
  ["high", 1],
  ["moderate", 2],
  ["low", 3],
]);

/** Returns the sort rank of a severity label; unknown labels sort last. */
function severityRank(severity: string): number {
  // `??`, not `||`: "critical" has rank 0, which `||` would replace with 99
  // and thereby push the most important diseases to the end of the queue.
  return SEVERITY_RANK.get(severity) ?? 99;
}

/**
 * Makes sure `duckduckgo-images-api` can be imported, installing it with npm
 * into the app directory when it cannot.
 * NOTE(review): this installs a package at run time — confirm that is acceptable.
 */
async function ensureDdgInstalled(): Promise<void> {
  try {
    await import("duckduckgo-images-api");
  } catch {
    console.log(" Installing duckduckgo-images-api...");
    const { execSync } = await import("child_process");
    execSync("npm install duckduckgo-images-api", {
      cwd: resolve(__dirname, ".."),
      stdio: "pipe",
    });
    console.log(" Done.\n");
  }
}

/**
 * Script entry point: finds an image for every disease whose `image_url` is
 * empty and writes the results back to the database.
 *
 * Steps:
 *  1. Collect Brave keys (BRAVE_API_KEY first, then CLI arguments) and restore
 *     the progress of a previous run, if any.
 *  2. Load the diseases without an image, order them by severity and drop the
 *     ones already attempted.
 *  3. Query Brave per disease, rotating keys at MAX_PER_KEY calls or on a 429,
 *     and switch to the DuckDuckGo fallback once every key is spent.
 *  4. Checkpoint periodically: found URLs are flushed to the DB *before* the
 *     progress file is written, so the file never marks a disease as done
 *     while its URL is still only in memory.
 *  5. Print a summary and write a markdown report of diseases still missing.
 *
 * Exits the process with code 1 when no key is available; throws when
 * DATABASE_URL is not set.
 */
async function main(): Promise<void> {
  console.log("\n🔍 Brave Disease Image Filler v2\n");

  // Parse keys from args + env (deduplicated, env key first)
  const argsKeys = process.argv.slice(2).filter((a) => !a.startsWith("-"));
  const envKey = process.env.BRAVE_API_KEY;
  braveKeys = [...new Set([envKey, ...argsKeys].filter(Boolean) as string[])];
  if (braveKeys.length === 0) {
    console.log("❌ No Brave API keys found.");
    console.log(" Set BRAVE_API_KEY in .env.local or pass as argument.\n");
    process.exit(1);
  }
  console.log(`🔑 ${braveKeys.length} Brave API key(s) available\n`);

  // Load state
  const state = loadState();
  const processedIds: string[] = state && Array.isArray(state.processedIds) ? state.processedIds : [];
  if (state) {
    const savedIndex = state.currentKeyIndex;
    if (Number.isInteger(savedIndex) && savedIndex >= 0 && savedIndex < braveKeys.length) {
      currentKeyIndex = savedIndex;
      callsThisKey = Number.isFinite(state.callsThisKey) ? state.callsThisKey : 0;
    } else {
      // Fewer keys than last time: an out-of-range index would select no key
      // at all and every lookup would silently come back empty.
      console.log(
        `⚠️ Saved key index is out of range for the ${braveKeys.length} key(s) supplied; restarting from key 1.\n`,
      );
      currentKeyIndex = 0;
      callsThisKey = 0;
    }
    totalFound = Number.isFinite(state.totalFound) ? state.totalFound : 0;
    console.log(
      `📋 Resuming from previous run (${processedIds.length} processed, ${totalFound} found)\n`,
    );
  }

  // Get diseases from DB
  const db = getDb();
  const allDiseases = (await db
    .select({
      id: diseases.id,
      name: diseases.name,
      scientificName: diseases.scientificName,
      severity: diseases.severity,
      plantId: diseases.plantId,
    })
    .from(diseases)
    .where(sql`(image_url IS NULL OR image_url = '')`)
    .all()) as DiseaseRow[];
  console.log(`📋 ${allDiseases.length} diseases need images\n`);
  if (allDiseases.length === 0) {
    console.log("✅ All diseases already have images!\n");
    closeDb();
    return;
  }

  // Sort by severity priority
  allDiseases.sort((a, b) => severityRank(a.severity) - severityRank(b.severity));

  // Filter out already-processed from state
  const processedSet = new Set(processedIds);
  const pending = allDiseases.filter((d) => !processedSet.has(d.id));
  const pendingBySeverity = new Map<string, number>();
  for (const d of pending) {
    pendingBySeverity.set(d.severity, (pendingBySeverity.get(d.severity) ?? 0) + 1);
  }
  const countOf = (severity: string): number => pendingBySeverity.get(severity) ?? 0;
  console.log(
    `📊 Prioritization: critical=${countOf("critical")}, high=${countOf("high")}, moderate=${countOf("moderate")}, low=${countOf("low")}\n`,
  );
  if (pending.length === 0) {
    console.log("✅ All remaining diseases already attempted\n");
    closeDb();
    return;
  }

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set; cannot write image URLs to the database.");
  }
  const raw = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  const STATE_SAVE_INTERVAL = 50;
  let updates: Array<{ id: string; url: string }> = [];
  let foundThisRun = 0;
  let ddgMode = false;

  // Flush buffered URLs to the DB, then persist progress. The order matters:
  // saving first could mark a disease as processed although a crash before
  // the flush would lose its URL for good.
  const checkpoint = async (): Promise<void> => {
    if (updates.length > 0) {
      await raw.batch(
        updates.map((u) => ({
          sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
          args: [u.url, u.id],
        })),
        "write",
      );
      console.log(` → Flushed ${updates.length} to DB`);
      updates = [];
    }
    saveState(processedIds);
  };

  try {
    for (const [i, d] of pending.entries()) {
      // Check if current key needs rotating
      if (!ddgMode && callsThisKey >= MAX_PER_KEY) {
        if (currentKeyIndex < braveKeys.length - 1) {
          currentKeyIndex++;
          callsThisKey = 0;
          console.log(`\n 🔄 Rotating to key ${currentKeyIndex + 1}/${braveKeys.length}\n`);
        } else {
          console.log(
            `\n ⚠️ All ${braveKeys.length} Brave keys exhausted. Switching to DuckDuckGo fallback.\n`,
          );
          ddgMode = true;
          await ensureDdgInstalled();
        }
      }

      // Build search query
      const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
      const query = `${d.name} ${d.scientificName} ${plantName} plant disease`;
      const sev = d.severity.padEnd(8);
      process.stdout.write(
        ` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 40).padEnd(42)} `,
      );

      let url: string | null = null;
      if (ddgMode) {
        url = await ddgFallbackSearch(query);
        if (!url) {
          // Try a simpler query
          url = await ddgFallbackSearch(`${d.name} disease`);
        }
      } else {
        url = await braveImageSearch(query);
        // Keep rotating while keys answer 429. A single retry is not enough:
        // the next key may be exhausted too, and the "RATE_LIMITED" sentinel
        // is a truthy string that would otherwise be stored as an image URL.
        while (url === "RATE_LIMITED") {
          if (currentKeyIndex < braveKeys.length - 1) {
            currentKeyIndex++;
            callsThisKey = 0;
            console.log("\n 🔄 Rotating key...");
            url = await braveImageSearch(query);
          } else {
            console.log("\n ⚠️ All keys exhausted mid-batch!");
            ddgMode = true;
            await ensureDdgInstalled();
            url = await ddgFallbackSearch(query);
          }
        }
      }

      // Mark as attempted even if not found
      processedIds.push(d.id);
      if (url) {
        updates.push({ id: d.id, url });
        foundThisRun++;
        totalFound++; // saveState() persists this counter
        console.log("✅");
      } else {
        console.log("❌");
      }

      if (updates.length >= DB_FLUSH_BATCH || (i + 1) % STATE_SAVE_INTERVAL === 0) {
        await checkpoint();
      }

      // Rate limit (even for DDG to be polite)
      await new Promise((r) => setTimeout(r, ddgMode ? 500 : BRAVE_DELAY));
    }

    // Final flush + state save
    await checkpoint();
  } finally {
    raw.close();
  }

  // Final report
  const finalList = await db
    .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
    .from(diseases)
    .all();
  const w = finalList.filter((d) => d.imageUrl);
  const wo = finalList.filter((d) => !d.imageUrl);
  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 BRAVE IMAGE SEARCH COMPLETE`);
  console.log(`${"═".repeat(50)}`);
  console.log(` Processed: ${pending.length}`);
  console.log(` Found this run: ${foundThisRun}`);
  console.log(` Total with images: ${w.length}/${finalList.length}`);
  console.log(` Still missing: ${wo.length}`);
  console.log(` Brave keys used: ${currentKeyIndex + 1}`);
  console.log(` Calls on current key: ${callsThisKey}`);
  console.log(` DuckDuckGo mode: ${ddgMode}`);
  if (wo.length > 0) {
    const rp = resolve(__dirname, ".disease-image-review-needed.md");
    let report = "# Disease Images - Still Missing\n\n";
    report += `Generated: ${new Date().toISOString()}\n\n`;
    report += `## Summary\n\n`;
    report += `- Total: ${finalList.length}\n`;
    report += `- With images: ${w.length}\n`;
    report += `- Still missing: ${wo.length}\n\n`;
    report += `## Missing Diseases\n\n`;
    for (const d of wo) {
      report += `- ${d.name} (\`${d.id}\`)\n`;
    }
    writeFileSync(rp, report, "utf-8");
    console.log(`\n📝 Report: ${rp}`);
  } else {
    console.log("\n✅ ALL diseases now have images!");
  }
  closeDb();
  console.log("\n");
}
// Run the script; on rejection, print the failure and exit with a non-zero status.
void main().then(undefined, (failure) => {
  console.error("\n❌", failure);
  process.exit(1);
});