beepboop
This commit is contained in:
414
apps/web/scripts/fill-brave-images-v2.ts
Normal file
414
apps/web/scripts/fill-brave-images-v2.ts
Normal file
@@ -0,0 +1,414 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-brave-images-v2.ts — Brave Image Search for remaining disease images.
|
||||
*
|
||||
* Prioritizes by severity (critical → high → moderate → low).
|
||||
* Runs at 1 request/sec (Brave free tier rate limit).
|
||||
* Updates Turso DB directly with found images.
|
||||
* When the current key is exhausted, rotates to the next available key (env key first, then CLI args).
|
||||
* Falls back to duckduckgo-images-api when all keys are spent.
|
||||
*
|
||||
* Usage:
|
||||
* cd apps/web && npx tsx scripts/fill-brave-images-v2.ts
|
||||
*
|
||||
* Pass additional API keys as args:
|
||||
* npx tsx scripts/fill-brave-images-v2.ts KEY2 KEY3
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Load env: copy KEY=VALUE pairs from .env.development into process.env.
// Variables that already hold a non-empty value are left untouched.
const envPath = resolve(__dirname, "../.env.development");
try {
  const entries = readFileSync(envPath, "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    // Skip blank lines and `#` comments.
    .filter((entry) => entry !== "" && !entry.startsWith("#"));

  for (const entry of entries) {
    const separator = entry.indexOf("=");
    // Need at least one character of name before the "=".
    if (separator <= 0) continue;

    const name = entry.slice(0, separator).trim();
    if (process.env[name]) continue;

    process.env[name] = entry.slice(separator + 1).trim();
  }
} catch {
  // Best-effort: a missing or unreadable file is ignored.
}
|
||||
|
||||
// Also try .env.local, but only for BRAVE_API_KEY.
try {
  const prefix = "BRAVE_API_KEY=";
  const localLines = readFileSync(resolve(__dirname, "../.env.local"), "utf-8").split("\n");

  for (const rawLine of localLines) {
    const entry = rawLine.trim();
    if (!entry.startsWith(prefix)) continue;
    // A key that is already set (e.g. from the shell) wins over the file.
    if (process.env.BRAVE_API_KEY) continue;

    process.env.BRAVE_API_KEY = entry.slice(prefix.length).trim();
  }
} catch {
  // Best-effort: a missing or unreadable file is ignored.
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
/** Columns selected from the `diseases` table for each row that still lacks an image. */
interface DiseaseRow {
  /** Primary key; used in the `WHERE id = ?` clause when the image URL is written back. */
  id: string;
  /** Display name; part of the image search query and the progress output. */
  name: string;
  /** Scientific name; part of the image search query. */
  scientificName: string;
  /** Severity label; `critical`, `high`, `moderate` and `low` are the values this script ranks. */
  severity: string;
  /** Plant identifier; hyphens are replaced with spaces to build the search query. */
  plantId: string;
}
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Pause between Brave calls in ms (1 req/sec). */
const BRAVE_DELAY = 1100;

/** Number of pending image updates that triggers a DB write. */
const DB_FLUSH_BATCH = 50;

/** Calls allowed per key before rotating; leaves a 200-call buffer of the 2000/mo limit. */
const MAX_PER_KEY = 1800;

/** Progress file, stored next to this script, that lets a later run resume. */
const STATE_FILE = resolve(__dirname, ".brave-progress.json");

// Mutable run state shared by the functions in this script.

/** All known Brave API keys. */
let braveKeys: string[] = [];

/** Index into `braveKeys` of the key currently in use. */
let currentKeyIndex = 0;

/** Calls counted against the current key. */
let callsThisKey = 0;

/** Running count of diseases for which an image was found. */
let totalFound = 0;
|
||||
// totalSkipped tracking removed — not needed for v2
|
||||
|
||||
// ─── State persistence ───────────────────────────────────────────────────────
|
||||
|
||||
/** Snapshot written to the progress file so an interrupted run can resume. */
interface RunState {
  /** Ids of diseases already attempted, whether or not an image was found. */
  processedIds: string[];
  /** Index of the Brave API key that was in use when the snapshot was taken. */
  currentKeyIndex: number;
  /** Calls counted against that key so far. */
  callsThisKey: number;
  /** Count of images found, as tracked by the script. */
  totalFound: number;
}
|
||||
|
||||
/**
 * Reads the progress snapshot written by a previous run.
 *
 * The parsed JSON is shape-checked before it is trusted: a truncated or
 * hand-edited file would otherwise surface later as a crash on
 * `state.processedIds` or as a nonsensical key index.
 *
 * @returns The saved state, or `null` when the file is missing, unreadable,
 *          not valid JSON, or does not have the expected shape.
 */
function loadState(): RunState | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(STATE_FILE, "utf-8"));
  } catch {
    // No usable state file: start from scratch.
    return null;
  }

  const isCount = (value: unknown): value is number =>
    typeof value === "number" && Number.isInteger(value) && value >= 0;

  if (typeof parsed === "object" && parsed !== null) {
    const candidate = parsed as Partial<Record<keyof RunState, unknown>>;
    const ids = candidate.processedIds;
    const keyIndex = candidate.currentKeyIndex;
    const calls = candidate.callsThisKey;
    const found = candidate.totalFound;

    if (
      Array.isArray(ids) &&
      ids.every((id) => typeof id === "string") &&
      isCount(keyIndex) &&
      isCount(calls) &&
      isCount(found)
    ) {
      return {
        processedIds: ids as string[],
        currentKeyIndex: keyIndex,
        callsThisKey: calls,
        totalFound: found,
      };
    }
  }

  console.warn(`⚠️  Ignoring malformed state file: ${STATE_FILE}`);
  return null;
}
|
||||
|
||||
/**
 * Writes the current progress to the state file as pretty-printed JSON.
 *
 * @param processedIds Ids of every disease attempted so far.
 */
function saveState(processedIds: string[]) {
  // Key order matches the RunState interface so the file layout stays stable.
  const snapshot = {
    processedIds,
    currentKeyIndex,
    callsThisKey,
    totalFound,
  };
  const serialized = JSON.stringify(snapshot, null, 2);
  writeFileSync(STATE_FILE, serialized, "utf-8");
}
|
||||
|
||||
// ─── Brave API ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Looks up one image for `query` through the Brave image search endpoint,
 * using the API key at `braveKeys[currentKeyIndex]`.
 *
 * Network errors are retried up to three times with a 2 s pause. HTTP
 * failures are not retried, but they are logged so an invalid key or a
 * server error is visible instead of looking like "no image found".
 *
 * @param query Free-text search query.
 * @returns An image URL (thumbnail preferred, non-stock sites preferred),
 *          the sentinel string `"RATE_LIMITED"` on HTTP 429, or `null` when
 *          there is no key, no result, or the request failed.
 */
async function braveImageSearch(query: string): Promise<string | null> {
  const key = braveKeys[currentKeyIndex];
  if (!key) return null;

  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", query);
  url.searchParams.set("count", "3");

  // Hosts of watermarked stock photos; used only as a preference filter.
  const stockSites = /(dreamstime|shutterstock|alamy|istock|123rf)/i;
  const maxAttempts = 3;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const res = await fetch(url.toString(), {
        headers: { "X-Subscription-Token": key, Accept: "application/json" },
      });

      if (res.status === 429) {
        console.log("\n  [RATE LIMITED] Key " + (currentKeyIndex + 1) + " exhausted!");
        return "RATE_LIMITED";
      }
      if (!res.ok) {
        // Report the status so failures such as a rejected key are not silent.
        console.warn(
          `\n  [BRAVE] HTTP ${res.status} ${res.statusText} (key ${currentKeyIndex + 1})`,
        );
        return null;
      }

      callsThisKey++;
      const data = (await res.json()) as {
        results?: Array<{ url: string; thumbnail?: { src?: string } }>;
      };
      const sources = (data?.results ?? []).map((r) => r.thumbnail?.src ?? r.url);
      if (sources.length === 0) return null;

      // Prefer non-stock images; otherwise fall back to the first result.
      const preferred = sources.find((src) => src && !stockSites.test(src));
      return (preferred ?? sources[0]) || null;
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      if (attempt === maxAttempts) {
        console.warn(`\n  [BRAVE] Request failed after ${maxAttempts} attempts: ${reason}`);
        break;
      }
      // Transient failure: back off briefly, then retry.
      await new Promise((r) => setTimeout(r, 2000));
    }
  }
  return null;
}
|
||||
|
||||
// ─── DuckDuckGo fallback ────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fallback image lookup through the optional `duckduckgo-images-api` package.
 *
 * @param query Free-text search query.
 * @returns The first non-stock image URL, else the first result's image, or
 *          `null` when the package cannot be imported, the search throws, or
 *          nothing is returned.
 */
async function ddgFallbackSearch(query: string): Promise<string | null> {
  try {
    // The package is optional; a failed import simply disables the fallback.
    const ddgModule = await import("duckduckgo-images-api").catch(() => null);
    if (!ddgModule) return null;

    const hits: Array<{ image?: string }> | null | undefined = await ddgModule.image_search({
      query,
      moderate: true,
    });
    if (!hits || hits.length === 0) return null;

    const nonStock = hits.find(
      (hit) => hit.image && !/(dreamstime|shutterstock|alamy|istock|123rf)/i.test(hit.image),
    );
    if (nonStock?.image) return nonStock.image;

    return hits[0]?.image || null;
  } catch {
    // Any failure during import or search is treated as "no result".
    return null;
  }
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Sort rank per severity label; lower ranks are processed first. */
const SEVERITY_RANK: ReadonlyMap<string, number> = new Map([
  ["critical", 0],
  ["high", 1],
  ["moderate", 2],
  ["low", 3],
]);

/** Returns the sort rank for a severity label; unknown labels sort last (99). */
function severityRank(severity: string): number {
  // `??` rather than `||`: rank 0 ("critical") is a valid value, not a missing one.
  return SEVERITY_RANK.get(severity) ?? 99;
}

/** Writes the queued image URLs to the `diseases` table in one batch; no-op when the queue is empty. */
async function flushImageUpdates(
  client: ReturnType<typeof createClient>,
  queued: ReadonlyArray<{ id: string; url: string }>,
): Promise<void> {
  if (queued.length === 0) return;
  await client.batch(
    queued.map((u) => ({
      sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(`  → Flushed ${queued.length} to DB`);
}

/** Ensures `duckduckgo-images-api` can be imported, installing it with npm when it cannot. */
async function ensureDdgPackage(): Promise<void> {
  try {
    await import("duckduckgo-images-api");
  } catch {
    console.log("  Installing duckduckgo-images-api...");
    const { execSync } = await import("child_process");
    execSync("npm install duckduckgo-images-api", {
      cwd: resolve(__dirname, ".."),
      stdio: "pipe",
    });
    console.log("  Done.\n");
  }
}

/**
 * Script entry point.
 *
 * Collects Brave API keys (env first, then CLI args), resumes from the saved
 * progress file when present, loads every disease without an image, and
 * searches for one image per disease in severity order. Brave is used until
 * every key is exhausted, then the DuckDuckGo fallback takes over. Found URLs
 * are written to the DB in batches, progress is checkpointed, and a final
 * summary is printed (plus a markdown report when images are still missing).
 *
 * Exits the process with code 1 when no API key is available.
 *
 * @throws Error when `DATABASE_URL` is not set, and any error raised by the
 *         DB, the filesystem, or the fallback package installation.
 */
async function main() {
  console.log("\n🔍 Brave Disease Image Filler v2\n");

  // Parse keys from args + env
  const argsKeys = process.argv.slice(2).filter((a) => !a.startsWith("-"));
  const candidateKeys = [process.env.BRAVE_API_KEY, ...argsKeys].filter(
    (k): k is string => Boolean(k),
  );
  braveKeys = [...new Set(candidateKeys)]; // dedup

  if (braveKeys.length === 0) {
    console.log("❌ No Brave API keys found.");
    console.log("   Set BRAVE_API_KEY in .env.local or pass as argument.\n");
    process.exit(1);
  }
  console.log(`🔑 ${braveKeys.length} Brave API key(s) available\n`);

  // Load state
  const state = loadState();
  if (state) {
    currentKeyIndex = state.currentKeyIndex;
    callsThisKey = state.callsThisKey;
    totalFound = state.totalFound;

    // A previous run may have used more keys than were supplied this time.
    // An out-of-range index would make every Brave lookup return null.
    if (
      !Number.isInteger(currentKeyIndex) ||
      currentKeyIndex < 0 ||
      currentKeyIndex >= braveKeys.length
    ) {
      console.log(
        `⚠️  Saved key index ${state.currentKeyIndex} is out of range for ${braveKeys.length} key(s); restarting from key 1.\n`,
      );
      currentKeyIndex = 0;
      callsThisKey = 0;
    }

    console.log(
      `📋 Resuming from previous run (${state.processedIds.length} processed, ${totalFound} found)\n`,
    );
  }

  // Get diseases from DB
  const db = getDb();
  const allDiseases = (await db
    .select({
      id: diseases.id,
      name: diseases.name,
      scientificName: diseases.scientificName,
      severity: diseases.severity,
      plantId: diseases.plantId,
    })
    .from(diseases)
    .where(sql`(image_url IS NULL OR image_url = '')`)
    .all()) as DiseaseRow[];

  console.log(`📋 ${allDiseases.length} diseases need images\n`);

  if (allDiseases.length === 0) {
    console.log("✅ All diseases already have images!\n");
    closeDb();
    return;
  }

  // Sort by severity priority (critical first)
  allDiseases.sort((a, b) => severityRank(a.severity) - severityRank(b.severity));

  // Filter out already-processed from state
  const processedSet = new Set(state?.processedIds ?? []);
  const pending = allDiseases.filter((d) => !processedSet.has(d.id));

  const pendingBySeverity = new Map<string, number>();
  for (const d of pending) {
    pendingBySeverity.set(d.severity, (pendingBySeverity.get(d.severity) ?? 0) + 1);
  }
  const countOf = (label: string): number => pendingBySeverity.get(label) ?? 0;
  console.log(
    `📊 Prioritization: critical=${countOf("critical")}, high=${countOf("high")}, moderate=${countOf("moderate")}, low=${countOf("low")}\n`,
  );

  if (pending.length === 0) {
    console.log("✅ All remaining diseases already attempted\n");
    closeDb();
    return;
  }

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    closeDb();
    throw new Error("DATABASE_URL is not set");
  }
  const raw = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  let updates: Array<{ id: string; url: string }> = [];
  const processedIds: string[] = state?.processedIds ?? [];
  const foundBeforeRun = totalFound;
  let ddgMode = false;

  try {
    for (let i = 0; i < pending.length; i++) {
      const d = pending[i];

      // Check if current key needs rotating
      if (!ddgMode && callsThisKey >= MAX_PER_KEY) {
        if (currentKeyIndex < braveKeys.length - 1) {
          currentKeyIndex++;
          callsThisKey = 0;
          console.log(`\n  🔄 Rotating to key ${currentKeyIndex + 1}/${braveKeys.length}\n`);
        } else {
          console.log(
            `\n  ⚠️ All ${braveKeys.length} Brave keys exhausted. Switching to DuckDuckGo fallback.\n`,
          );
          ddgMode = true;
          await ensureDdgPackage();
        }
      }

      // Build search query; empty parts are dropped so they add no noise.
      const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
      const query = [d.name, d.scientificName, plantName, "plant disease"]
        .filter(Boolean)
        .join(" ");
      const sev = d.severity.padEnd(8);

      process.stdout.write(
        `  [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 40).padEnd(42)} `,
      );

      let url: string | null = null;

      // Try Brave, rotating through the remaining keys on every 429 so the
      // "RATE_LIMITED" sentinel can never be mistaken for an image URL.
      while (!ddgMode) {
        const result = await braveImageSearch(query);
        if (result !== "RATE_LIMITED") {
          url = result;
          break;
        }
        if (currentKeyIndex < braveKeys.length - 1) {
          currentKeyIndex++;
          callsThisKey = 0;
          console.log("\n  🔄 Rotating key...");
        } else {
          console.log("\n  ⚠️ All keys exhausted mid-batch!");
          ddgMode = true;
          await ensureDdgPackage();
        }
      }

      if (ddgMode) {
        url = await ddgFallbackSearch(query);
        if (!url) {
          // Try a simpler query
          url = await ddgFallbackSearch(`${d.name} disease`);
        }
      }

      // Mark as attempted even if not found
      processedIds.push(d.id);
      if (url) {
        updates.push({ id: d.id, url });
        totalFound++;
        console.log("✅");
      } else {
        console.log("❌");
      }

      // Checkpoint: flush to the DB first, then save state, so the state file
      // never lists an id whose image is still only held in memory.
      if (updates.length >= DB_FLUSH_BATCH || (i + 1) % DB_FLUSH_BATCH === 0) {
        await flushImageUpdates(raw, updates);
        updates = [];
        saveState(processedIds);
      }

      // Rate limit (even for DDG to be polite)
      await new Promise((r) => setTimeout(r, ddgMode ? 500 : BRAVE_DELAY));
    }

    // Final flush
    await flushImageUpdates(raw, updates);
    updates = [];
    saveState(processedIds);
  } finally {
    raw.close();
  }

  // Final report
  const finalList = await db
    .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
    .from(diseases)
    .all();
  const w = finalList.filter((d) => d.imageUrl);
  const wo = finalList.filter((d) => !d.imageUrl);

  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 BRAVE IMAGE SEARCH COMPLETE`);
  console.log(`${"═".repeat(50)}`);
  console.log(`  Processed: ${pending.length}`);
  console.log(`  Found this run: ${totalFound - foundBeforeRun}`);
  console.log(`  Total with images: ${w.length}/${finalList.length}`);
  console.log(`  Still missing: ${wo.length}`);
  console.log(`  Brave keys used: ${currentKeyIndex + 1}`);
  console.log(`  Calls on current key: ${callsThisKey}`);
  console.log(`  DuckDuckGo mode: ${ddgMode}`);

  if (wo.length > 0) {
    const rp = resolve(__dirname, ".disease-image-review-needed.md");
    let report = "# Disease Images - Still Missing\n\n";
    report += `Generated: ${new Date().toISOString()}\n\n`;
    report += `## Summary\n\n`;
    report += `- Total: ${finalList.length}\n`;
    report += `- With images: ${w.length}\n`;
    report += `- Still missing: ${wo.length}\n\n`;
    report += `## Missing Diseases\n\n`;
    for (const d of wo) {
      report += `- ${d.name} (\`${d.id}\`)\n`;
    }
    writeFileSync(rp, report, "utf-8");
    console.log(`\n📝 Report: ${rp}`);
  } else {
    console.log("\n✅ ALL diseases now have images!");
  }

  closeDb();
  console.log("\n");
}
|
||||
|
||||
// Run the script; any failure is printed and reported through exit code 1.
main().then(undefined, (failure) => {
  console.error("\n❌", failure);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user