#!/usr/bin/env node
/**
 * fill-plant-images-v2.ts — Batch Wikipedia image fetch for remaining plants.
 *
 * Phase 1: Query 50 scientific names at a time via pageimages.
 * Phase 2: Query 50 common names at a time.
 * Phase 3: Search individually for stragglers.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-plant-images-v2.ts
 */

import { readFileSync, writeFileSync } from "fs";
import { resolve } from "path";

// Load env: copy KEY=VALUE pairs from .env.development into process.env,
// skipping blank lines, `#` comments, and keys that already have a value.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    if (trimmed && !trimmed.startsWith("#")) {
      const eqIdx = trimmed.indexOf("=");
      if (eqIdx > 0) {
        const key = trimmed.slice(0, eqIdx).trim();
        const val = trimmed.slice(eqIdx + 1).trim();
        if (!process.env[key]) {
          process.env[key] = val;
        }
      }
    }
  }
} catch {
  // Best effort: the env file is optional. When it cannot be read the script
  // relies on whatever is already present in process.env.
}

// NOTE(review): these imports are kept below the env loader, as in the
// original file — presumably so the DB module sees the loaded variables.
// Confirm before moving them to the top.
import { getDb, closeDb } from "../src/lib/db/index";
import { plants } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";

const API = "https://en.wikipedia.org/w/api.php";
const UA = "PlantHealthKB/1.0";
const BATCH = 50;

/** Number of HTTP attempts made per Wikipedia request before giving up. */
const MAX_ATTEMPTS = 3;

/** Upper bound on normalization/redirect hops followed per requested title. */
const MAX_TITLE_HOPS = 5;

interface PlantRow {
  id: string;
  commonName: string;
  scientificName: string;
}

/** One entry of `query.pages` in a MediaWiki `action=query` JSON response. */
interface WikiPage {
  title?: string;
  /** Present (as an empty string in the default JSON format) for missing pages. */
  missing?: string;
  /** Search rank, set when the page came from `generator=search`. */
  index?: number;
  thumbnail?: { source?: string };
}

/** A `from` → `to` title mapping, as used by `normalized` and `redirects`. */
interface WikiTitleMapping {
  from: string;
  to: string;
}

/** The `query` object of a MediaWiki `action=query` JSON response. */
interface WikiQuery {
  pages?: Record<string, WikiPage>;
  normalized?: WikiTitleMapping[];
  redirects?: WikiTitleMapping[];
}

/** A pending `image_url` write for one plant row. */
interface ImageUpdate {
  id: string;
  url: string;
}

/** The part of the database client that `batchPhase` relies on. */
interface BatchClient {
  batch(
    statements: Array<{ sql: string; args: string[] }>,
    mode: "write",
  ): Promise<unknown>;
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/**
 * Prepares a scientific name for use as a Wikipedia title or search term.
 *
 * Lower-cases every `X`, removes `spp` / `spp.` together with the whitespace
 * around it, strips `.`, `×` (U+00D7) and `'`, then trims the result.
 */
function clean(s: string): string {
  return s
    .replace(/[xX]/g, "x")
    .replace(/\s*spp\.?\s*/gi, "")
    .replace(/[.\u00d7']/g, "")
    .trim();
}

/**
 * Sends one `action=query` request to the Wikipedia API with retries.
 *
 * HTTP 429 is retried with exponential backoff (5s, 10s, 20s); a thrown
 * network or JSON error is retried after 2s; any other non-OK status gives up
 * immediately.
 *
 * @returns The response's `query` object, or `null` when the request failed,
 *   the attempts ran out, or the response carried no `query`.
 */
async function queryWikipedia(params: URLSearchParams): Promise<WikiQuery | null> {
  const url = API + "?" + params.toString();
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    try {
      const res = await fetch(url, { headers: { "User-Agent": UA } });
      if (res.status === 429) {
        await sleep(5000 * Math.pow(2, attempt));
        continue;
      }
      if (!res.ok) {
        return null;
      }
      const body = (await res.json()) as { query?: WikiQuery } | null;
      return body?.query ?? null;
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn("  Wikipedia request failed (" + reason + "), retrying");
      await sleep(2000);
    }
  }
  return null;
}

/**
 * Looks up thumbnails for a batch of page titles in a single request.
 *
 * @param titles Page titles to query (the caller passes at most `BATCH`).
 * @returns Map from lower-cased title to thumbnail URL. It holds an entry for
 *   each returned page title that has a thumbnail, plus an entry for every
 *   requested title that the API normalized or redirected to such a page, so
 *   callers can look results up by the title they asked for.
 */
async function fetchThumbs(titles: string[]): Promise<Map<string, string>> {
  if (titles.length === 0) {
    return new Map<string, string>();
  }
  const params = new URLSearchParams({
    action: "query",
    titles: titles.join("|"),
    prop: "pageimages",
    pithumbsize: "400",
    // Ask for image data on every page of the batch rather than relying on
    // the API default.
    pilimit: "max",
    redirects: "1",
    format: "json",
  });

  const query = await queryWikipedia(params);
  const pages = query?.pages;
  if (!query || !pages) {
    return new Map<string, string>();
  }

  // Thumbnails keyed by the title of the page the API actually returned.
  const byPageTitle = new Map<string, string>();
  for (const page of Object.values(pages)) {
    const source = page.thumbnail?.source;
    if (page.missing === undefined && page.title && source) {
      byPageTitle.set(page.title.toLowerCase(), source);
    }
  }

  // The API reports how requested titles were rewritten: `normalized` for
  // spacing/capitalisation fixes and `redirects` for followed redirects.
  // Walk those mappings so each requested title finds its final page.
  const rewrites = new Map<string, string>();
  for (const step of [...(query.normalized ?? []), ...(query.redirects ?? [])]) {
    rewrites.set(step.from, step.to);
  }

  const result = new Map<string, string>(byPageTitle);
  for (const requested of titles) {
    let resolved = requested;
    let next = rewrites.get(resolved);
    for (let hops = 0; next !== undefined && hops < MAX_TITLE_HOPS; hops++) {
      resolved = next;
      next = rewrites.get(resolved);
    }
    const source = byPageTitle.get(resolved.toLowerCase());
    const key = requested.toLowerCase();
    if (source && !result.has(key)) {
      result.set(key, source);
    }
  }
  return result;
}

/**
 * Runs a full-text search and returns the thumbnail of the best-ranked hit
 * that has one.
 *
 * @param query Free-text search string.
 * @returns Thumbnail URL, or `null` when the request failed or none of the
 *   (up to three) hits has a thumbnail.
 */
async function searchOne(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "3",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
  });

  const result = await queryWikipedia(params);
  const pages = result?.pages;
  if (!pages) {
    return null;
  }

  // `pages` is keyed by page id, so object order is not search rank; the
  // generator's `index` field carries the rank.
  const ranked = Object.values(pages).sort(
    (a, b) =>
      (a.index ?? Number.MAX_SAFE_INTEGER) - (b.index ?? Number.MAX_SAFE_INTEGER),
  );
  for (const page of ranked) {
    const source = page.thumbnail?.source;
    if (source) {
      return source;
    }
  }
  return null;
}

/** Writes the pending image URLs in one batch and empties the queue. */
async function flushUpdates(dbClient: BatchClient, updates: ImageUpdate[]): Promise<void> {
  if (updates.length === 0) {
    return;
  }
  await dbClient.batch(
    updates.map((u) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  updates.length = 0;
}

/**
 * Resolves images for `plants` in chunks of `BATCH` titles and stores the hits.
 *
 * @param plants Rows still lacking an image (this parameter shadows the
 *   imported `plants` table inside the function).
 * @param titleFn Derives the Wikipedia title to try for a row; titles of two
 *   characters or fewer are not sent to the API.
 * @param label Short tag used in the progress output.
 * @param dbClient Client used for the batched UPDATE statements.
 * @returns The rows for which no thumbnail was found.
 */
async function batchPhase(
  plants: PlantRow[],
  titleFn: (p: PlantRow) => string,
  label: string,
  dbClient: BatchClient,
): Promise<PlantRow[]> {
  const remaining: PlantRow[] = [];
  const updates: ImageUpdate[] = [];

  for (let i = 0; i < plants.length; i += BATCH) {
    const chunk = plants.slice(i, i + BATCH);
    const titles = chunk.map(titleFn).filter((t) => t.length > 2);
    console.log(
      " [" +
        label +
        "] " +
        (i + 1) +
        "-" +
        Math.min(i + BATCH, plants.length) +
        "/" +
        plants.length +
        " ",
    );

    const imageMap = await fetchThumbs(titles);
    let n = 0;
    for (const pl of chunk) {
      const img = imageMap.get(titleFn(pl).toLowerCase());
      if (img) {
        updates.push({ id: pl.id, url: img });
        n++;
      } else {
        remaining.push(pl);
      }
    }
    console.log(" found: " + n);

    // Write in groups of roughly 100 so progress survives a later failure.
    if (updates.length >= 100) {
      await flushUpdates(dbClient, updates);
    }
    // Pause between API requests.
    await sleep(1500);
  }

  await flushUpdates(dbClient, updates);
  return remaining;
}

/**
 * Script entry point: loads the plants without an image, runs the three
 * lookup phases, then prints a summary and — if any plant is still missing an
 * image — writes `.plant-image-review-needed.md` next to this script.
 *
 * @throws Error when `DATABASE_URL` is not set.
 */
async function main(): Promise<void> {
  console.log("\nPlant Image Filler v2\n");
  const db = getDb();
  try {
    const allPlants = (await db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        scientificName: plants.scientificName,
      })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all()) as PlantRow[];

    console.log("Plants needing images: " + allPlants.length + "\n");
    if (allPlants.length === 0) {
      console.log("All plants have images!\n");
      return;
    }

    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error(
        "DATABASE_URL is not set (looked in the environment and " + envPath + ")",
      );
    }
    const raw = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    try {
      // Phase 1: Scientific name
      console.log("--- Phase 1: Scientific names ---\n");
      let remaining = await batchPhase(
        allPlants,
        (p) => clean(p.scientificName),
        "sci",
        raw,
      );

      // Phase 2: Common name
      if (remaining.length > 0) {
        console.log("\n--- Phase 2: Common names (" + remaining.length + ") ---\n");
        remaining = await batchPhase(remaining, (p) => p.commonName, "common", raw);
      }

      // Phase 3: Search
      if (remaining.length > 0) {
        console.log("\n--- Phase 3: Search (" + remaining.length + ") ---\n");
        for (const [i, pl] of remaining.entries()) {
          const q = clean(pl.scientificName) + " " + pl.commonName;
          console.log(" [" + (i + 1) + "/" + remaining.length + "] " + pl.commonName);
          const img = await searchOne(q);
          if (img) {
            await raw.execute({
              sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
              args: [img, pl.id],
            });
            console.log(" OK");
          } else {
            console.log(" MISS");
          }
          await sleep(500);
        }
      }
    } finally {
      // Release the raw client even when a phase throws.
      raw.close();
    }

    // Report
    const finalList = await db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        imageUrl: plants.imageUrl,
      })
      .from(plants)
      .all();
    const w = finalList.filter((p) => p.imageUrl);
    const wo = finalList.filter((p) => !p.imageUrl);

    console.log("\n" + "=".repeat(50));
    console.log("FINAL: " + finalList.length + " plants");
    console.log(" With images: " + w.length);
    console.log(" Missing: " + wo.length);

    if (wo.length > 0) {
      const rp = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images - Still Missing\n\n";
      report += "Generated: " + new Date().toISOString() + "\n\n";
      report += "## Missing (" + wo.length + ")\n\n";
      for (const p of wo) {
        report += "- " + p.commonName + " (" + p.id + ")\n";
      }
      writeFileSync(rp, report, "utf-8");
      console.log("Report: " + rp);
    } else {
      console.log("\nALL PLANTS HAVE IMAGES!");
    }
  } finally {
    closeDb();
  }
}

main().catch((err: unknown) => {
  console.error("Error:", err);
  process.exit(1);
});