#!/usr/bin/env node
/**
 * fill-plant-images.ts — Fetch plant images from Wikipedia for plants missing them.
 *
 * Uses the Wikipedia API to look up the plant's scientific name (then its
 * common name, then its genus) and grab the page thumbnail. Plants that are
 * still missing an image afterwards are retried through the search API, and
 * whatever remains is written to a small Markdown report next to this script.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-plant-images.ts
 */

import { readFileSync, writeFileSync } from "fs";
import { resolve } from "path";

// Load env: copy KEY=VALUE pairs from .env.development into process.env,
// never overriding variables that are already set.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    const eqIdx = trimmed.indexOf("=");
    if (eqIdx <= 0) continue;
    const key = trimmed.slice(0, eqIdx).trim();
    const val = trimmed.slice(eqIdx + 1).trim();
    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // A missing env file is fine (best-effort load); anything else is worth a warning.
  const isMissingFile = err instanceof Error && (err as { code?: unknown }).code === "ENOENT";
  if (!isMissingFile) {
    console.warn(`Warning: could not read ${envPath}:`, err);
  }
}

// NOTE(review): these imports sit after the env block, presumably so the
// variables above are set before the db module reads them. Depending on the
// module target, import hoisting may evaluate them first — confirm.
import { getDb, closeDb } from "../src/lib/db/index";
import { plants } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";

const WIKI_API = "https://en.wikipedia.org/w/api.php";
const UA = "PlantHealthKB/1.0 (plant-images)";
const DELAY_MS = 500;
const BATCH_SIZE = 50;
const MAX_ATTEMPTS = 3;

/** Scientific-name values that are placeholders rather than real taxa. */
const PLACEHOLDER_NAMES = new Set(["Unknown", "Various"]);

/** Fields of a `prop=pageimages` page entry that this script reads. */
interface WikiPage {
  /** Present (as an empty string in the legacy JSON format) when the page does not exist. */
  missing?: unknown;
  thumbnail?: { source?: string };
}

interface PageImagesResponse {
  query?: { pages?: Record<string, WikiPage> };
}

interface SearchResponse {
  query?: { search?: Array<{ title: string; pageid: number }> };
}

interface ImageUpdate {
  id: string;
  url: string;
}

type DbClient = ReturnType<typeof createClient>;

const UPDATE_IMAGE_SQL =
  "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?";

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

/**
 * Normalise a scientific name for use as a Wikipedia title or search term:
 * drops the hybrid sign (×) and quote characters, removes a "spp." marker,
 * and collapses the whitespace left behind.
 */
function cleanScientificName(raw: string | null | undefined): string {
  return (raw ?? "")
    .replace(/[\u00D7'"]/g, "")
    .replace(/\s*\bspp\b\.?\s*/i, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Fetch the 400px thumbnail URL for the page(s) selected by `selector`
 * (e.g. `{ titles: "Rosa" }` or `{ pageids: "123" }`).
 *
 * Retries up to MAX_ATTEMPTS times: HTTP 429 backs off exponentially
 * (3s, 6s, 12s) and network/parse errors wait 2s. Any other non-OK status,
 * or a response without a usable thumbnail, yields `null`.
 */
async function fetchThumbnail(selector: Record<string, string>): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    ...selector,
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await sleep(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as PageImagesResponse;
      const pages = data?.query?.pages;
      if (!pages) return null;

      for (const page of Object.values(pages)) {
        // Test for the key's presence: the legacy JSON format reports a
        // missing page as `"missing": ""`, which a truthiness test overlooks.
        if (page.missing === undefined && page.thumbnail?.source) {
          return page.thumbnail.source;
        }
      }
      return null;
    } catch {
      await sleep(2000);
    }
  }
  return null;
}

/** Direct page lookup by title — more reliable for known scientific names. */
async function directPageLookup(title: string): Promise<string | null> {
  return fetchThumbnail({ titles: title });
}

/**
 * Run a full-text search and return the page ids of the top three hits
 * (empty when nothing matched or the request ultimately failed).
 * Uses the same retry policy as {@link fetchThumbnail}.
 */
async function searchPageIds(term: string): Promise<number[]> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: term,
    srlimit: "3",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await sleep(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return [];

      const data = (await res.json()) as SearchResponse;
      return (data?.query?.search ?? []).map((hit) => hit.pageid);
    } catch {
      await sleep(2000);
    }
  }
  return [];
}

/** Write all queued image URLs in one batch, then empty the queue in place. */
async function flushUpdates(client: DbClient, updates: ImageUpdate[]): Promise<void> {
  if (updates.length === 0) return;
  await client.batch(
    updates.map((u) => ({ sql: UPDATE_IMAGE_SQL, args: [u.url, u.id] })),
    "write",
  );
  console.log(` → Flushed ${updates.length} to DB`);
  updates.length = 0;
}

/**
 * Entry point: fill `image_url` for every plant that lacks one.
 *
 * Phase 1 tries direct title lookups and writes results in batches of
 * BATCH_SIZE; Phase 2 re-queries what is still missing and tries the search
 * API, writing each hit immediately. Finally prints a summary and, if any
 * plants remain without an image, writes `.plant-image-review-needed.md`.
 *
 * @throws Error when plants need images but DATABASE_URL is not set.
 */
async function main(): Promise<void> {
  console.log("\n🌿 Fetching plant images from Wikipedia\n");

  const db = getDb();
  let rawClient: DbClient | undefined;

  const selectMissing = () =>
    db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        scientificName: plants.scientificName,
      })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();

  try {
    const allPlants = await selectMissing();
    console.log(`📋 ${allPlants.length} plants need images\n`);

    if (allPlants.length === 0) {
      console.log("✅ All plants already have images!\n");
      return;
    }

    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error(`DATABASE_URL is not set (checked the environment and ${envPath})`);
    }
    rawClient = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    let found = 0;
    const updates: ImageUpdate[] = [];

    // Phase 1: Try direct page lookup by scientific name (most accurate)
    console.log("─── Phase 1: Direct page lookup ───\n");

    for (let i = 0; i < allPlants.length; i++) {
      const plant = allPlants[i];
      const sciName = cleanScientificName(plant.scientificName);
      const hasRealSciName = sciName !== "" && !PLACEHOLDER_NAMES.has(sciName);

      process.stdout.write(
        ` [${String(i + 1).padStart(3)}/${allPlants.length}] ${plant.commonName.padEnd(30)} `,
      );

      let url: string | null = null;

      // Try scientific name first
      if (hasRealSciName) {
        url = await directPageLookup(sciName);
      }

      // Try common name if scientific name didn't work
      if (!url) {
        url = await directPageLookup(plant.commonName);
      }

      // Try genus name — skipped for placeholders, and when the genus is the
      // whole name (that exact title was already tried above).
      if (!url && hasRealSciName) {
        const genus = sciName.split(/\s+/)[0];
        if (genus && genus.length > 3 && genus !== sciName) {
          url = await directPageLookup(genus);
        }
      }

      if (url) {
        updates.push({ id: plant.id, url });
        found++;
        process.stdout.write("✅\n");
      } else {
        process.stdout.write("⏭️\n");
      }

      // Flush to DB in batches
      if (updates.length >= BATCH_SIZE) {
        await flushUpdates(rawClient, updates);
      }

      await sleep(DELAY_MS);
    }

    // Flush remaining
    await flushUpdates(rawClient, updates);

    console.log(`\n✅ Phase 1 done: ${found}/${allPlants.length} plants got images\n`);

    // Phase 2: Try remaining via search API
    const stillMissing = await selectMissing();

    if (stillMissing.length > 0) {
      console.log(`─── Phase 2: Search API for ${stillMissing.length} remaining ───\n`);

      for (let i = 0; i < stillMissing.length; i++) {
        const plant = stillMissing[i];
        const sciName = cleanScientificName(plant.scientificName);

        process.stdout.write(
          ` [${String(i + 1).padStart(3)}/${stillMissing.length}] ${plant.commonName.padEnd(30)} `,
        );

        // Search with scientific name + common name; placeholder scientific
        // names are left out because they only add noise to the query.
        const searchTerm = [PLACEHOLDER_NAMES.has(sciName) ? "" : sciName, plant.commonName]
          .filter(Boolean)
          .join(" ");

        // Take the first search hit that has a thumbnail
        let url: string | null = null;
        for (const pageId of await searchPageIds(searchTerm)) {
          url = await fetchThumbnail({ pageids: String(pageId) });
          if (url) break;
        }

        if (url) {
          await rawClient.execute({ sql: UPDATE_IMAGE_SQL, args: [url, plant.id] });
          found++;
          process.stdout.write("✅\n");
        } else {
          process.stdout.write("❌\n");
        }

        await sleep(DELAY_MS);
      }
    }

    // Final count
    const final = await db
      .select({ id: plants.id, commonName: plants.commonName, imageUrl: plants.imageUrl })
      .from(plants)
      .all();

    const withImg = final.filter((p) => p.imageUrl);
    const withoutImg = final.filter((p) => !p.imageUrl);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 FINAL: ${final.length} plants`);
    console.log(` With images: ${withImg.length}`);
    console.log(` Missing images: ${withoutImg.length}`);

    if (withoutImg.length > 0) {
      console.log(`\n📝 Plants still needing images:`);
      withoutImg.forEach((p) => console.log(` ❌ ${p.id}: ${p.commonName}`));

      // Save to file for reference
      const reportPath = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images — Still Missing\n\n";
      report += `Generated: ${new Date().toISOString()}\n\n`;
      report += `## 🚫 Plants without images (${withoutImg.length})\n\n`;
      for (const p of withoutImg) {
        report += `- **${p.commonName}** (\`${p.id}\`)\n`;
      }
      writeFileSync(reportPath, report, "utf-8");
      console.log(` 📝 Review report: ${reportPath}`);
    } else {
      console.log("\n✅ All plants now have images!");
    }
  } finally {
    // Release both handles on every exit path, including failures.
    rawClient?.close();
    closeDb();
  }
}

main().catch((err) => {
  console.error("\n❌", err);
  process.exit(1);
});