Files
plant-disease-id/apps/web/scripts/fill-plant-images-v2.ts
2026-06-06 15:09:46 -04:00

302 lines
7.9 KiB
JavaScript

#!/usr/bin/env node
/**
* fill-plant-images-v2.ts — Batch Wikipedia image fetch for remaining plants.
*
* Phase 1: Query 50 scientific names at a time via pageimages.
* Phase 2: Query 50 common names at a time.
* Phase 3: Search individually for stragglers.
*
* Usage: cd apps/web && npx tsx scripts/fill-plant-images-v2.ts
*/
import { readFileSync, writeFileSync } from "fs";
import { resolve } from "path";
// Load env
// Best-effort loader for ../.env.development: copies KEY=VALUE pairs into
// process.env without replacing variables that already hold a non-empty value.
// A missing file is fine (the variables may come from the real environment);
// any other read failure is reported instead of being silently ignored.
// NOTE(review): static `import` declarations are hoisted, so modules imported
// further down this file are evaluated before this code runs — confirm none of
// them read these variables at import time.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    // Skip blank lines and comments.
    if (!trimmed || trimmed.startsWith("#")) {
      continue;
    }
    const eqIdx = trimmed.indexOf("=");
    // Lines without "=" (or with an empty key) are not assignments.
    if (eqIdx <= 0) {
      continue;
    }
    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();
    // Drop one pair of matching surrounding quotes so KEY="value" yields value,
    // as dotenv-style loaders do.
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }
    if (!process.env[key]) {
      process.env[key] = val;
    }
  }
} catch (e: unknown) {
  const code =
    typeof e === "object" && e !== null && "code" in e
      ? (e as { code?: unknown }).code
      : undefined;
  if (code !== "ENOENT") {
    console.warn("Could not load " + envPath + ":", e);
  }
}
import { getDb, closeDb } from "../src/lib/db/index";
import { plants } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
// Endpoint of the English Wikipedia Action API; every request in this script is sent here.
const API = "https://en.wikipedia.org/w/api.php";
// Value sent as the User-Agent header on each API request.
const UA = "PlantHealthKB/1.0";
// Number of plants processed per batch (one title per plant, one API request per batch).
// NOTE(review): presumably chosen to match the API's per-request cap on `titles` — confirm.
const BATCH = 50;
/**
 * The subset of a `plants` row this script works with: the columns selected
 * in main() for plants whose image_url is NULL or empty.
 */
interface PlantRow {
// Primary key used in the UPDATE ... WHERE id = ? statements.
id: string;
// Used as the page title in phase 2 and as part of the search query in phase 3.
commonName: string;
// Passed through clean() to build the page title in phase 1 and the search query in phase 3.
scientificName: string;
}
/**
 * Turns a scientific name into a plain string usable as a page title or
 * search term.
 *
 * Steps, in order:
 *  1. every "x"/"X" becomes lowercase "x"
 *     (NOTE(review): this touches all x characters, not only a standalone
 *     hybrid marker — confirm that is intended);
 *  2. the standalone abbreviation "spp" / "spp." is dropped;
 *  3. dots, the multiplication sign (U+00D7) and apostrophes are dropped;
 *  4. runs of whitespace left behind by the removals collapse to one space,
 *     and the result is trimmed.
 *
 * Step 2 only matches "spp" as a whole word and leaves a space in its place,
 * so neighbouring words are never glued together; step 4 guarantees the
 * result never contains double spaces (e.g. after removing U+00D7), which
 * would otherwise not compare equal to a title echoed back by the API.
 *
 * @param s raw scientific name
 * @returns the cleaned name (possibly empty)
 */
function clean(s: string): string {
  return s
    .replace(/[xX]/g, "x")
    .replace(/\bspp\b\.?/gi, " ")
    .replace(/[.\u00d7']/g, "")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Fetches page-image thumbnails for a set of page titles with a single
 * `action=query&prop=pageimages` request.
 *
 * The returned map is keyed by lowercased title. It contains an entry for the
 * title of every returned page that has a thumbnail and, additionally, an
 * entry for every *requested* title that the API normalized or redirected to
 * such a page. Without the second set of keys a caller that looks up the
 * title it asked for would miss every page reached through a redirect, even
 * though the request asks for redirects to be resolved.
 *
 * Retry policy: up to 3 attempts. HTTP 429 waits 5s, 10s, 20s before the next
 * attempt; a thrown error (network failure, invalid JSON) is logged and waits
 * 2s. Any other non-OK status, or a response without `query.pages`, yields an
 * empty map immediately.
 *
 * @param titles page titles to look up; an empty array returns an empty map
 *               without making a request
 * @returns map of lowercased title -> thumbnail URL (empty on failure)
 */
async function fetchThumbs(titles: string[]): Promise<Map<string, string>> {
  // Only the parts of the response this function reads.
  type TitleMapping = { from?: string; to?: string };
  type ThumbPage = { title?: string; thumbnail?: { source?: string } };
  type ThumbResponse = {
    query?: {
      normalized?: TitleMapping[];
      redirects?: TitleMapping[];
      pages?: Record<string, ThumbPage>;
    };
  };

  if (titles.length === 0) {
    return new Map();
  }
  const params = new URLSearchParams({
    action: "query",
    titles: titles.join("|"),
    prop: "pageimages",
    pithumbsize: "400",
    redirects: "1",
    format: "json",
  });
  const url = API + "?" + params.toString();

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const response = await fetch(url, {
        headers: { "User-Agent": UA },
      });
      if (response.status === 429) {
        // Rate limited: exponential backoff before the next attempt.
        await new Promise((done) => setTimeout(done, 5000 * Math.pow(2, attempt)));
        continue;
      }
      if (!response.ok) {
        return new Map();
      }
      const data = (await response.json()) as ThumbResponse;
      const pages = data?.query?.pages;
      if (!pages) {
        return new Map();
      }

      const result = new Map<string, string>();
      // Exact page title -> thumbnail, used to resolve requested titles below.
      const thumbByPageTitle = new Map<string, string>();
      for (const page of Object.values(pages)) {
        const source = page.thumbnail?.source;
        // The "missing" flag can be an empty string, so test for presence
        // rather than truthiness.
        if ("missing" in page || !source || typeof page.title !== "string") {
          continue;
        }
        thumbByPageTitle.set(page.title, source);
        result.set(page.title.toLowerCase(), source);
      }

      // Map each requested title through normalization and redirects to the
      // page it ended up on.
      const toLookup = (entries: TitleMapping[] | undefined): Map<string, string> => {
        const lookup = new Map<string, string>();
        for (const entry of entries ?? []) {
          if (typeof entry.from === "string" && typeof entry.to === "string") {
            lookup.set(entry.from, entry.to);
          }
        }
        return lookup;
      };
      const normalized = toLookup(data.query?.normalized);
      const redirects = toLookup(data.query?.redirects);
      for (const requested of titles) {
        const key = requested.toLowerCase();
        if (result.has(key)) {
          continue;
        }
        let resolved = normalized.get(requested) ?? requested;
        // Bounded so a malformed redirect cycle cannot loop forever.
        for (let hop = 0; hop < 5; hop++) {
          const next = redirects.get(resolved);
          if (next === undefined) {
            break;
          }
          resolved = next;
        }
        const source = thumbByPageTitle.get(resolved);
        if (source) {
          result.set(key, source);
        }
      }
      return result;
    } catch (e: unknown) {
      console.warn(
        "fetchThumbs attempt " + (attempt + 1) + " failed:",
        e instanceof Error ? e.message : e,
      );
      await new Promise((done) => setTimeout(done, 2000));
    }
  }
  return new Map();
}
/**
 * Runs a full-text search (`generator=search`, top 3 hits) and returns the
 * thumbnail of the best-ranked hit that has one.
 *
 * The `pages` object of the response is keyed by page id, so iterating it
 * directly visits hits in page-id order rather than relevance order. Hits are
 * therefore sorted by the `index` field the search generator attaches to each
 * page before picking; hits without an `index` keep their relative order and
 * sort last.
 *
 * Retry policy: up to 3 attempts. HTTP 429 waits 5s, 10s, 20s before the next
 * attempt; a thrown error (network failure, invalid JSON) is logged and waits
 * 2s. Any other non-OK status, or a response without `query.pages`, returns
 * null immediately.
 *
 * @param query free-text search string
 * @returns thumbnail URL, or null when nothing suitable was found
 */
async function searchOne(query: string): Promise<string | null> {
  // Only the parts of the response this function reads.
  type SearchPage = { index?: number; thumbnail?: { source?: string } };
  type SearchResponse = { query?: { pages?: Record<string, SearchPage> } };

  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "3",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
  });
  const url = API + "?" + params.toString();

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const response = await fetch(url, {
        headers: { "User-Agent": UA },
      });
      if (response.status === 429) {
        // Rate limited: exponential backoff before the next attempt.
        await new Promise((done) => setTimeout(done, 5000 * Math.pow(2, attempt)));
        continue;
      }
      if (!response.ok) {
        return null;
      }
      const data = (await response.json()) as SearchResponse;
      const pages = data?.query?.pages;
      if (!pages) {
        return null;
      }
      const rankOf = (page: SearchPage): number =>
        typeof page.index === "number" ? page.index : Number.MAX_SAFE_INTEGER;
      // Array.prototype.sort is stable, so ties keep the original order.
      const ranked = Object.values(pages).sort((a, b) => rankOf(a) - rankOf(b));
      for (const page of ranked) {
        const source = page.thumbnail?.source;
        if (source) {
          return source;
        }
      }
      return null;
    } catch (e: unknown) {
      console.warn(
        "searchOne attempt " + (attempt + 1) + " failed:",
        e instanceof Error ? e.message : e,
      );
      await new Promise((done) => setTimeout(done, 2000));
    }
  }
  return null;
}
/**
 * Runs one batched lookup phase over `plants`.
 *
 * Plants are processed BATCH at a time: a title is derived for each plant via
 * `titleFn`, titles of 2 characters or fewer are left out of the request, and
 * the thumbnails returned by fetchThumbs are matched back by lowercased
 * title. Matches are queued as UPDATE statements and written through
 * `dbClient.batch(..., "write")` whenever 100 or more are queued, plus once
 * at the end for any leftovers. The loop pauses 1.5s after every batch.
 *
 * @param plants   rows still lacking an image
 * @param titleFn  derives the page title to look up for a row
 * @param label    short tag printed in the progress output
 * @param dbClient client exposing `batch(statements, mode)`
 * @returns the rows for which no thumbnail was found, in input order
 */
async function batchPhase(
  plants: PlantRow[],
  titleFn: (p: PlantRow) => string,
  label: string,
  dbClient: any,
): Promise<PlantRow[]> {
  const unmatched: PlantRow[] = [];
  const pending: Array<{ id: string; url: string }> = [];

  // Writes everything queued so far in one batch and empties the queue.
  const flushPending = async (): Promise<void> => {
    const statements = pending.map(({ id, url }) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [url, id],
    }));
    await dbClient.batch(statements, "write");
    pending.length = 0;
  };

  const total = plants.length;
  for (let start = 0; start < total; start += BATCH) {
    const group = plants.slice(start, start + BATCH);
    const wanted = group.map(titleFn).filter((title) => title.length > 2);
    const lastShown = Math.min(start + BATCH, total);
    console.log(` [${label}] ${start + 1}-${lastShown}/${total} `);

    const thumbs = await fetchThumbs(wanted);
    let hits = 0;
    for (const plant of group) {
      const url = thumbs.get(titleFn(plant).toLowerCase());
      if (!url) {
        unmatched.push(plant);
        continue;
      }
      pending.push({ id: plant.id, url });
      hits += 1;
    }
    console.log(" found: " + hits);

    if (pending.length >= 100) {
      await flushPending();
    }
    // Pause between API requests.
    await new Promise((wake) => setTimeout(wake, 1500));
  }

  if (pending.length > 0) {
    await flushPending();
  }
  return unmatched;
}
/**
 * Script entry point.
 *
 * 1. Selects every plant whose image_url is NULL or empty.
 * 2. Phase 1: batched title lookup by cleaned scientific name.
 * 3. Phase 2: batched title lookup by common name for what is left.
 * 4. Phase 3: one search per remaining plant, written to the DB immediately.
 * 5. Re-reads all plants, prints totals and, if any are still missing an
 *    image, writes `.plant-image-review-needed.md` next to this script.
 *
 * Both database handles are released in `finally` blocks so they are closed
 * even when a phase throws.
 *
 * @throws Error when DATABASE_URL is not set (needed for the raw client), or
 *         whatever the database layer throws.
 */
async function main(): Promise<void> {
  console.log("\nPlant Image Filler v2\n");
  const db = getDb();
  try {
    const allPlants = (await db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        scientificName: plants.scientificName,
      })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all()) as PlantRow[];
    console.log("Plants needing images: " + allPlants.length + "\n");
    if (allPlants.length === 0) {
      console.log("All plants have images!\n");
      return;
    }

    // Fail with a clear message instead of handing `undefined` to the client.
    const dbUrl = process.env.DATABASE_URL;
    if (!dbUrl) {
      throw new Error("DATABASE_URL is not set; cannot open the database for image updates");
    }
    const raw = createClient({
      url: dbUrl,
      authToken: process.env.DATABASE_TOKEN,
    });
    try {
      // Phase 1: Scientific name
      console.log("--- Phase 1: Scientific names ---\n");
      let remaining = await batchPhase(allPlants, (p) => clean(p.scientificName), "sci", raw);

      // Phase 2: Common name
      if (remaining.length > 0) {
        console.log("\n--- Phase 2: Common names (" + remaining.length + ") ---\n");
        remaining = await batchPhase(remaining, (p) => p.commonName, "common", raw);
      }

      // Phase 3: Search
      if (remaining.length > 0) {
        console.log("\n--- Phase 3: Search (" + remaining.length + ") ---\n");
        let found = 0;
        for (const [i, pl] of remaining.entries()) {
          const q = clean(pl.scientificName) + " " + pl.commonName;
          console.log(" [" + (i + 1) + "/" + remaining.length + "] " + pl.commonName);
          const img = await searchOne(q);
          if (img) {
            await raw.execute({
              sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
              args: [img, pl.id],
            });
            found++;
            console.log(" OK");
          } else {
            console.log(" MISS");
          }
          // Pause between search requests.
          await new Promise((r) => setTimeout(r, 500));
        }
        console.log("\n Search matches: " + found + "/" + remaining.length);
      }
    } finally {
      raw.close();
    }

    // Report
    const finalList = await db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        imageUrl: plants.imageUrl,
      })
      .from(plants)
      .all();
    const w = finalList.filter((p) => p.imageUrl);
    const wo = finalList.filter((p) => !p.imageUrl);
    console.log("\n" + "=".repeat(50));
    console.log("FINAL: " + finalList.length + " plants");
    console.log(" With images: " + w.length);
    console.log(" Missing: " + wo.length);
    if (wo.length > 0) {
      const rp = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images - Still Missing\n\n";
      report += "Generated: " + new Date().toISOString() + "\n\n";
      report += "## Missing (" + wo.length + ")\n\n";
      for (const p of wo) {
        report += "- " + p.commonName + " (" + p.id + ")\n";
      }
      writeFileSync(rp, report, "utf-8");
      console.log("Report: " + rp);
    } else {
      console.log("\nALL PLANTS HAVE IMAGES!");
    }
  } finally {
    closeDb();
  }
}
// Kick off the script; any rejection from main() is printed and turned into
// a non-zero exit status.
void main().then(undefined, (failure: any) => {
  console.error("Error:", failure);
  process.exit(1);
});