#!/usr/bin/env node
/**
 * fill-plant-images-v2.ts — Batch Wikipedia image fetch for remaining plants.
 *
 * Phase 1: Query 50 scientific names at a time via pageimages.
 * Phase 2: Query 50 common names at a time.
 * Phase 3: Search individually for stragglers.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-plant-images-v2.ts
 */
|
|
|
|
import { readFileSync, writeFileSync } from "fs";
|
|
import { resolve } from "path";
|
|
|
|
// Load env: copy KEY=VALUE pairs from ../.env.development into process.env.
// Variables that already hold a non-empty value are never overridden.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    // Skip blank lines and `#` comments.
    if (!trimmed || trimmed.startsWith("#")) {
      continue;
    }
    // A usable entry needs a non-empty key before the first "=".
    const eqIdx = trimmed.indexOf("=");
    if (eqIdx <= 0) {
      continue;
    }
    const key = trimmed.slice(0, eqIdx).trim();
    const val = trimmed.slice(eqIdx + 1).trim();
    if (!process.env[key]) {
      process.env[key] = val;
    }
  }
} catch (e: unknown) {
  // Loading the file is best-effort: a missing file is expected and stays
  // silent, but any other failure (permissions, bad path, ...) is reported so
  // that a later "variable not set" problem can be traced back to it.
  const code =
    typeof e === "object" && e !== null && "code" in e
      ? (e as { code?: unknown }).code
      : undefined;
  if (code !== "ENOENT") {
    console.warn("Could not load env file " + envPath + ":", e);
  }
}
|
|
|
|
import { getDb, closeDb } from "../src/lib/db/index";
|
|
import { plants } from "../src/lib/db/schema";
|
|
import { createClient } from "@libsql/client";
|
|
import { sql } from "drizzle-orm";
|
|
|
|
/** Wikipedia API endpoint that every request in this script is sent to. */
const API = "https://en.wikipedia.org/w/api.php";

/** Value of the `User-Agent` header attached to every API request. */
const UA = "PlantHealthKB/1.0";

/** Number of plants handled per chunk (one titles query each) in the batch phases. */
const BATCH = 50;
|
|
|
|
/** The subset of a plant record this script reads when looking for images. */
interface PlantRow {
  /** Row identifier; used in the `WHERE id = ?` clause of the image updates. */
  id: string;
  /** Common name; used as the page title in phase 2, in the phase 3 search query, and in progress output. */
  commonName: string;
  /** Scientific name; cleaned and used as the page title in phase 1 and in the phase 3 search query. */
  scientificName: string;
}
|
|
|
|
/**
 * Normalize a scientific name into a string usable as a page title or
 * search term.
 *
 * Steps, in order:
 *  1. every `x`/`X` becomes lowercase `x`;
 *  2. the standalone token `spp` / `spp.` (any case) is removed — only as a
 *     whole word, so letters inside other words are left alone;
 *  3. periods, the multiplication sign (U+00D7) and apostrophes are removed;
 *  4. runs of whitespace left behind by the removals collapse to one space,
 *     and the result is trimmed.
 *
 * @param s Raw name, e.g. `"Rosa spp."` or `"Mentha × piperita"`.
 * @returns The cleaned name, e.g. `"Rosa"` or `"Mentha piperita"`.
 */
function clean(s: string): string {
  return s
    .replace(/[xX]/g, "x")
    // Replace with a space rather than nothing so neighbouring words never fuse.
    .replace(/\bspp\b\.?/gi, " ")
    .replace(/[.\u00d7']/g, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
|
|
/**
 * Fetch 400px lead-image thumbnails for a batch of Wikipedia page titles.
 *
 * Sends a single `action=query&prop=pageimages` request with redirects
 * enabled. The API reports each page under its canonical title, which can
 * differ from the requested title after normalization or a redirect, so the
 * returned map is keyed by BOTH the lowercased canonical title and the
 * lowercased requested title. Callers can therefore look a result up by the
 * exact title they asked for.
 *
 * Up to 3 attempts are made: HTTP 429 backs off for 5s, 10s, then 20s; a
 * thrown error (e.g. network failure or invalid JSON) waits 2s. Any other
 * non-OK status, or a response without `query.pages`, yields an empty map.
 *
 * @param titles Page titles to look up; an empty list skips the request.
 * @returns Map from lowercased title to thumbnail URL (empty on failure).
 */
async function fetchThumbs(titles: string[]): Promise<Map<string, string>> {
  type TitleMapping = { from: string; to: string };
  type ThumbPage = {
    title?: string;
    missing?: unknown;
    thumbnail?: { source?: string };
  };
  type QueryResponse = {
    query?: {
      normalized?: TitleMapping[];
      redirects?: TitleMapping[];
      pages?: Record<string, ThumbPage>;
    };
  };

  if (titles.length === 0) {
    return new Map();
  }

  const params = new URLSearchParams({
    action: "query",
    titles: titles.join("|"),
    prop: "pageimages",
    pithumbsize: "400",
    redirects: "1",
    format: "json",
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(API + "?" + params.toString(), {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        // Rate limited: exponential backoff, then try again.
        await new Promise((done) => setTimeout(done, 5000 * Math.pow(2, attempt)));
        continue;
      }
      if (!res.ok) {
        return new Map();
      }

      const body = (await res.json()) as QueryResponse;
      const pages = body?.query?.pages;
      if (!pages) {
        return new Map();
      }

      // Thumbnails keyed by the canonical title the API reports.
      const thumbs = new Map<string, string>();
      for (const page of Object.values(pages)) {
        const source = page.thumbnail?.source;
        // With format=json a missing page carries `missing: ""`, which is
        // falsy — test for the flag's presence rather than its truthiness.
        if (page.missing === undefined && typeof page.title === "string" && source) {
          thumbs.set(page.title.toLowerCase(), source);
        }
      }

      // Requested title -> normalized title -> redirect target(s).
      const normalized = new Map<string, string>();
      for (const entry of body?.query?.normalized ?? []) {
        normalized.set(entry.from, entry.to);
      }
      const redirects = new Map<string, string>();
      for (const entry of body?.query?.redirects ?? []) {
        redirects.set(entry.from, entry.to);
      }

      for (const requested of titles) {
        let resolved = normalized.get(requested) ?? requested;
        // Follow redirect hops; the cap guards against a circular mapping.
        for (let hop = 0; hop < 5; hop++) {
          const next = redirects.get(resolved);
          if (next === undefined) {
            break;
          }
          resolved = next;
        }
        const source = thumbs.get(resolved.toLowerCase());
        const key = requested.toLowerCase();
        if (source !== undefined && !thumbs.has(key)) {
          thumbs.set(key, source);
        }
      }

      return thumbs;
    } catch {
      // Network or parse failure: short pause, then retry.
      await new Promise((done) => setTimeout(done, 2000));
    }
  }
  return new Map();
}
|
|
|
|
/**
 * Run a Wikipedia full-text search and return the thumbnail of the
 * best-ranked hit that has one.
 *
 * Uses `generator=search` (top 3 hits) combined with `prop=pageimages`.
 * The `pages` object in the response is keyed by page id, so iterating it
 * directly visits hits in page-id order rather than relevance order; hits are
 * therefore sorted by the `index` field (search rank) before the first
 * thumbnail is picked.
 *
 * Up to 3 attempts are made: HTTP 429 backs off for 5s, 10s, then 20s; a
 * thrown error (e.g. network failure or invalid JSON) waits 2s.
 *
 * @param query Free-text search string.
 * @returns Thumbnail URL (400px wide), or `null` when the request fails, the
 *          search has no hits, or no hit has a thumbnail.
 */
async function searchOne(query: string): Promise<string | null> {
  type SearchPage = { index?: number; thumbnail?: { source?: string } };
  type SearchResponse = { query?: { pages?: Record<string, SearchPage> } };

  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "3",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
  });

  // Hits without a rank sort after ranked ones.
  const rankOf = (page: SearchPage): number =>
    typeof page.index === "number" ? page.index : Number.MAX_SAFE_INTEGER;

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(API + "?" + params.toString(), {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        // Rate limited: exponential backoff, then try again.
        await new Promise((done) => setTimeout(done, 5000 * Math.pow(2, attempt)));
        continue;
      }
      if (!res.ok) {
        return null;
      }

      const body = (await res.json()) as SearchResponse;
      const pages = body?.query?.pages;
      if (!pages) {
        return null;
      }

      // Array.prototype.sort is stable, so unranked hits keep their order.
      const ranked = Object.values(pages).sort((a, b) => rankOf(a) - rankOf(b));
      for (const page of ranked) {
        const source = page.thumbnail?.source;
        if (source) {
          return source;
        }
      }
      return null;
    } catch {
      // Network or parse failure: short pause, then retry.
      await new Promise((done) => setTimeout(done, 2000));
    }
  }
  return null;
}
|
|
|
|
/**
 * Run one batched title-lookup pass over a list of plants.
 *
 * Plants are processed in chunks of `BATCH`. For each chunk the titles
 * produced by `titleFn` (titles of two characters or fewer are not sent) are
 * passed to `fetchThumbs`, and every plant whose lowercased title has a
 * thumbnail gets an `image_url` update queued. Queued updates are written
 * through `dbClient.batch(..., "write")` once at least 100 have accumulated,
 * and once more after the last chunk. The function pauses 1.5s after each
 * chunk.
 *
 * @param plants   Plants still lacking an image.
 * @param titleFn  Derives the page title to look up for a plant.
 * @param label    Short tag printed in the progress output.
 * @param dbClient Client exposing `batch(statements, mode)`; `main` passes
 *                 the client returned by `createClient`.
 * @returns The plants for which no thumbnail was found, in input order.
 */
async function batchPhase(
  plants: PlantRow[],
  titleFn: (p: PlantRow) => string,
  label: string,
  dbClient: any,
): Promise<PlantRow[]> {
  const unmatched: PlantRow[] = [];
  const pending: Array<{ id: string; url: string }> = [];

  // Write every queued update in one batch, then empty the queue.
  const flush = async (): Promise<void> => {
    const statements = pending.map(({ id, url }) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [url, id],
    }));
    await dbClient.batch(statements, "write");
    pending.length = 0;
  };

  for (let start = 0; start < plants.length; start += BATCH) {
    const group = plants.slice(start, start + BATCH);
    const wanted = group.map(titleFn).filter((title) => title.length > 2);
    const end = Math.min(start + BATCH, plants.length);
    console.log(` [${label}] ${start + 1}-${end}/${plants.length} `);

    const thumbs = await fetchThumbs(wanted);
    let hits = 0;
    for (const plant of group) {
      const url = thumbs.get(titleFn(plant).toLowerCase());
      if (!url) {
        unmatched.push(plant);
        continue;
      }
      pending.push({ id: plant.id, url });
      hits += 1;
    }
    console.log(` found: ${hits}`);

    if (pending.length >= 100) {
      await flush();
    }
    // Pause between chunks before issuing the next request.
    await new Promise((wake) => setTimeout(wake, 1500));
  }

  if (pending.length > 0) {
    await flush();
  }

  return unmatched;
}
|
|
|
|
/**
 * Script entry point.
 *
 * Selects every plant whose `image_url` is NULL or empty, then tries three
 * strategies in order, each working only on what the previous one left over:
 *  1. batched title lookup by cleaned scientific name,
 *  2. batched title lookup by common name,
 *  3. one search request per remaining plant.
 *
 * Afterwards it prints totals for the whole table and, if any plant still has
 * no image, writes the list to `.plant-image-review-needed.md` next to this
 * script.
 *
 * Both database handles are released in `finally` blocks so they are closed
 * even when a phase throws.
 *
 * @throws Error when `DATABASE_URL` is not set; errors from the database or
 *         from writing the report propagate to the caller.
 */
async function main(): Promise<void> {
  console.log("\nPlant Image Filler v2\n");
  const db = getDb();
  try {
    const allPlants = (await db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        scientificName: plants.scientificName,
      })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all()) as PlantRow[];

    console.log("Plants needing images: " + allPlants.length + "\n");
    if (allPlants.length === 0) {
      console.log("All plants have images!\n");
      return;
    }

    // Fail with a clear message instead of handing `undefined` to the client.
    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error("DATABASE_URL is not set; cannot open the database for writing.");
    }
    const raw = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    try {
      // Phase 1: Scientific name
      console.log("--- Phase 1: Scientific names ---\n");
      let remaining = await batchPhase(allPlants, (p) => clean(p.scientificName), "sci", raw);

      // Phase 2: Common name
      if (remaining.length > 0) {
        console.log("\n--- Phase 2: Common names (" + remaining.length + ") ---\n");
        remaining = await batchPhase(remaining, (p) => p.commonName, "common", raw);
      }

      // Phase 3: Search
      if (remaining.length > 0) {
        console.log("\n--- Phase 3: Search (" + remaining.length + ") ---\n");
        let found = 0;
        for (let i = 0; i < remaining.length; i++) {
          const pl = remaining[i];
          const q = clean(pl.scientificName) + " " + pl.commonName;
          console.log(" [" + (i + 1) + "/" + remaining.length + "] " + pl.commonName);
          const img = await searchOne(q);
          if (img) {
            await raw.execute({
              sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
              args: [img, pl.id],
            });
            found++;
            console.log(" OK");
          } else {
            console.log(" MISS");
          }
          // Pause between search requests.
          await new Promise((r) => setTimeout(r, 500));
        }
        console.log(" found: " + found);
      }
    } finally {
      raw.close();
    }

    // Report
    const finalList = await db
      .select({
        id: plants.id,
        commonName: plants.commonName,
        imageUrl: plants.imageUrl,
      })
      .from(plants)
      .all();
    const w = finalList.filter((p) => p.imageUrl);
    const wo = finalList.filter((p) => !p.imageUrl);

    console.log("\n" + "=".repeat(50));
    console.log("FINAL: " + finalList.length + " plants");
    console.log(" With images: " + w.length);
    console.log(" Missing: " + wo.length);

    if (wo.length > 0) {
      const rp = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images - Still Missing\n\n";
      report += "Generated: " + new Date().toISOString() + "\n\n";
      report += "## Missing (" + wo.length + ")\n\n";
      for (const p of wo) {
        report += "- " + p.commonName + " (" + p.id + ")\n";
      }
      writeFileSync(rp, report, "utf-8");
      console.log("Report: " + rp);
    } else {
      console.log("\nALL PLANTS HAVE IMAGES!");
    }
  } finally {
    closeDb();
  }
}
|
|
|
|
// Kick off the script; on any rejection from main(), print the error and
// exit with status 1.
main().catch(function onFatal(err: unknown): void {
  console.error("Error:", err);
  process.exit(1);
});
|