Files
plant-disease-id/apps/web/scripts/fill-plant-images.ts
2026-06-06 15:09:46 -04:00

309 lines
9.3 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* fill-plant-images.ts — Fetch plant images from Wikipedia for plants missing them.
*
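* Uses the Wikipedia API in two phases: first a direct page lookup by
* scientific name, then common name, then genus; after that, a full-text
* search for any plants still missing an image. The page thumbnail URL that
* is found gets written to the plant's image_url column.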
*
* Usage: cd apps/web && npx tsx scripts/fill-plant-images.ts
*/
import { readFileSync, writeFileSync } from "fs";
import { resolve } from "path";
// Load env: seed process.env from ../.env.development. Variables that are
// already set (to a non-empty value) in the real environment are not overridden.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  // Accept both LF and CRLF line endings.
  for (const line of env.split(/\r?\n/)) {
    const trimmed = line.trim();
    // Skip blank lines and comments.
    if (!trimmed || trimmed.startsWith("#")) continue;
    const eqIdx = trimmed.indexOf("=");
    // A line needs a non-empty key before the first "=".
    if (eqIdx <= 0) continue;
    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();
    // Strip one pair of matching surrounding quotes (KEY="value" / KEY='value'),
    // so a quoted value does not carry its quotes into process.env.
    const quote = val[0];
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }
    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // Loading is best-effort: a missing file is expected (the variables may come
  // from the real environment). Anything else is reported, but not fatal.
  const code = (err as { code?: unknown } | null)?.code;
  if (code !== "ENOENT") {
    console.warn(`⚠️ Could not load ${envPath}:`, err);
  }
}
import { getDb, closeDb } from "../src/lib/db/index";
import { plants } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
// Endpoint of the English Wikipedia MediaWiki API; every request in this script goes here.
const WIKI_API = "https://en.wikipedia.org/w/api.php";
// Value sent as the User-Agent header on every Wikipedia request.
const UA = "PlantHealthKB/1.0 (plant-images)";
// Pause, in milliseconds, inserted after each plant is processed.
const DELAY_MS = 500;
// Number of queued image updates that triggers a batched write to the DB.
const BATCH_SIZE = 50;
/**
 * Direct page lookup by title — more reliable for known scientific names.
 *
 * Asks the MediaWiki query API for the `pageimages` thumbnail (400px wide) of
 * the page named `title`. Redirects are followed, so a title that merely
 * redirects to another article still yields that article's thumbnail.
 *
 * HTTP 429 responses are retried with exponential backoff (3s, 6s) and
 * network/parse errors after a 2s pause, for at most three attempts.
 *
 * @param title Wikipedia page title to look up.
 * @returns The thumbnail URL, or `null` when the title is blank, the page is
 *   missing or has no thumbnail, the response is a non-429 HTTP error, or all
 *   attempts fail.
 */
async function directPageLookup(title: string): Promise<string | null> {
  const wanted = title.trim();
  // Nothing to look up; avoid a pointless request.
  if (!wanted) return null;

  const params = new URLSearchParams({
    action: "query",
    titles: wanted,
    // Resolve redirects server-side; a redirect stub has no page image of its own.
    redirects: "1",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
    origin: "*",
  });

  const maxAttempts = 3;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Sleeping after the final attempt would only delay the caller.
    const willRetry = attempt < maxAttempts - 1;
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        if (willRetry) {
          await new Promise((r) => setTimeout(r, 3000 * 2 ** attempt));
        }
        continue;
      }
      if (!res.ok) return null;
      const data = (await res.json()) as {
        query?: { pages?: Record<string, { thumbnail?: { source: string }; missing?: unknown }> };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;
      for (const page of Object.values(pages)) {
        // With format=json a missing page is flagged as `"missing": ""`, which is
        // falsy, so test for the key's presence rather than its truthiness.
        if (!("missing" in page) && page.thumbnail?.source) return page.thumbnail.source;
      }
      return null;
    } catch {
      // Network or JSON failure: pause briefly, then retry if attempts remain.
      if (willRetry) {
        await new Promise((r) => setTimeout(r, 2000));
      }
    }
  }
  return null;
}
/** Resolve after `ms` milliseconds; used to pace requests and retries. */
function pauseMs(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/** Write every queued image URL to the `plants` table in one batch, then empty the queue. */
async function flushImageUpdates(
  client: ReturnType<typeof createClient>,
  queue: { id: string; url: string }[],
): Promise<void> {
  if (queue.length === 0) return;
  await client.batch(
    queue.map((u) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(` → Flushed ${queue.length} to DB`);
  queue.length = 0;
}

/**
 * Full-text search Wikipedia for `searchTerm` and return the thumbnail URL of
 * the first of the top three hits that has one, or `null` if none does.
 * HTTP 429 on the search request is retried with backoff; thrown errors are
 * retried after a 2s pause (three attempts in total).
 */
async function searchForThumbnail(searchTerm: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: searchTerm,
    srlimit: "3",
    format: "json",
    origin: "*",
  });
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await pauseMs(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string; pageid: number }> };
      };
      const hits = data?.query?.search ?? [];
      // Walk the hits in rank order and take the first thumbnail available.
      for (const hit of hits) {
        const pageParams = new URLSearchParams({
          action: "query",
          pageids: String(hit.pageid),
          prop: "pageimages",
          pithumbsize: "400",
          format: "json",
          origin: "*",
        });
        const pageRes = await fetch(`${WIKI_API}?${pageParams}`, {
          headers: { "User-Agent": UA },
        });
        if (!pageRes.ok) continue;
        const pageData = (await pageRes.json()) as {
          query?: { pages?: Record<string, { thumbnail?: { source: string } }> };
        };
        const pages = pageData?.query?.pages;
        if (!pages) continue;
        for (const page of Object.values(pages)) {
          if (page.thumbnail?.source) return page.thumbnail.source;
        }
      }
      return null;
    } catch {
      // Network or JSON failure: pause, then retry the whole search.
      await pauseMs(2000);
    }
  }
  return null;
}

/**
 * Script entry point: find plants whose `image_url` is NULL or empty and fill
 * it with a Wikipedia thumbnail URL.
 *
 * Phase 1 tries direct page lookups (scientific name, then common name, then
 * genus) and writes results in batches of `BATCH_SIZE`. Phase 2 re-queries the
 * plants still missing an image and tries a full-text search for each. A
 * summary is printed at the end, and plants that still have no image are
 * listed in `.plant-image-review-needed.md` next to this script.
 *
 * Database handles are closed even when a step throws.
 *
 * @throws Error when `DATABASE_URL` is not set, or when a DB/file operation fails.
 */
async function main(): Promise<void> {
  console.log("\n🌿 Fetching plant images from Wikipedia\n");
  const db = getDb();
  let rawClient: ReturnType<typeof createClient> | undefined;
  try {
    const allPlants = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();
    console.log(`📋 ${allPlants.length} plants need images\n`);
    if (allPlants.length === 0) {
      console.log("✅ All plants already have images!\n");
      return;
    }

    // Fail with a clear message instead of handing `undefined` to the client.
    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error("DATABASE_URL is not set; cannot write image URLs to the database");
    }
    const client = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });
    rawClient = client;

    let found = 0;
    const updates: { id: string; url: string }[] = [];

    // Phase 1: Try direct page lookup by scientific name (most accurate)
    console.log("─── Phase 1: Direct page lookup ───\n");
    for (const [i, plant] of allPlants.entries()) {
      const sciName = plant.scientificName
        .replace(/[×'"]/g, "")
        .replace(/\s*spp\.?\s*/i, "")
        .trim();
      // "Unknown" / "Various" are placeholders, not real taxon names.
      const hasUsableSciName = sciName !== "" && sciName !== "Unknown" && sciName !== "Various";
      process.stdout.write(
        ` [${String(i + 1).padStart(3)}/${allPlants.length}] ${plant.commonName.padEnd(30)} `,
      );

      let url: string | null = null;
      // Try scientific name first
      if (hasUsableSciName) {
        url = await directPageLookup(sciName);
      }
      // Try common name if scientific name didn't work
      if (!url) {
        url = await directPageLookup(plant.commonName);
      }
      // Try genus name — skipped for placeholder names, and when the genus is
      // the whole name (that exact title was already tried above).
      if (!url && hasUsableSciName) {
        const genus = sciName.split(/\s+/)[0];
        if (genus && genus.length > 3 && genus !== sciName) {
          url = await directPageLookup(genus);
        }
      }

      if (url) {
        updates.push({ id: plant.id, url });
        found++;
        process.stdout.write("✅\n");
      } else {
        process.stdout.write("⏭️\n");
      }

      // Flush to DB in batches
      if (updates.length >= BATCH_SIZE) {
        await flushImageUpdates(client, updates);
      }
      await pauseMs(DELAY_MS);
    }
    // Flush remaining
    await flushImageUpdates(client, updates);
    console.log(`\n✅ Phase 1 done: ${found}/${allPlants.length} plants got images\n`);

    // Phase 2: Try remaining via search API
    const stillMissing = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();
    if (stillMissing.length > 0) {
      console.log(`─── Phase 2: Search API for ${stillMissing.length} remaining ───\n`);
      for (const [i, plant] of stillMissing.entries()) {
        const sciName = plant.scientificName.replace(/[×'"]/g, "").trim();
        process.stdout.write(
          ` [${String(i + 1).padStart(3)}/${stillMissing.length}] ${plant.commonName.padEnd(30)} `,
        );
        // Search with scientific name plus common name
        const url = await searchForThumbnail(`${sciName} ${plant.commonName}`);
        if (url) {
          await client.execute({
            sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
            args: [url, plant.id],
          });
          found++;
          process.stdout.write("✅\n");
        } else {
          process.stdout.write("❌\n");
        }
        await pauseMs(DELAY_MS);
      }
    }

    // Final count
    const everyPlant = await db
      .select({ id: plants.id, commonName: plants.commonName, imageUrl: plants.imageUrl })
      .from(plants)
      .all();
    const withImg = everyPlant.filter((p) => p.imageUrl);
    const withoutImg = everyPlant.filter((p) => !p.imageUrl);
    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 FINAL: ${everyPlant.length} plants`);
    console.log(` With images: ${withImg.length}`);
    console.log(` Missing images: ${withoutImg.length}`);
    if (withoutImg.length > 0) {
      console.log(`\n📝 Plants still needing images:`);
      withoutImg.forEach((p) => console.log(`${p.id}: ${p.commonName}`));
      // Save to file for reference
      const reportPath = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images — Still Missing\n\n";
      report += `Generated: ${new Date().toISOString()}\n\n`;
      report += `## 🚫 Plants without images (${withoutImg.length})\n\n`;
      for (const p of withoutImg) {
        report += `- **${p.commonName}** (\`${p.id}\`)\n`;
      }
      writeFileSync(reportPath, report, "utf-8");
      console.log(` 📝 Review report: ${reportPath}`);
    } else {
      console.log("\n✅ All plants now have images!");
    }
  } finally {
    // Release DB handles on success, early return, and failure alike.
    rawClient?.close();
    closeDb();
  }
}
// Run the script. Any rejection from main() is printed and the process exits
// with status 1.
const reportFatalAndExit = (failure: unknown): void => {
  console.error("\n❌", failure);
  process.exit(1);
};
main().catch(reportFatalAndExit);