#!/usr/bin/env node
/**
 * fill-plant-images.ts — Fetch plant images from Wikipedia for plants missing them.
 *
 * Uses the Wikipedia API to search for the plant's scientific name
 * and grab the page thumbnail.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-plant-images.ts
 */
|
||
|
||
import { readFileSync, writeFileSync } from "fs";
|
||
import { resolve } from "path";
|
||
|
||
// Load env: copy KEY=VALUE pairs from ../.env.development into process.env.
// Variables that already hold a non-empty value are left alone, and the file
// is optional — any failure while reading it is ignored on purpose.
const envPath = resolve(__dirname, "../.env.development");
try {
  const entries = readFileSync(envPath, "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    // Drop blank lines and "#" comments.
    .filter((entry) => entry !== "" && !entry.startsWith("#"));

  for (const entry of entries) {
    const separator = entry.indexOf("=");
    // Ignore lines without "=" and lines whose key would be empty.
    if (separator <= 0) continue;

    const name = entry.slice(0, separator).trim();
    const value = entry.slice(separator + 1).trim();
    if (!process.env[name]) process.env[name] = value;
  }
} catch {}
|
||
|
||
import { getDb, closeDb } from "../src/lib/db/index";
|
||
import { plants } from "../src/lib/db/schema";
|
||
import { createClient } from "@libsql/client";
|
||
import { sql } from "drizzle-orm";
|
||
|
||
/** MediaWiki API endpoint that every lookup and search request is sent to. */
const WIKI_API = "https://en.wikipedia.org/w/api.php";

/** Value sent in the `User-Agent` header of each Wikipedia request. */
const UA = "PlantHealthKB/1.0 (plant-images)";

/** Pause, in milliseconds, inserted after each plant is processed. */
const DELAY_MS = 500;

/** Number of pending image updates that triggers a batched DB write. */
const BATCH_SIZE = 50;
|
||
|
||
/**
 * Direct page lookup by title — more reliable for known scientific names.
 *
 * Asks the MediaWiki `pageimages` property for a 400px thumbnail of `title`
 * and returns the thumbnail URL of the first returned page that has one.
 *
 * Redirects are resolved (`redirects=1`): many scientific names exist on
 * Wikipedia only as redirects to the common-name article, and without this
 * flag the API describes the redirect stub itself, which carries no image.
 *
 * Up to three attempts are made. HTTP 429 waits 3s, 6s, then 12s before the
 * next attempt; a thrown error (network failure, invalid JSON) waits 2s.
 * Any other non-OK status gives up immediately.
 *
 * @param title - Wikipedia page title to look up.
 * @returns The thumbnail URL, or `null` when the title is blank, no returned
 *          page has a thumbnail, or every attempt failed.
 */
async function directPageLookup(title: string): Promise<string | null> {
  // A blank title can never match a page; skip the network round-trip.
  if (!title || !title.trim()) return null;

  const params = new URLSearchParams({
    action: "query",
    titles: title,
    // Follow redirects so "Solanum lycopersicum" resolves to "Tomato", etc.
    redirects: "1",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });

      if (res.status === 429) {
        // Rate limited: exponential backoff, then retry.
        await new Promise((r) => setTimeout(r, 3000 * 2 ** attempt));
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { pages?: Record<string, { thumbnail?: { source: string }; missing?: boolean }> };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;

      for (const page of Object.values(pages)) {
        if (!page.missing && page.thumbnail?.source) return page.thumbnail.source;
      }
      return null;
    } catch {
      // Transient failure: brief pause before the next attempt.
      await new Promise((r) => setTimeout(r, 2000));
    }
  }

  return null;
}
|
||
|
||
/** Resolves after `ms` milliseconds. */
function pause(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

/**
 * Writes all pending image URLs to the `plants` table in one batch, logs the
 * count, and empties `pending` in place. Does nothing when `pending` is empty.
 */
async function flushImageUpdates(
  client: ReturnType<typeof createClient>,
  pending: { id: string; url: string }[],
): Promise<void> {
  if (pending.length === 0) return;

  await client.batch(
    pending.map((u) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(` → Flushed ${pending.length} to DB`);
  pending.length = 0;
}

/**
 * Runs a Wikipedia full-text search for `searchTerm` (top 3 hits) and returns
 * the thumbnail URL of the first hit that has one, or `null`.
 *
 * Up to three attempts: HTTP 429 on the search request backs off 3s/6s/12s,
 * a thrown error anywhere in the attempt waits 2s and restarts the search.
 * Any other non-OK search response, or an empty result list, yields `null`.
 */
async function findThumbnailViaSearch(searchTerm: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: searchTerm,
    srlimit: "3",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await pause(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string; pageid: number }> };
      };
      const hits = data?.query?.search ?? [];

      // Walk the hits in rank order until one of them has a thumbnail.
      for (const hit of hits) {
        const pageParams = new URLSearchParams({
          action: "query",
          pageids: String(hit.pageid),
          prop: "pageimages",
          pithumbsize: "400",
          format: "json",
          origin: "*",
        });
        const pageRes = await fetch(`${WIKI_API}?${pageParams}`, {
          headers: { "User-Agent": UA },
        });
        if (!pageRes.ok) continue;

        const pageData = (await pageRes.json()) as {
          query?: { pages?: Record<string, { thumbnail?: { source: string } }> };
        };
        const pages = pageData?.query?.pages;
        if (!pages) continue;

        for (const page of Object.values(pages)) {
          if (page.thumbnail?.source) return page.thumbnail.source;
        }
      }
      return null;
    } catch {
      await pause(2000);
    }
  }

  return null;
}

/**
 * Script entry point: fills `image_url` for every plant that lacks one.
 *
 * Phase 1 tries direct page lookups (scientific name, then common name, then
 * genus) and writes hits in batches of `BATCH_SIZE`. Phase 2 re-queries the
 * plants still missing an image and tries a full-text search for each.
 * Finally it prints a summary and, if any plants remain without an image,
 * writes `.plant-image-review-needed.md` next to this script.
 *
 * Both database handles are released in a `finally` block, so they are closed
 * even when a query or write fails part-way through.
 *
 * @throws Error if `DATABASE_URL` is not set when updates need to be written.
 */
async function main(): Promise<void> {
  console.log("\n🌿 Fetching plant images from Wikipedia\n");

  const db = getDb();
  let rawClient: ReturnType<typeof createClient> | undefined;

  try {
    const allPlants = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();

    console.log(`📋 ${allPlants.length} plants need images\n`);

    if (allPlants.length === 0) {
      console.log("✅ All plants already have images!\n");
      return;
    }

    // Fail with a clear message instead of handing `undefined` to the client.
    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error("DATABASE_URL is not set; cannot write image updates");
    }
    const client = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });
    rawClient = client;

    let found = 0;
    const updates: { id: string; url: string }[] = [];

    // Phase 1: Try direct page lookup by scientific name (most accurate)
    console.log("─── Phase 1: Direct page lookup ───\n");

    for (let i = 0; i < allPlants.length; i++) {
      const plant = allPlants[i];
      // Strip hybrid marks, quotes and a "spp." suffix before using the name as a title.
      const sciName = plant.scientificName
        .replace(/[×'"]/g, "")
        .replace(/\s*spp\.?\s*/i, "")
        .trim();

      process.stdout.write(
        ` [${String(i + 1).padStart(3)}/${allPlants.length}] ${plant.commonName.padEnd(30)} `,
      );

      let url: string | null = null;

      // Try scientific name first (placeholder names are not worth a request)
      if (sciName && sciName !== "Unknown" && sciName !== "Various") {
        url = await directPageLookup(sciName);
      }

      // Try common name if scientific name didn't work
      if (!url) {
        url = await directPageLookup(plant.commonName);
      }

      // Try genus name (first word of the scientific name)
      if (!url && sciName) {
        const genus = sciName.split(/\s+/)[0];
        if (genus && genus.length > 3) {
          url = await directPageLookup(genus);
        }
      }

      if (url) {
        updates.push({ id: plant.id, url });
        found++;
        process.stdout.write("✅\n");
      } else {
        process.stdout.write("⏭️\n");
      }

      // Flush to DB in batches
      if (updates.length >= BATCH_SIZE) {
        await flushImageUpdates(client, updates);
      }

      await pause(DELAY_MS);
    }

    // Flush remaining
    await flushImageUpdates(client, updates);

    console.log(`\n✅ Phase 1 done: ${found}/${allPlants.length} plants got images\n`);

    // Phase 2: Try remaining via search API
    const stillMissing = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();

    if (stillMissing.length > 0) {
      console.log(`─── Phase 2: Search API for ${stillMissing.length} remaining ───\n`);

      for (let i = 0; i < stillMissing.length; i++) {
        const plant = stillMissing[i];
        const sciName = plant.scientificName.replace(/[×'"]/g, "").trim();

        process.stdout.write(
          ` [${String(i + 1).padStart(3)}/${stillMissing.length}] ${plant.commonName.padEnd(30)} `,
        );

        // Search with scientific name plus common name
        const url = await findThumbnailViaSearch(`${sciName} ${plant.commonName}`);

        if (url) {
          await client.execute({
            sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
            args: [url, plant.id],
          });
          found++;
          process.stdout.write("✅\n");
        } else {
          process.stdout.write("❌\n");
        }

        await pause(DELAY_MS);
      }
    }

    // Final count
    const everyPlant = await db
      .select({ id: plants.id, commonName: plants.commonName, imageUrl: plants.imageUrl })
      .from(plants)
      .all();
    const withImg = everyPlant.filter((p) => p.imageUrl);
    const withoutImg = everyPlant.filter((p) => !p.imageUrl);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 FINAL: ${everyPlant.length} plants`);
    console.log(` With images: ${withImg.length}`);
    console.log(` Missing images: ${withoutImg.length}`);

    if (withoutImg.length > 0) {
      console.log(`\n📝 Plants still needing images:`);
      for (const p of withoutImg) {
        console.log(` ❌ ${p.id}: ${p.commonName}`);
      }

      // Save to file for reference
      const reportPath = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images — Still Missing\n\n";
      report += `Generated: ${new Date().toISOString()}\n\n`;
      report += `## 🚫 Plants without images (${withoutImg.length})\n\n`;
      for (const p of withoutImg) {
        report += `- **${p.commonName}** (\`${p.id}\`)\n`;
      }
      writeFileSync(reportPath, report, "utf-8");
      console.log(` 📝 Review report: ${reportPath}`);
    } else {
      console.log("\n✅ All plants now have images!");
    }
  } finally {
    // Release both handles on every exit path, including thrown errors.
    rawClient?.close();
    closeDb();
  }
}
|
||
|
||
/** Logs an unhandled failure from `main` and terminates with exit code 1. */
function reportFatalAndExit(failure: unknown): never {
  console.error("\n❌", failure);
  process.exit(1);
}

// Run the script; any rejection from main() ends the process with a failure code.
main().catch(reportFatalAndExit);
|