This commit is contained in:
2026-06-08 16:42:04 -04:00
commit 8bda14ab63
179 changed files with 48104 additions and 0 deletions

View File

@@ -0,0 +1,308 @@
#!/usr/bin/env node
/**
* fill-plant-images.ts — Fetch plant images from Wikipedia for plants missing them.
*
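* Phase 1 asks the Wikipedia API for the page thumbnail by exact title
* (scientific name, then common name, then genus). Phase 2 falls back to
* the Wikipedia search API for any plants that are still missing an image.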
*
* Usage: cd apps/web && npx tsx scripts/fill-plant-images.ts
*/
import { readFileSync, writeFileSync } from "fs";
import { resolve } from "path";
/**
 * Parse the text of a dotenv-style file into `[key, value]` pairs, in file order.
 *
 * Blank lines, lines starting with `#`, and lines with no key before the first
 * `=` are skipped. Keys and values are trimmed. One pair of matching single or
 * double quotes wrapped around a value is removed, so `KEY="value"` yields
 * `value` rather than `"value"`.
 *
 * @param content Raw file contents.
 * @returns The parsed pairs; duplicates are kept so the caller decides precedence.
 */
function parseEnvFile(content: string): Array<[string, string]> {
  const pairs: Array<[string, string]> = [];
  for (const line of content.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    const eqIdx = trimmed.indexOf("=");
    if (eqIdx <= 0) continue;
    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }
    pairs.push([key, val]);
  }
  return pairs;
}

// Load env (best effort): copy variables from ../.env.development into
// process.env without overriding anything that already has a non-empty value.
try {
  const envPath = resolve(__dirname, "../.env.development");
  for (const [key, val] of parseEnvFile(readFileSync(envPath, "utf-8"))) {
    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // A missing file is fine (the variables may come from the real environment);
  // any other failure is reported instead of being silently dropped.
  const code = (err as { code?: unknown } | null)?.code;
  if (code !== "ENOENT") {
    console.warn("⚠️  Could not load ../.env.development:", err);
  }
}
import { getDb, closeDb } from "../src/lib/db/index";
import { plants } from "../src/lib/db/schema";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
/** MediaWiki Action API endpoint of English Wikipedia. */
const WIKI_API = "https://en.wikipedia.org/w/api.php";
/** User-Agent header sent with every Wikipedia request. */
const UA = "PlantHealthKB/1.0 (plant-images)";
/** Pause, in milliseconds, inserted after each plant has been processed. */
const DELAY_MS = 500;
/** Number of pending image updates that triggers a batched database write. */
const BATCH_SIZE = 50;

/**
 * Direct page lookup by title — more reliable for known scientific names.
 *
 * Requests the `pageimages` property (400px thumbnail) for the page called
 * `title`. Redirects are resolved (`redirects=1`), so a title that is only a
 * redirect — e.g. a scientific name pointing at the common-name article —
 * yields the thumbnail of the target page instead of nothing.
 *
 * HTTP 429 responses and thrown fetch/parse errors are retried, up to three
 * attempts in total, with a pause between attempts. Any other non-OK status
 * ends the lookup immediately.
 *
 * @param title Wikipedia page title to look up.
 * @returns The thumbnail URL, or `null` when the title is blank, the page is
 *   missing or has no thumbnail, the API answers with a non-OK status, or all
 *   attempts fail.
 */
async function directPageLookup(title: string): Promise<string | null> {
  // Nothing to look up; skip the request entirely.
  if (!title.trim()) return null;

  const maxAttempts = 3;
  const sleep = (ms: number) => new Promise<void>((done) => setTimeout(done, ms));
  const params = new URLSearchParams({
    action: "query",
    titles: title,
    prop: "pageimages",
    pithumbsize: "400",
    redirects: "1",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // No point waiting after the final attempt: we return right afterwards.
    const isLastAttempt = attempt === maxAttempts - 1;
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        // Rate limited: exponential backoff (3s, 6s) before the next attempt.
        if (!isLastAttempt) await sleep(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      const data = (await res.json()) as {
        query?: {
          pages?: Record<string, { thumbnail?: { source: string }; missing?: boolean | string }>;
        };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;
      for (const page of Object.values(pages)) {
        // The default JSON format flags missing pages with `missing: ""`, which
        // is falsy; treat any present value other than `false` as "missing".
        const isMissing = page.missing !== undefined && page.missing !== false;
        if (!isMissing && page.thumbnail?.source) return page.thumbnail.source;
      }
      return null;
    } catch {
      // Network or JSON failure: deliberately best-effort, retry after a short pause.
      if (!isLastAttempt) await sleep(2000);
    }
  }
  return null;
}
/** An image URL waiting to be written to the `plants` row with the given id. */
type PendingImageUpdate = { id: string; url: string };

/** Resolve after `ms` milliseconds. */
function pause(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/**
 * Turn a stored scientific name into a Wikipedia page title candidate:
 * drops hybrid markers and quotes, removes a standalone "spp"/"spp." token
 * without gluing its neighbours together, and collapses whitespace.
 */
function toLookupTitle(scientificName: string): string {
  return scientificName
    .replace(/[×'"]/g, "")
    .replace(/(^|\s)spp\.?(?=\s|$)/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/** Write all pending image URLs in one batch, log the count, and empty the queue. */
async function flushImageUpdates(
  client: ReturnType<typeof createClient>,
  pending: PendingImageUpdate[],
): Promise<void> {
  if (pending.length === 0) return;
  await client.batch(
    pending.map((u) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(` → Flushed ${pending.length} to DB`);
  pending.length = 0;
}

/**
 * Run a Wikipedia full-text search and return the thumbnail of the first of
 * the top three hits that has one, or `null` if none does. HTTP 429 and thrown
 * errors are retried (three attempts in total); other non-OK search responses
 * end the lookup.
 */
async function searchForThumbnail(searchTerm: string): Promise<string | null> {
  const maxAttempts = 3;
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: searchTerm,
    srlimit: "3",
    format: "json",
    origin: "*",
  });
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Skip the wait after the final attempt; we return right afterwards.
    const isLastAttempt = attempt === maxAttempts - 1;
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        if (!isLastAttempt) await pause(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string; pageid: number }> };
      };
      const hits = data?.query?.search ?? [];
      // Walk the hits in ranking order and stop at the first thumbnail.
      for (const hit of hits) {
        const pageParams = new URLSearchParams({
          action: "query",
          pageids: String(hit.pageid),
          prop: "pageimages",
          pithumbsize: "400",
          format: "json",
          origin: "*",
        });
        const pageRes = await fetch(`${WIKI_API}?${pageParams}`, {
          headers: { "User-Agent": UA },
        });
        if (!pageRes.ok) continue;
        const pageData = (await pageRes.json()) as {
          query?: { pages?: Record<string, { thumbnail?: { source: string } }> };
        };
        const pages = pageData?.query?.pages;
        if (!pages) continue;
        for (const page of Object.values(pages)) {
          if (page.thumbnail?.source) return page.thumbnail.source;
        }
      }
      return null;
    } catch {
      // Network or JSON failure: best-effort, retry the whole search after a pause.
      if (!isLastAttempt) await pause(2000);
    }
  }
  return null;
}

/**
 * Fill in `image_url` for every plant that lacks one.
 *
 * Phase 1 looks up each plant by exact page title (scientific name, then
 * common name, then genus) and writes the results in batches of `BATCH_SIZE`.
 * Phase 2 re-reads the plants that are still missing an image and tries the
 * Wikipedia search API, writing each hit immediately. Finally it prints a
 * summary and, if any plants remain without an image, writes a Markdown
 * report next to this script.
 *
 * Both database handles are closed on every exit path, including errors.
 *
 * @throws Error when plants need images but `DATABASE_URL` is not set; errors
 *   from the database layer propagate unchanged.
 */
async function main(): Promise<void> {
  console.log("\n🌿 Fetching plant images from Wikipedia\n");
  const db = getDb();
  let rawClient: ReturnType<typeof createClient> | undefined;
  try {
    const allPlants = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();
    console.log(`📋 ${allPlants.length} plants need images\n`);
    if (allPlants.length === 0) {
      console.log("✅ All plants already have images!\n");
      return;
    }

    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error("DATABASE_URL is not set; cannot write image URLs to the database");
    }
    const client = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });
    rawClient = client;

    let found = 0;
    const updates: PendingImageUpdate[] = [];

    // Phase 1: Try direct page lookup by scientific name (most accurate)
    console.log("─── Phase 1: Direct page lookup ───\n");
    for (const [i, plant] of allPlants.entries()) {
      const sciName = toLookupTitle(plant.scientificName);
      process.stdout.write(
        ` [${String(i + 1).padStart(3)}/${allPlants.length}] ${plant.commonName.padEnd(30)} `,
      );
      let url: string | null = null;
      // Try scientific name first (placeholder names are not worth a request)
      if (sciName && sciName !== "Unknown" && sciName !== "Various") {
        url = await directPageLookup(sciName);
      }
      // Try common name if scientific name didn't work
      if (!url) {
        url = await directPageLookup(plant.commonName);
      }
      // Try genus name, unless it is the very title that was already tried
      if (!url && sciName) {
        const genus = sciName.split(/\s+/)[0];
        if (genus && genus.length > 3 && genus !== sciName) {
          url = await directPageLookup(genus);
        }
      }
      if (url) {
        updates.push({ id: plant.id, url });
        found++;
        process.stdout.write("✅\n");
      } else {
        process.stdout.write("⏭️\n");
      }
      // Flush to DB in batches
      if (updates.length >= BATCH_SIZE) {
        await flushImageUpdates(client, updates);
      }
      await pause(DELAY_MS);
    }
    // Flush remaining
    await flushImageUpdates(client, updates);
    console.log(`\n✅ Phase 1 done: ${found}/${allPlants.length} plants got images\n`);

    // Phase 2: Try remaining via search API
    const stillMissing = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();
    if (stillMissing.length > 0) {
      console.log(`─── Phase 2: Search API for ${stillMissing.length} remaining ───\n`);
      for (const [i, plant] of stillMissing.entries()) {
        const sciName = plant.scientificName.replace(/[×'"]/g, "").trim();
        process.stdout.write(
          ` [${String(i + 1).padStart(3)}/${stillMissing.length}] ${plant.commonName.padEnd(30)} `,
        );
        // Search with scientific name plus common name
        const url = await searchForThumbnail(`${sciName} ${plant.commonName}`);
        if (url) {
          await client.execute({
            sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
            args: [url, plant.id],
          });
          process.stdout.write("✅\n");
        } else {
          process.stdout.write("❌\n");
        }
        await pause(DELAY_MS);
      }
    }

    // Final count
    const final = await db
      .select({ id: plants.id, commonName: plants.commonName, imageUrl: plants.imageUrl })
      .from(plants)
      .all();
    const withImg = final.filter((p) => p.imageUrl);
    const withoutImg = final.filter((p) => !p.imageUrl);
    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 FINAL: ${final.length} plants`);
    console.log(` With images: ${withImg.length}`);
    console.log(` Missing images: ${withoutImg.length}`);
    if (withoutImg.length > 0) {
      console.log(`\n📝 Plants still needing images:`);
      for (const p of withoutImg) {
        console.log(`${p.id}: ${p.commonName}`);
      }
      // Save to file for reference
      const reportPath = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images — Still Missing\n\n";
      report += `Generated: ${new Date().toISOString()}\n\n`;
      report += `## 🚫 Plants without images (${withoutImg.length})\n\n`;
      for (const p of withoutImg) {
        report += `- **${p.commonName}** (\`${p.id}\`)\n`;
      }
      writeFileSync(reportPath, report, "utf-8");
      console.log(` 📝 Review report: ${reportPath}`);
    } else {
      console.log("\n✅ All plants now have images!");
    }
  } finally {
    // Release both handles on every path: early return, success, or error.
    rawClient?.close();
    closeDb();
  }
}
// Script entry point: run main() and, if it rejects, print the error and
// terminate the process with exit code 1.
const exitOnFailure = (reason: unknown): never => {
  console.error("\n❌", reason);
  process.exit(1);
};
main().catch(exitOnFailure);