re-init
This commit is contained in:
308
scripts/fill-plant-images.ts
Normal file
308
scripts/fill-plant-images.ts
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env node
/**
 * fill-plant-images.ts — Fetch plant images from Wikipedia for plants missing them.
 *
 * Uses the Wikipedia API to search for the plant's scientific name
 * and grab the page thumbnail.
 *
 * Usage: cd apps/web && npx tsx scripts/fill-plant-images.ts
 */
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Load env: seed process.env from ../.env.development (relative to this
// script). Variables that already hold a non-empty value are left untouched.
// NOTE(review): under ESM evaluation order the `import` statements further
// down run before this block — confirm the db module reads its settings
// lazily (at getDb() time) rather than at import time.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split(/\r?\n/)) {
    const trimmed = line.trim();
    // Skip blank lines and comments.
    if (!trimmed || trimmed.startsWith("#")) continue;

    // Lines without `=` (or with an empty key) are not assignments.
    const eqIdx = trimmed.indexOf("=");
    if (eqIdx <= 0) continue;

    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();

    // `KEY="value"` / `KEY='value'`: drop one pair of matching quotes so the
    // quotes do not become part of the value.
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // A missing env file is expected (best-effort load); anything else
  // (permissions, I/O errors) is worth surfacing but must not abort the script.
  const code = err instanceof Error && "code" in err ? (err as { code?: unknown }).code : undefined;
  if (code !== "ENOENT") {
    console.warn(`⚠️ Could not load ${envPath}:`, err);
  }
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { plants } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
/** Endpoint of the English Wikipedia Action API. */
const WIKI_API = "https://en.wikipedia.org/w/api.php";

/** User-Agent header value sent with every Wikipedia request. */
const UA = "PlantHealthKB/1.0 (plant-images)";

/** Pause, in milliseconds, between consecutive plants. */
const DELAY_MS = 500;

/** Number of pending image updates that triggers a batched DB write. */
const BATCH_SIZE = 50;
|
||||
|
||||
/**
 * Direct page lookup by title — more reliable for known scientific names.
 *
 * Requests the `pageimages` property for `title` from the Wikipedia API and
 * returns the first thumbnail found. Redirects are followed, so a title that
 * is only a redirect page still yields the thumbnail of its target article.
 *
 * HTTP 429 responses are retried with exponential backoff (3s, 6s, 12s) and
 * thrown network/parse errors are retried after 2s, for at most 3 attempts.
 *
 * @param title Wikipedia page title to look up.
 * @returns The thumbnail URL, or `null` when the title is blank, the page is
 *   missing or has no thumbnail, the response is not OK, or all attempts fail.
 */
async function directPageLookup(title: string): Promise<string | null> {
  const pageTitle = (title ?? "").trim();
  // A blank title can never resolve to a page; skip the network round-trip.
  if (!pageTitle) return null;

  const params = new URLSearchParams({
    action: "query",
    titles: pageTitle,
    // Without this, a redirect page is returned as-is and carries no thumbnail.
    redirects: "1",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
    origin: "*",
  });
  const requestUrl = `${WIKI_API}?${params.toString()}`;

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(requestUrl, {
        headers: { "User-Agent": UA },
      });

      if (res.status === 429) {
        // Rate limited: back off exponentially before the next attempt.
        await new Promise((r) => setTimeout(r, 3000 * 2 ** attempt));
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { pages?: Record<string, { thumbnail?: { source: string }; missing?: unknown }> };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;

      for (const page of Object.values(pages)) {
        // The legacy JSON format flags missing pages with `missing: ""`, which
        // is falsy — test for the key's presence rather than its truthiness.
        if ("missing" in page) continue;
        if (page.thumbnail?.source) return page.thumbnail.source;
      }
      return null;
    } catch {
      // Network or JSON failure: short pause, then retry.
      await new Promise((r) => setTimeout(r, 2000));
    }
  }

  return null;
}
|
||||
|
||||
/** Resolves after `ms` milliseconds; used to pace requests and retries. */
function sleep(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

/** Scientific-name values that are never used as a lookup or search term. */
const SKIPPED_SCIENTIFIC_NAMES = new Set(["Unknown", "Various"]);

/**
 * Cleans a scientific name for use as a Wikipedia title or search term:
 * strips `×` and quote characters, removes a standalone `spp`/`spp.` token,
 * collapses runs of whitespace and trims. Returns "" for null/undefined input
 * and for the skipped placeholder names.
 */
function normalizeScientificName(raw: string | null | undefined): string {
  const cleaned = (raw ?? "")
    .replace(/[×'"]/g, "")
    // Whole-token match only, so neighbouring words are not glued together.
    .replace(/\bspp\.?(?=\s|$)/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
  return SKIPPED_SCIENTIFIC_NAMES.has(cleaned) ? "" : cleaned;
}

/**
 * Writes all pending image updates in one batched write and then empties
 * `pending` in place. Does nothing when `pending` is empty.
 */
async function flushImageUpdates(
  client: ReturnType<typeof createClient>,
  pending: { id: string; url: string }[],
): Promise<void> {
  if (pending.length === 0) return;
  await client.batch(
    pending.map((u) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [u.url, u.id],
    })),
    "write",
  );
  console.log(` → Flushed ${pending.length} to DB`);
  pending.length = 0;
}

/**
 * Runs a Wikipedia full-text search (top 3 hits) and returns the thumbnail of
 * the first hit that has one, or `null`. HTTP 429 is retried with exponential
 * backoff and thrown errors after 2s, for at most 3 attempts. Thumbnails are
 * resolved through `directPageLookup`, so they share its retry handling.
 */
async function searchWikipediaThumbnail(searchTerm: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: searchTerm,
    srlimit: "3",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params.toString()}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await sleep(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string; pageid: number }> };
      };
      for (const hit of data?.query?.search ?? []) {
        const url = await directPageLookup(hit.title);
        if (url) return url;
      }
      return null;
    } catch {
      await sleep(2000);
    }
  }
  return null;
}

/**
 * Script entry point.
 *
 * 1. Selects every plant whose `image_url` is NULL or empty.
 * 2. Phase 1: looks each one up directly by scientific name, then common
 *    name, then genus, writing hits to the DB in batches of `BATCH_SIZE`.
 * 3. Phase 2: re-selects the plants still missing an image and tries the
 *    Wikipedia search API for each.
 * 4. Prints a summary and, if any plant is still missing an image, writes a
 *    Markdown report next to this script.
 *
 * Database handles are closed on every exit path, including errors.
 *
 * @throws Error if `DATABASE_URL` is not set when updates need to be written;
 *   DB and file-system errors propagate to the caller.
 */
async function main(): Promise<void> {
  console.log("\n🌿 Fetching plant images from Wikipedia\n");

  const db = getDb();
  let rawClient: ReturnType<typeof createClient> | undefined;

  try {
    const allPlants = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();

    console.log(`📋 ${allPlants.length} plants need images\n`);

    if (allPlants.length === 0) {
      console.log("✅ All plants already have images!\n");
      return;
    }

    const databaseUrl = process.env.DATABASE_URL;
    if (!databaseUrl) {
      throw new Error("DATABASE_URL is not set; cannot write image URLs to the database");
    }
    const client = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });
    rawClient = client;

    let found = 0;
    const updates: { id: string; url: string }[] = [];

    // Phase 1: Try direct page lookup by scientific name (most accurate)
    console.log("─── Phase 1: Direct page lookup ───\n");

    for (const [i, plant] of allPlants.entries()) {
      const sciName = normalizeScientificName(plant.scientificName);

      process.stdout.write(
        ` [${String(i + 1).padStart(3)}/${allPlants.length}] ${plant.commonName.padEnd(30)} `,
      );

      let url: string | null = null;

      // Try scientific name first
      if (sciName) {
        url = await directPageLookup(sciName);
      }

      // Try common name if scientific name didn't work
      if (!url) {
        url = await directPageLookup(plant.commonName);
      }

      // Try genus name (skipped when it equals the name already tried)
      if (!url && sciName) {
        const genus = sciName.split(" ")[0];
        if (genus && genus.length > 3 && genus !== sciName) {
          url = await directPageLookup(genus);
        }
      }

      if (url) {
        updates.push({ id: plant.id, url });
        found++;
        process.stdout.write("✅\n");
      } else {
        process.stdout.write("⏭️\n");
      }

      // Flush to DB in batches
      if (updates.length >= BATCH_SIZE) {
        await flushImageUpdates(client, updates);
      }

      await sleep(DELAY_MS);
    }

    // Flush remaining
    await flushImageUpdates(client, updates);

    console.log(`\n✅ Phase 1 done: ${found}/${allPlants.length} plants got images\n`);

    // Phase 2: Try remaining via search API
    const stillMissing = await db
      .select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
      .from(plants)
      .where(sql`(image_url IS NULL OR image_url = '')`)
      .all();

    if (stillMissing.length > 0) {
      console.log(`─── Phase 2: Search API for ${stillMissing.length} remaining ───\n`);

      for (const [i, plant] of stillMissing.entries()) {
        const sciName = normalizeScientificName(plant.scientificName);

        process.stdout.write(
          ` [${String(i + 1).padStart(3)}/${stillMissing.length}] ${plant.commonName.padEnd(30)} `,
        );

        // Search with scientific name plus common name
        const searchTerm = `${sciName} ${plant.commonName}`.trim();
        const url = await searchWikipediaThumbnail(searchTerm);

        if (url) {
          await client.execute({
            sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
            args: [url, plant.id],
          });
          found++;
          process.stdout.write("✅\n");
        } else {
          process.stdout.write("❌\n");
        }

        await sleep(DELAY_MS);
      }
    }

    // Final count
    const everyPlant = await db
      .select({ id: plants.id, commonName: plants.commonName, imageUrl: plants.imageUrl })
      .from(plants)
      .all();
    const withImg = everyPlant.filter((p) => p.imageUrl);
    const withoutImg = everyPlant.filter((p) => !p.imageUrl);

    console.log(`\n${"═".repeat(50)}`);
    console.log(`📊 FINAL: ${everyPlant.length} plants`);
    console.log(` With images: ${withImg.length}`);
    console.log(` Missing images: ${withoutImg.length}`);

    if (withoutImg.length > 0) {
      console.log(`\n📝 Plants still needing images:`);
      withoutImg.forEach((p) => console.log(` ❌ ${p.id}: ${p.commonName}`));

      // Save to file for reference
      const reportPath = resolve(__dirname, ".plant-image-review-needed.md");
      let report = "# Plant Images — Still Missing\n\n";
      report += `Generated: ${new Date().toISOString()}\n\n`;
      report += `## 🚫 Plants without images (${withoutImg.length})\n\n`;
      for (const p of withoutImg) {
        report += `- **${p.commonName}** (\`${p.id}\`)\n`;
      }
      writeFileSync(reportPath, report, "utf-8");
      console.log(` 📝 Review report: ${reportPath}`);
    } else {
      console.log("\n✅ All plants now have images!");
    }
  } finally {
    // Release both handles on success, early return, and error alike.
    rawClient?.close();
    closeDb();
  }
}
|
||||
|
||||
// Run the script; on any unhandled failure, log it and exit non-zero.
main().catch((err: unknown) => {
  console.error("\n❌", err);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user