search, db integration

This commit is contained in:
2026-06-05 21:47:00 -04:00
parent 365d1281dd
commit 71d7a9d6f0
25 changed files with 1573 additions and 244 deletions

View File

@@ -0,0 +1,23 @@
import "dotenv/config";
import { createClient } from "@libsql/client";
/**
 * One-off migration: adds the `image_url` column to `diseases`, backfills any
 * NULLs with the empty string, and records the migration in
 * `__drizzle_migrations`.
 *
 * The three statements are sent as one "write" batch, which the libSQL client
 * executes as a single transaction, so a failure part-way through does not
 * leave the column added but the migration unrecorded (which would make a
 * rerun fail on the ALTER).
 *
 * @throws Error when DATABASE_URL is not set; otherwise rejects with whatever
 *   error the client reports.
 */
async function main(): Promise<void> {
  const url = process.env.DATABASE_URL;
  if (!url) {
    throw new Error("DATABASE_URL is not set");
  }
  // The token is passed through as-is; it may be legitimately absent.
  const db = createClient({ url, authToken: process.env.DATABASE_TOKEN });
  try {
    console.log("Applying migration: add image_url to diseases...");
    await db.batch(
      [
        "ALTER TABLE diseases ADD COLUMN image_url TEXT DEFAULT ''",
        "UPDATE diseases SET image_url = '' WHERE image_url IS NULL",
        // Mark migration as applied.
        // NOTE(review): this row is hand-written; confirm that the hash and
        // the datetime('now') text value match the format the project's
        // Drizzle migrator expects in this table.
        "INSERT INTO __drizzle_migrations (hash, created_at) VALUES ('0001_add-disease-images', datetime('now'))",
      ],
      "write",
    );
    console.log("Migration applied successfully.");
  } finally {
    // Always release the connection, including when a statement throws.
    db.close();
  }
}

main().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the shell / CI instead of exiting with status 0.
  process.exitCode = 1;
});

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env node
/**
* Fetch disease images from Wikipedia/Wikimedia Commons.
*
* For each disease in the database, searches Wikipedia for its page
* and retrieves the main infobox image.
*
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
*
* Rate-limited to at most 1 request per 350ms (see MIN_DELAY_MS) to be respectful.
*/
import "dotenv/config";
import { createClient } from "@libsql/client";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { diseases } from "../src/lib/db/schema";
// Endpoint of the English Wikipedia MediaWiki API (page search + page images).
const WIKI_API = "https://en.wikipedia.org/w/api.php";
// Endpoint of the Wikimedia Commons MediaWiki API (fallback image lookup).
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
// Minimum gap, in milliseconds, that rateLimit() enforces between calls.
const MIN_DELAY_MS = 350; // Be respectful
// Date.now() value recorded by the most recent rateLimit() call; 0 until the first call.
let lastCall = 0;
/**
 * Throttle gate for outbound requests.
 *
 * Resolves immediately when at least MIN_DELAY_MS has passed since the
 * previous call was stamped; otherwise sleeps for the remainder. The
 * timestamp is refreshed after any wait, just before returning.
 */
async function rateLimit() {
  const remainingMs = MIN_DELAY_MS - (Date.now() - lastCall);
  if (remainingMs > 0) {
    await new Promise((resolve) => setTimeout(resolve, remainingMs));
  }
  lastCall = Date.now();
}
/** The top search hit returned by searchWikipedia(). */
interface WikiSearchResult {
/** Page title, copied from the search hit's `title` field. */
title: string;
/** Page id, copied from the search hit's `pageid` field. */
pageid: number;
}
/**
 * Finds the top English Wikipedia search hit for a term.
 *
 * Best-effort: HTTP errors, network failures and malformed payloads are
 * reported with a warning and yield `null` rather than throwing, so a single
 * bad lookup does not abort the caller.
 *
 * @param term Free-text search phrase (URL-encoded here).
 * @returns The first hit's title and page id, or `null` when there is no
 *   usable hit.
 */
async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Surface throttling / server errors instead of silently treating them as "no result".
      console.warn(`  warning: Wikipedia search for "${term}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { search?: Array<{ title?: unknown; pageid?: unknown }> };
    };
    const top = data?.query?.search?.[0];
    // Validate the fields before promising them to callers as string/number.
    if (top && typeof top.title === "string" && typeof top.pageid === "number") {
      return { title: top.title, pageid: top.pageid };
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  warning: Wikipedia search for "${term}" failed: ${reason}`);
  }
  return null;
}
/**
 * Fetches the thumbnail URL (requested at 400px) that the `pageimages` API
 * reports for a Wikipedia page.
 *
 * Best-effort: HTTP errors, network failures and malformed payloads are
 * reported with a warning and yield `null` rather than throwing.
 *
 * @param title Exact page title (URL-encoded here).
 * @returns The thumbnail URL of the first page in the response, or `null`
 *   when the page has no thumbnail or the lookup failed.
 */
async function getPageImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Surface throttling / server errors instead of silently treating them as "no image".
      console.warn(`  warning: page image lookup for "${title}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { pages?: Record<string, { thumbnail?: { source?: unknown } }> };
    };
    const pages = data?.query?.pages;
    // The response is keyed by page id; a single title was requested, so take the first entry.
    const firstPage = pages ? Object.values(pages)[0] : undefined;
    const source = firstPage?.thumbnail?.source;
    if (typeof source === "string" && source.length > 0) {
      return source;
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  warning: page image lookup for "${title}" failed: ${reason}`);
  }
  return null;
}
/**
 * Searches Wikimedia Commons for a file matching the term and returns an
 * image URL for the first hit that resolves to one.
 *
 * The search is restricted to the File namespace (`srnamespace=6`): the
 * follow-up `imageinfo` lookup in getCommonsImage() only yields data for
 * `File:` titles, whereas the API's default namespace returns gallery pages.
 *
 * Best-effort: HTTP errors, network failures and malformed payloads are
 * reported with a warning and yield `null` rather than throwing.
 *
 * @param term Free-text search phrase (URL-encoded here).
 * @returns An image URL, or `null` when none of the top hits produced one.
 */
async function searchCommons(term: string): Promise<string | null> {
  // Number of top hits we are willing to resolve; also sent as srlimit so we
  // do not request results that are never inspected.
  const maxCandidates = 2;
  await rateLimit();
  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&srnamespace=6&format=json&srlimit=${maxCandidates}&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Surface throttling / server errors instead of silently treating them as "no result".
      console.warn(`  warning: Commons search for "${term}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { search?: Array<{ title?: unknown }> };
    };
    const hits = data?.query?.search;
    if (!Array.isArray(hits)) {
      return null;
    }
    // Try to get a thumbnail for the best matches, in ranking order.
    for (const hit of hits.slice(0, maxCandidates)) {
      if (typeof hit?.title !== "string") continue;
      const imgUrl = await getCommonsImage(hit.title);
      if (imgUrl) return imgUrl;
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  warning: Commons search for "${term}" failed: ${reason}`);
  }
  return null;
}
/**
 * Resolves a Wikimedia Commons title to an image URL via `prop=imageinfo`.
 *
 * Prefers the scaled thumbnail (`thumburl`, requested at 400px wide) and
 * falls back to the original file `url` when no thumbnail is reported.
 *
 * Best-effort: HTTP errors, network failures and malformed payloads are
 * reported with a warning and yield `null` rather than throwing.
 *
 * @param title Commons page title (URL-encoded here).
 * @returns The thumbnail or original URL, or `null` when the response carries
 *   no image info or the lookup failed.
 */
async function getCommonsImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Surface throttling / server errors instead of silently treating them as "no image".
      console.warn(`  warning: Commons image lookup for "${title}" returned HTTP ${res.status}`);
      return null;
    }
    const data = (await res.json()) as {
      query?: {
        pages?: Record<string, { imageinfo?: Array<{ thumburl?: unknown; url?: unknown }> }>;
      };
    };
    const pages = data?.query?.pages;
    // The response is keyed by page id; a single title was requested, so take the first entry.
    const info = pages ? Object.values(pages)[0]?.imageinfo?.[0] : undefined;
    for (const candidate of [info?.thumburl, info?.url]) {
      if (typeof candidate === "string" && candidate.length > 0) {
        return candidate;
      }
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  warning: Commons image lookup for "${title}" failed: ${reason}`);
  }
  return null;
}
/**
 * Builds the search phrases to try for one disease, most specific first.
 *
 * With a scientific name: "<sci> <name>", "<name>", "<name> (<sci>)".
 * Without one: just "<name>" (avoids searching for "<name> (null)").
 * Duplicates and phrases shorter than 3 characters are dropped so no
 * rate-limited request is spent on them.
 */
function buildSearchTerms(name: string, sciName: string | null | undefined): string[] {
  const sci = sciName?.trim() ?? "";
  const candidates = sci
    ? [`${sci} ${name}`.trim(), name, `${name} (${sci})`.trim()]
    : [name];
  return [...new Set(candidates)].filter((term) => term.length >= 3);
}

/**
 * Tries each phrase in order: the Wikipedia page image first, then a direct
 * Commons search. Returns the first URL found, or null when every phrase misses.
 */
async function findImageUrl(terms: readonly string[]): Promise<string | null> {
  for (const term of terms) {
    const page = await searchWikipedia(term);
    const hit = (page ? await getPageImage(page.title) : null) || (await searchCommons(term));
    if (hit) return hit;
  }
  return null;
}

/**
 * Entry point: for every disease row whose `image_url` is NULL or empty,
 * looks up an image and writes the URL back in batches of 50 updates.
 *
 * Only `https://` URLs are stored. Both database handles are closed on every
 * exit path, including failures and the "nothing to do" early return.
 *
 * @throws Error when DATABASE_URL is not set; otherwise rejects with whatever
 *   the database client reports.
 */
async function main(): Promise<void> {
  console.log("🔍 Fetching disease images from Wikipedia\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set");
  }

  const db = getDb();
  let rawClient: ReturnType<typeof createClient> | undefined;
  try {
    // Raw client is used for the batched UPDATEs; the token may be legitimately absent.
    const client = createClient({ url: databaseUrl, authToken: process.env.DATABASE_TOKEN });
    rawClient = client;

    // Get all diseases without images
    const rows = await db
      .select({
        id: diseases.id,
        name: diseases.name,
        sciName: diseases.scientificName,
        plantId: diseases.plantId,
      })
      .from(diseases)
      .where(sql`image_url IS NULL OR image_url = ''`);
    console.log(`📋 ${rows.length} diseases missing images`);
    if (rows.length === 0) {
      console.log("✅ All diseases already have images!");
      return; // the finally block below closes the connections
    }

    const BATCH_SIZE = 50;
    let found = 0;
    let skipped = 0;
    let processed = 0;
    let pending: Array<{ sql: string; args: [string, (typeof rows)[number]["id"]] }> = [];

    // Writes all queued updates in one "write" batch; returns how many were written.
    const writePending = async (): Promise<number> => {
      const count = pending.length;
      if (count > 0) {
        await client.batch(pending, "write");
        pending = [];
      }
      return count;
    };

    for (const row of rows) {
      processed++;

      let imageUrl = await findImageUrl(buildSearchTerms(row.name, row.sciName));
      // Never store non-HTTPS URLs.
      if (imageUrl && !imageUrl.startsWith("https://")) {
        imageUrl = null;
      }

      if (imageUrl) {
        pending.push({
          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
          args: [imageUrl, row.id],
        });
        found++;
      } else {
        skipped++;
      }

      // Progress is reported every 100 rows regardless of whether this row matched.
      if (processed % 100 === 0) {
        process.stdout.write(` 🔍 found ${found} so far...\n`);
      }

      // Flush batch
      if (pending.length >= BATCH_SIZE) {
        const written = await writePending();
        process.stdout.write(` 📦 flushed ${written} updates (${processed}/${rows.length})\n`);
      }
    }

    // Flush remaining
    const remaining = await writePending();
    if (remaining > 0) {
      process.stdout.write(` 📦 final flush: ${remaining} updates\n`);
    }

    console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
  } finally {
    rawClient?.close();
    closeDb();
  }
}
main().catch((err) => { console.error("❌ Fatal:", err); process.exit(1); });

View File

@@ -0,0 +1,62 @@
/**
* Quick test of Wikipedia image API for disease search terms.
* Run: cd apps/web && npx tsx scripts/test-wiki-images.ts
*/
// Endpoint of the English Wikipedia MediaWiki API, shared by search() and getImg().
const API = "https://en.wikipedia.org/w/api.php";
/**
 * Runs a single-result full-text search against the Wikipedia API and
 * returns the parsed JSON body. Network and parse errors propagate to the caller.
 */
async function search(term: string) {
  const encodedTerm = encodeURIComponent(term);
  const response = await fetch(
    `${API}?action=query&list=search&srsearch=${encodedTerm}&format=json&srlimit=1&origin=*`,
    { headers: { "User-Agent": "PlantHealthKB/1.0" } },
  );
  const body = await response.json();
  return body as { query?: { search?: Array<{ title: string; pageid: number }> } };
}
/**
 * Requests the `pageimages` thumbnail (400px) for a page title and returns
 * the parsed JSON body. Network and parse errors propagate to the caller.
 */
async function getImg(title: string) {
  const encodedTitle = encodeURIComponent(title);
  const response = await fetch(
    `${API}?action=query&titles=${encodedTitle}&prop=pageimages&format=json&pithumbsize=400&origin=*`,
    { headers: { "User-Agent": "PlantHealthKB/1.0" } },
  );
  const body = await response.json();
  return body as { query?: { pages?: Record<string, { thumbnail?: { source: string } }> } };
}
/**
 * Looks up one search term and prints a single result line: the term, the
 * matched page title and its thumbnail URL (or a NO PAGE / NO PAGES / NO IMG
 * marker).
 *
 * The 400ms pause runs on every exit path, so consecutive calls stay spaced
 * out even when a lookup ends early.
 *
 * @param term Search phrase to test.
 */
async function testOne(term: string): Promise<void> {
  try {
    const page = (await search(term))?.query?.search?.[0];
    if (!page) {
      console.log(`${term.padEnd(40)} → NO PAGE`);
      return;
    }
    const pages = (await getImg(page.title))?.query?.pages;
    if (!pages) {
      console.log(term, '→ NO PAGES');
      return;
    }
    // The response is keyed by page id; only one title was requested.
    const thumb = Object.values(pages)[0]?.thumbnail?.source;
    console.log(`${term.padEnd(40)}${page.title.padEnd(50)}${thumb ?? "NO IMG"}`);
  } finally {
    // Politeness delay before the next request, applied on every path.
    await new Promise((resolve) => setTimeout(resolve, 400));
  }
}
/**
 * Runs testOne() sequentially over a fixed list of sample disease search
 * phrases, printing one result line per phrase.
 */
async function main() {
  const sampleTerms = [
    "Phytophthora infestans Late Blight",
    "Early Blight",
    "Septoria Leaf Spot",
    "Powdery Mildew",
    "Fusarium oxysporum",
    "Citrus Canker",
    "Root Rot Pythium",
    "Downy Mildew Peronospora",
    "Bacterial Leaf Spot Xanthomonas",
    "Apple Scab Venturia inaequalis",
    "Fire Blight Erwinia amylovora",
    "Blossom End Rot",
    "Tomato Mosaic Virus",
    "Rust Puccinia",
    "Black Spot Diplocarpon rosae",
    "Sooty Mold Capnodium",
    "Clubroot Plasmodiophora brassicae",
    "Anthracnose Colletotrichum",
  ];

  console.log("Searching Wikipedia for disease images...\n");

  // Strictly one at a time: each testOne() call includes its own delay.
  for (let idx = 0; idx < sampleTerms.length; idx += 1) {
    await testOne(sampleTerms[idx]);
  }
}

main().catch(console.error);