search, db integration
This commit is contained in:
23
apps/web/scripts/apply-migration.ts
Normal file
23
apps/web/scripts/apply-migration.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import "dotenv/config";
|
||||
import { createClient } from "@libsql/client";
|
||||
|
||||
/**
 * Applies the "add image_url to diseases" migration by hand and records it
 * in the `__drizzle_migrations` table.
 *
 * Connection settings are read from `DATABASE_URL` (required) and
 * `DATABASE_TOKEN` (passed through as-is). The statements run one after
 * another without a transaction, so a failure part-way through leaves the
 * earlier statements applied.
 *
 * @throws Error when `DATABASE_URL` is not set, or when any statement fails
 *   (for example because the column already exists).
 */
async function main(): Promise<void> {
  const url = process.env.DATABASE_URL;
  if (!url) {
    throw new Error("DATABASE_URL is not set");
  }

  const db = createClient({
    url,
    authToken: process.env.DATABASE_TOKEN,
  });

  try {
    console.log("Applying migration: add image_url to diseases...");
    await db.execute("ALTER TABLE diseases ADD COLUMN image_url TEXT DEFAULT ''");
    await db.execute("UPDATE diseases SET image_url = '' WHERE image_url IS NULL");

    // Mark migration as applied
    // NOTE(review): this row stores the migration tag in `hash` and a
    // datetime string in `created_at`. drizzle-orm's own migrator is
    // understood to write a content hash and an epoch-millisecond number
    // there — confirm that it recognises this hand-written row.
    await db.execute(
      "INSERT INTO __drizzle_migrations (hash, created_at) VALUES ('0001_add-disease-images', datetime('now'))",
    );

    console.log("Migration applied successfully.");
  } finally {
    // Release the connection even when a statement above throws.
    db.close();
  }
}

main().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the shell / CI instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
215
apps/web/scripts/scrape-disease-images.ts
Normal file
215
apps/web/scripts/scrape-disease-images.ts
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Fetch disease images from Wikipedia/Wikimedia Commons.
|
||||
*
|
||||
* For each disease in the database, searches Wikipedia for its page
|
||||
* and retrieves the main infobox image.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
|
||||
*
|
||||
* Rate-limited to at most one request every 350ms to be respectful.
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
|
||||
// MediaWiki API endpoints queried by this script.
const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";

// Minimum spacing between two outgoing requests, in milliseconds.
const MIN_DELAY_MS = 350; // Be respectful

// Timestamp (ms since epoch) at which the most recent request was released.
let lastCall = 0;

/**
 * Delays the caller until at least `MIN_DELAY_MS` has passed since the
 * previous call returned, then records the current time as the new
 * reference point. Resolves immediately when enough time has already passed.
 */
async function rateLimit() {
  const remaining = MIN_DELAY_MS - (Date.now() - lastCall);
  if (remaining > 0) {
    await new Promise((resolve) => setTimeout(resolve, remaining));
  }
  lastCall = Date.now();
}
|
||||
|
||||
/** Identifies one page returned by a Wikipedia search. */
interface WikiSearchResult {
  /** Numeric id of the page, as reported by the search API. */
  pageid: number;
  /** Title of the page; used for the follow-up image lookup. */
  title: string;
}
|
||||
|
||||
/**
 * Finds the best-matching English Wikipedia page for a search term.
 *
 * Waits on the shared rate limiter, then issues a `list=search` query that
 * asks for a single result.
 *
 * @param term Free-text search term.
 * @returns Title and page id of the first hit, or `null` when there is no
 *   hit or the request fails. Failures are logged and never thrown, so one
 *   bad lookup does not abort the whole run.
 */
async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      // Throttling / server-error responses are not search results; report
      // them instead of trying to parse the body.
      console.warn(`searchWikipedia: HTTP ${res.status} for "${term}"`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { search?: Array<{ title: string; pageid: number }> };
    } | null;
    const first = data?.query?.search?.[0];
    if (first) {
      return { title: first.title, pageid: first.pageid };
    }
  } catch (err: unknown) {
    // Network or JSON errors: stay best-effort, but leave a trace.
    console.warn(`searchWikipedia: request failed for "${term}":`, err);
  }
  return null;
}
|
||||
|
||||
/**
 * Fetches the lead image thumbnail of a Wikipedia page.
 *
 * Waits on the shared rate limiter, then queries `prop=pageimages` with a
 * requested thumbnail width of 400.
 *
 * @param title Exact page title to look up.
 * @returns The thumbnail URL, or `null` when the page has no thumbnail or
 *   the request fails. Failures are logged and never thrown.
 */
async function getPageImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${WIKI_API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`getPageImage: HTTP ${res.status} for "${title}"`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { pages?: Record<string, { thumbnail?: { source?: string } }> };
    } | null;
    const pages = data?.query?.pages;
    // Only one title is requested, so only the first entry is inspected.
    const page = pages ? Object.values(pages)[0] : undefined;
    return page?.thumbnail?.source || null;
  } catch (err: unknown) {
    // Network or JSON errors: stay best-effort, but leave a trace.
    console.warn(`getPageImage: request failed for "${title}":`, err);
  }
  return null;
}
|
||||
|
||||
/**
 * Searches Wikimedia Commons for a file matching the term and returns an
 * image URL for the first hit that resolves to one.
 *
 * The search is restricted to the File namespace (`srnamespace=6`): the
 * follow-up `getCommonsImage` lookup uses `prop=imageinfo`, which only
 * yields data for file pages, so hits from the default (gallery) namespace
 * could never produce an image.
 *
 * @param term Free-text search term.
 * @returns An image URL, or `null` when nothing usable is found or the
 *   search request fails. Failures are logged and never thrown.
 */
async function searchCommons(term: string): Promise<string | null> {
  // Only this many hits are ever resolved, so do not request more.
  const maxCandidates = 2;

  await rateLimit();
  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&srnamespace=6&format=json&srlimit=${maxCandidates}&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`searchCommons: HTTP ${res.status} for "${term}"`);
      return null;
    }
    const data = (await res.json()) as {
      query?: { search?: Array<{ title: string }> };
    } | null;
    const results = data?.query?.search ?? [];
    // Try to get a thumbnail for the best matches, in ranking order.
    for (const hit of results.slice(0, maxCandidates)) {
      const imgUrl = await getCommonsImage(hit.title);
      if (imgUrl) return imgUrl;
    }
  } catch (err: unknown) {
    // Network or JSON errors: stay best-effort, but leave a trace.
    console.warn(`searchCommons: request failed for "${term}":`, err);
  }
  return null;
}
|
||||
|
||||
/**
 * Resolves a Wikimedia Commons page title to an image URL.
 *
 * Waits on the shared rate limiter, then queries `prop=imageinfo` asking for
 * a 400px-wide rendition. The scaled `thumburl` is preferred; the original
 * file `url` is used when no thumbnail URL is present.
 *
 * @param title Commons page title, as returned by a Commons search.
 * @returns The image URL, or `null` when the page carries no image info or
 *   the request fails. Failures are logged and never thrown.
 */
async function getCommonsImage(title: string): Promise<string | null> {
  await rateLimit();
  const url = `${COMMONS_API}?action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
  try {
    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
    if (!res.ok) {
      console.warn(`getCommonsImage: HTTP ${res.status} for "${title}"`);
      return null;
    }
    const data = (await res.json()) as {
      query?: {
        pages?: Record<string, { imageinfo?: Array<{ thumburl?: string; url?: string }> }>;
      };
    } | null;
    const pages = data?.query?.pages;
    // Only one title is requested, so only the first entry is inspected.
    const page = pages ? Object.values(pages)[0] : undefined;
    const info = page?.imageinfo?.[0];
    return info?.thumburl || info?.url || null;
  } catch (err: unknown) {
    // Network or JSON errors: stay best-effort, but leave a trace.
    console.warn(`getCommonsImage: request failed for "${title}":`, err);
  }
  return null;
}
|
||||
|
||||
/**
 * Builds the ordered, de-duplicated list of search terms for one disease.
 * Terms that include the scientific name are only produced when one is set,
 * so a missing name never yields a term such as "Late Blight (null)".
 */
function buildSearchTerms(name: string, sciName: string | null | undefined): string[] {
  const sci = sciName?.trim() ?? "";
  const candidates = sci
    ? [`${sci} ${name}`.trim(), name, `${name} (${sci})`.trim()]
    : [name];
  return [...new Set(candidates.filter(Boolean))];
}

/**
 * Finds an image for every disease row whose `image_url` is NULL or empty
 * and writes the URL back to the database.
 *
 * For each row the search terms are tried in order, first against Wikipedia
 * (page search + lead image) and then against Wikimedia Commons; the first
 * `https://` URL wins. Updates are sent in batches of 50 through a raw
 * libsql client. Both database handles are closed on every exit path.
 *
 * @throws Error when `DATABASE_URL` is not set or a database call fails.
 */
async function main() {
  console.log("🔍 Fetching disease images from Wikipedia\n");

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set");
  }

  const db = getDb();
  const rawClient = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  let found = 0;
  let skipped = 0;

  try {
    // Get all diseases without images
    const rows = await db
      .select({
        id: diseases.id,
        name: diseases.name,
        sciName: diseases.scientificName,
        plantId: diseases.plantId,
      })
      .from(diseases)
      .where(sql`image_url IS NULL OR image_url = ''`);

    console.log(`📋 ${rows.length} diseases missing images`);
    if (rows.length === 0) {
      console.log("✅ All diseases already have images!");
      // Returning (rather than process.exit) lets the finally block below
      // close the database handles.
      return;
    }

    const BATCH_SIZE = 50;
    let batch: { sql: string; args: [string, (typeof rows)[number]["id"]] }[] = [];
    let i = 0;

    for (const row of rows) {
      i++;

      let imageUrl: string | null = null;

      for (const term of buildSearchTerms(row.name, row.sciName)) {
        if (term.length < 3) continue;
        // Try Wikipedia first
        const page = await searchWikipedia(term);
        if (page) {
          imageUrl = await getPageImage(page.title);
          if (imageUrl) break;
        }
        // Try Commons directly
        imageUrl = await searchCommons(term);
        if (imageUrl) break;
      }

      // Only secure URLs are stored.
      if (imageUrl && !imageUrl.startsWith("https://")) {
        imageUrl = null;
      }

      if (imageUrl) {
        batch.push({
          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
          args: [imageUrl, row.id],
        });
        found++;
      } else {
        skipped++;
      }

      // Report progress every 100 processed rows, whether or not this
      // particular row produced an image, and after the counters are updated.
      if (i % 100 === 0) {
        process.stdout.write(` 🔍 found ${found} so far...\n`);
      }

      // Flush batch
      if (batch.length >= BATCH_SIZE) {
        await rawClient.batch(batch, "write");
        process.stdout.write(` 📦 flushed ${batch.length} updates (${i}/${rows.length})\n`);
        batch = [];
      }
    }

    // Flush remaining
    if (batch.length > 0) {
      await rawClient.batch(batch, "write");
      process.stdout.write(` 📦 final flush: ${batch.length} updates\n`);
    }
  } finally {
    rawClient.close();
    closeDb();
  }

  console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
}
|
||||
|
||||
main().catch((err) => { console.error("❌ Fatal:", err); process.exit(1); });
|
||||
62
apps/web/scripts/test-wiki-images.ts
Normal file
62
apps/web/scripts/test-wiki-images.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
/**
|
||||
* Quick test of Wikipedia image API for disease search terms.
|
||||
* Run: cd apps/web && npx tsx scripts/test-wiki-images.ts
|
||||
*/
|
||||
const API = "https://en.wikipedia.org/w/api.php";
|
||||
|
||||
/**
 * Runs a Wikipedia `list=search` query for the term, asking for one result,
 * and returns the parsed JSON response.
 */
async function search(term: string) {
  const query = `action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  const response = await fetch(`${API}?${query}`, {
    headers: { "User-Agent": "PlantHealthKB/1.0" },
  });
  const payload = await response.json();
  return payload as { query?: { search?: Array<{ title: string; pageid: number }> } };
}
|
||||
|
||||
/**
 * Runs a Wikipedia `prop=pageimages` query for the page title, requesting a
 * 400px thumbnail, and returns the parsed JSON response.
 */
async function getImg(title: string) {
  const query = `action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
  const response = await fetch(`${API}?${query}`, {
    headers: { "User-Agent": "PlantHealthKB/1.0" },
  });
  const payload = await response.json();
  return payload as { query?: { pages?: Record<string, { thumbnail?: { source: string } }> } };
}
|
||||
|
||||
/**
 * Looks up one search term on Wikipedia and prints a single result line:
 * the term, the matched page title and its thumbnail URL (or a marker when
 * a step yields nothing).
 *
 * A 400 ms pause follows every lookup, on every exit path, so consecutive
 * calls never hit the API back to back.
 *
 * @param term Search term to test.
 */
async function testOne(term: string) {
  try {
    const s = await search(term);
    const page = s?.query?.search?.[0];
    if (!page) {
      console.log(`${term.padEnd(40)} → NO PAGE`);
      return;
    }

    const img = await getImg(page.title);
    const pages = img?.query?.pages;
    if (!pages) {
      console.log(term, "→ NO PAGES");
      return;
    }

    // Only one title is requested, so only the first entry is inspected.
    const first = Object.values(pages)[0];
    const thumb = first?.thumbnail?.source;
    console.log(`${term.padEnd(40)} → ${page.title.padEnd(50)} → ${thumb ?? "NO IMG"}`);
  } finally {
    // Runs for the early returns above too, keeping the request spacing.
    await new Promise((r) => setTimeout(r, 400));
  }
}
|
||||
|
||||
/**
 * Runs the Wikipedia image lookup for a fixed list of sample disease terms,
 * one after another, printing one result line per term.
 */
async function main() {
  const sampleTerms: string[] = [
    "Phytophthora infestans Late Blight",
    "Early Blight",
    "Septoria Leaf Spot",
    "Powdery Mildew",
    "Fusarium oxysporum",
    "Citrus Canker",
    "Root Rot Pythium",
    "Downy Mildew Peronospora",
    "Bacterial Leaf Spot Xanthomonas",
    "Apple Scab Venturia inaequalis",
    "Fire Blight Erwinia amylovora",
    "Blossom End Rot",
    "Tomato Mosaic Virus",
    "Rust Puccinia",
    "Black Spot Diplocarpon rosae",
    "Sooty Mold Capnodium",
    "Clubroot Plasmodiophora brassicae",
    "Anthracnose Colletotrichum",
  ];

  console.log("Searching Wikipedia for disease images...\n");

  // Strictly sequential: each lookup finishes (including its pause) before
  // the next one starts.
  for (const sampleTerm of sampleTerms) {
    await testOne(sampleTerm);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
Reference in New Issue
Block a user