search, db integration

2026-06-05 21:47:00 -04:00
parent 365d1281dd
commit 71d7a9d6f0
25 changed files with 1573 additions and 244 deletions
--- a/apps/web/scripts/scrape-disease-images.ts
+++ b/apps/web/scripts/scrape-disease-images.ts
@@ -0,0 +1,215 @@
+#!/usr/bin/env node
+/**
+ * Fetch disease images from Wikipedia/Wikimedia Commons.
+ *
+ * For each disease in the database, searches Wikipedia for its page
+ * and retrieves the main infobox image.
+ *
+ * Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
+ *
+ * Rate-limited to at most 1 request per 350 ms to be respectful.
+ */
+
+import "dotenv/config";
+import { createClient } from "@libsql/client";
+import { sql } from "drizzle-orm";
+import { getDb, closeDb } from "../src/lib/db/index";
+import { diseases } from "../src/lib/db/schema";
+
+// MediaWiki API endpoint for English Wikipedia (used for page search and page-image lookups).
+const WIKI_API = "https://en.wikipedia.org/w/api.php";
+// MediaWiki API endpoint for Wikimedia Commons (used for the fallback search and image-info lookups).
+const COMMONS_API = "https://commons.wikimedia.org/w/api.php";
+// Minimum spacing, in milliseconds, that rateLimit() enforces between calls.
+const MIN_DELAY_MS = 350; // Be respectful
+
+// Date.now() value recorded when rateLimit() last finished; 0 until the first call.
+let lastCall = 0;
+
+/**
+ * Throttle gate shared by every API helper in this script.
+ *
+ * If fewer than MIN_DELAY_MS milliseconds have passed since the previous
+ * call finished, sleeps for the remainder; then records the current time
+ * in `lastCall`.
+ */
+async function rateLimit() {
+  const sinceLast = Date.now() - lastCall;
+  const remaining = MIN_DELAY_MS - sinceLast;
+  if (remaining > 0) {
+    await new Promise((resolve) => setTimeout(resolve, remaining));
+  }
+  lastCall = Date.now();
+}
+
+/** Top hit returned by searchWikipedia(), copied from the search response. */
+interface WikiSearchResult {
+  /** Page title of the hit; passed on to getPageImage() by the caller. */
+  title: string;
+  /** The hit's `pageid` field as reported in the search response. */
+  pageid: number;
+}
+
+/**
+ * Runs a search against the English Wikipedia API and returns the first hit.
+ *
+ * @param term - Free-text query; URL-encoded before being sent.
+ * @returns Title and page id of the first result, or null when the search
+ *   has no results or the request / JSON parsing fails (failures are
+ *   deliberately swallowed so one bad lookup does not stop the run).
+ */
+async function searchWikipedia(term: string): Promise<WikiSearchResult | null> {
+  await rateLimit();
+  const query = encodeURIComponent(term);
+  const url = `${WIKI_API}?action=query&list=search&srsearch=${query}&format=json&srlimit=1&origin=*`;
+  try {
+    const response = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
+    const payload = await response.json() as any;
+    const hits = payload?.query?.search;
+    if (hits?.length > 0) {
+      const top = hits[0];
+      return { title: top.title, pageid: top.pageid };
+    }
+    return null;
+  } catch {
+    // Best-effort lookup: treat any network or parse failure as "not found".
+    return null;
+  }
+}
+
+/**
+ * Looks up the `pageimages` thumbnail of a Wikipedia page.
+ *
+ * @param title - Exact page title, as returned by searchWikipedia().
+ * @returns The thumbnail URL (requested with pithumbsize=400), or null when
+ *   the page has no thumbnail or the request / JSON parsing fails.
+ */
+async function getPageImage(title: string): Promise<string | null> {
+  await rateLimit();
+  const params = `action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
+  try {
+    const response = await fetch(`${WIKI_API}?${params}`, {
+      headers: { "User-Agent": "PlantHealthKB/1.0" },
+    });
+    const payload = await response.json() as any;
+    const pages = payload?.query?.pages;
+    // `pages` is keyed by page id; only one title was requested, so take the first entry.
+    const firstPage = pages ? (Object.values(pages)[0] as any) : undefined;
+    return firstPage?.thumbnail?.source || null;
+  } catch {
+    // Best-effort lookup: treat any network or parse failure as "no image".
+    return null;
+  }
+}
+
+/**
+ * Searches Wikimedia Commons for files matching `term` and returns the URL
+ * of the first hit that resolves to an image.
+ *
+ * The search is restricted to the File namespace (srnamespace=6). Without
+ * that parameter the API searches its default namespace, whose pages carry
+ * no `imageinfo`, so getCommonsImage() could not resolve them.
+ *
+ * @param term - Free-text query; URL-encoded before being sent.
+ * @returns An image URL, or null when nothing usable is found or the
+ *   request / JSON parsing fails (failures are deliberately swallowed).
+ */
+async function searchCommons(term: string): Promise<string | null> {
+  // Only this many top hits are resolved to URLs, so request exactly that many.
+  const maxCandidates = 2;
+  await rateLimit();
+  const url = `${COMMONS_API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&srnamespace=6&format=json&srlimit=${maxCandidates}&origin=*`;
+  try {
+    const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
+    const data = await res.json() as any;
+    const results = data?.query?.search;
+    if (!Array.isArray(results)) return null;
+
+    // Resolve hits in ranking order; each lookup is itself rate-limited.
+    for (const hit of results.slice(0, maxCandidates)) {
+      const imgUrl = await getCommonsImage(hit.title);
+      if (imgUrl) return imgUrl;
+    }
+  } catch {
+    // Best-effort lookup: treat any network or parse failure as "no image".
+  }
+  return null;
+}
+
+/**
+ * Resolves a Wikimedia Commons page title to an image URL via `imageinfo`.
+ *
+ * @param title - Commons page title, as returned by the Commons search.
+ * @returns The scaled thumbnail URL (requested with iiurlwidth=400) when
+ *   present, otherwise the original file URL, otherwise null. Also null
+ *   when the request / JSON parsing fails.
+ */
+async function getCommonsImage(title: string): Promise<string | null> {
+  await rateLimit();
+  const params = `action=query&titles=${encodeURIComponent(title)}&prop=imageinfo&iiprop=url&iiurlwidth=400&format=json&origin=*`;
+  try {
+    const response = await fetch(`${COMMONS_API}?${params}`, {
+      headers: { "User-Agent": "PlantHealthKB/1.0" },
+    });
+    const payload = await response.json() as any;
+    const pages = payload?.query?.pages;
+    if (!pages) return null;
+    // `pages` is keyed by page id; only one title was requested, so take the first entry.
+    const info = (Object.values(pages)[0] as any)?.imageinfo?.[0];
+    // Prefer the resized thumbnail; fall back to the full-size file.
+    return info?.thumburl || info?.url || null;
+  } catch {
+    // Best-effort lookup: treat any network or parse failure as "no image".
+    return null;
+  }
+}
+
+/**
+ * Builds the ordered, de-duplicated search queries for one disease.
+ *
+ * With a scientific name the order is "<sci> <name>", "<name>",
+ * "<name> (<sci>)"; without one only "<name>" is produced, so no query
+ * contains the literal text "null" and no query is issued twice.
+ * Queries shorter than 3 characters are dropped as too vague.
+ */
+function buildSearchTerms(
+  name: string | null | undefined,
+  sciName: string | null | undefined,
+): string[] {
+  const disease = (name ?? "").trim();
+  const organism = (sciName ?? "").trim();
+  const candidates = organism
+    ? [`${organism} ${disease}`.trim(), disease, `${disease} (${organism})`.trim()]
+    : [disease];
+  return [...new Set(candidates)].filter((term) => term.length >= 3);
+}
+
+/**
+ * Tries each query in order, Wikipedia first and Commons as fallback, and
+ * returns the first https image URL found, or null if none is found.
+ */
+async function findImageUrl(terms: readonly string[]): Promise<string | null> {
+  for (const term of terms) {
+    const page = await searchWikipedia(term);
+    const fromWikipedia = page ? await getPageImage(page.title) : null;
+    const candidate = fromWikipedia ?? (await searchCommons(term));
+    // Only https URLs are stored; anything else counts as a miss and the next term is tried.
+    if (candidate?.startsWith("https://")) return candidate;
+  }
+  return null;
+}
+
+/**
+ * Script entry point: finds every disease row whose image_url is NULL or
+ * empty, looks up an image for it, and writes the URLs back in batches.
+ *
+ * Database handles are closed on every exit path, including errors.
+ *
+ * @throws Error when DATABASE_URL is not set; database errors propagate.
+ */
+async function main() {
+  console.log("🔍 Fetching disease images from Wikipedia\n");
+
+  const databaseUrl = process.env.DATABASE_URL;
+  if (!databaseUrl) {
+    throw new Error("DATABASE_URL is not set");
+  }
+
+  const db = getDb();
+  const rawClient = createClient({
+    url: databaseUrl,
+    authToken: process.env.DATABASE_TOKEN,
+  });
+
+  try {
+    // Get all diseases without images
+    const rows = await db
+      .select({
+        id: diseases.id,
+        name: diseases.name,
+        sciName: diseases.scientificName,
+      })
+      .from(diseases)
+      .where(sql`image_url IS NULL OR image_url = ''`);
+
+    console.log(`📋 ${rows.length} diseases missing images`);
+    if (rows.length === 0) {
+      console.log("✅ All diseases already have images!");
+      return;
+    }
+
+    const BATCH_SIZE = 50;
+    const PROGRESS_EVERY = 100;
+    let found = 0;
+    let skipped = 0;
+    let batch: { sql: string; args: [string, (typeof rows)[number]["id"]] }[] = [];
+
+    // Writes all pending updates in one "write" batch; returns how many were sent.
+    const flush = async (): Promise<number> => {
+      const count = batch.length;
+      if (count > 0) {
+        await rawClient.batch(batch, "write");
+        batch = [];
+      }
+      return count;
+    };
+
+    for (const [index, row] of rows.entries()) {
+      const processed = index + 1;
+      const imageUrl = await findImageUrl(buildSearchTerms(row.name, row.sciName));
+
+      if (imageUrl) {
+        batch.push({
+          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
+          args: [imageUrl, row.id],
+        });
+        found++;
+      } else {
+        skipped++;
+      }
+
+      // Report progress on every 100th row, whether or not that row had an image.
+      if (processed % PROGRESS_EVERY === 0) {
+        process.stdout.write(`  🔍 found ${found} so far...\n`);
+      }
+
+      // Flush batch
+      if (batch.length >= BATCH_SIZE) {
+        const flushed = await flush();
+        process.stdout.write(`  📦 flushed ${flushed} updates (${processed}/${rows.length})\n`);
+      }
+    }
+
+    // Flush remaining
+    const remaining = await flush();
+    if (remaining > 0) {
+      process.stdout.write(`  📦 final flush: ${remaining} updates\n`);
+    }
+
+    console.log(`\n✅ Done! Found images: ${found} | Skipped: ${skipped}`);
+  } finally {
+    rawClient.close();
+    closeDb();
+  }
+}
+
+// Run the script; on any unhandled failure, report it and exit with status 1.
+main().catch((err) => {
+  console.error("❌ Fatal:", err);
+  process.exit(1);
+});