Files
plant-disease-id/scripts/scrape-wikipedia.ts
2026-06-08 16:42:04 -04:00

1141 lines
35 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* Wikipedia Plant Disease Scraper
*
* Fetches disease data from Wikipedia "List of X diseases" pages via
* the MediaWiki API, parses wikitext tables, and stores in Turso.
*
* Usage: cd apps/web && npx tsx scripts/scrape-wikipedia.ts
*/
import "dotenv/config";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { plants, diseases, scrapeSources } from "../src/lib/db/schema";
import type { CausalAgentType, Severity } from "../src/lib/types";
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, dirname } from "path";
import { fileURLToPath } from "url";
// ─── Paths ───────────────────────────────────────────────────────────────────
// Directory that contains this script, derived from the module URL.
// The on-disk scraper cache (".scraper-cache") is resolved relative to it.
const __filedir = dirname(fileURLToPath(import.meta.url));
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Turn free text into a lowercase, hyphen-separated identifier.
 *
 * Every character other than `a-z`, `0-9`, whitespace and `-` is dropped,
 * each run of whitespace and/or hyphens collapses to a single `-`, and no
 * hyphen is left at either end. Input with no usable characters yields "".
 */
function slugify(s: string): string {
  const kept = s.toLowerCase().replace(/[^a-z0-9\s-]/g, "");
  // Splitting on separator runs and re-joining collapses and trims in one go.
  const words = kept.split(/[\s-]+/).filter((word) => word.length > 0);
  return words.join("-");
}
/**
 * Reduce a wikitext fragment (typically one table cell) to plain text.
 *
 * Handles, in order: wiki links (`[[target|label]]` → `label`,
 * `[[target]]` → `target`), bold/italic quote markup, `<ref>` footnotes,
 * `<br>` tags, the `&nbsp;` and `&amp;` entities, and non-nested
 * `{{templates}}`. Whitespace runs are collapsed and the result is trimmed.
 *
 * @param t Raw wikitext.
 * @returns The text with the markup above removed.
 */
function clean(t: string): string {
  return (
    t
      .replace(/\[\[[^\]]*?\|([^\]]*)\]\]/g, "$1")
      .replace(/\[\[([^\]]*)\]\]/g, "$1")
      // One pass removes every '' / ''' marker; a run can leave at most a
      // single stray apostrophe behind.
      .replace(/'{2,3}/g, "")
      // Self-closing refs must go first: treated as an opening tag they would
      // swallow all text up to the next unrelated </ref>.
      .replace(/<ref\b[^>]*\/>/gi, "")
      // \b keeps <references …> from being mistaken for a <ref> tag.
      .replace(/<ref\b[^>]*>[\s\S]*?<\/ref>/gi, "")
      .replace(/<br\s*\/?>/gi, " ")
      // Decode &nbsp; before &amp; so "&amp;nbsp;" is not decoded twice.
      .replace(/&nbsp;/g, " ")
      .replace(/&amp;/g, "&")
      .replace(/\{\{[^}]*\}\}/g, "")
      .replace(/\s{2,}/g, " ")
      .trim()
  );
}
// ─── Cache ───────────────────────────────────────────────────────────────────
/**
 * Look up a cached payload by key.
 *
 * Entries live at `.scraper-cache/<encodeURIComponent(k)>.json` next to this
 * script.
 *
 * @returns The file's UTF-8 text, or `null` when no such file exists.
 */
function cacheGet(k: string): string | null {
  const entryPath = resolve(__filedir, ".scraper-cache", encodeURIComponent(k) + ".json");
  if (!existsSync(entryPath)) {
    return null;
  }
  return readFileSync(entryPath, "utf-8");
}
/**
 * Store a payload under a key, creating the `.scraper-cache` directory next
 * to this script on first use. The entry is written as UTF-8 to
 * `<encodeURIComponent(k)>.json`, replacing any previous content.
 */
function cacheSet(k: string, v: string) {
  const cacheDir = resolve(__filedir, ".scraper-cache");
  if (!existsSync(cacheDir)) {
    mkdirSync(cacheDir, { recursive: true });
  }
  const entryPath = resolve(cacheDir, encodeURIComponent(k) + ".json");
  writeFileSync(entryPath, v, "utf-8");
}
// ─── Wikipedia API ───────────────────────────────────────────────────────────
let lastFetchTime = 0;
const MIN_DELAY_MS = 600; // Wait at least 600ms between requests
/**
 * Fetch the raw wikitext of an English Wikipedia page via `action=parse`.
 *
 * A non-empty cache entry (key `wt-<page>`) is returned without touching the
 * network. Otherwise the request is spaced at least MIN_DELAY_MS after the
 * previous one, and an HTTP 429 is retried exactly once after a 5 second
 * pause. The wikitext is cached before it is returned.
 *
 * @param page Page title as used in the URL, e.g. "List_of_tomato_diseases".
 * @returns The page's wikitext.
 * @throws Error on a non-OK HTTP status, an API-level error, or a response
 *   without parse data / wikitext.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  if (cached) return cached;

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&format=json&formatversion=2`;
  const init = { headers: { "User-Agent": "PlantDiseaseKB/1.0 (research)" } };

  // Rate limiting
  const wait = Math.max(0, MIN_DELAY_MS - (Date.now() - lastFetchTime));
  if (wait > 0) await new Promise((done) => setTimeout(done, wait));
  lastFetchTime = Date.now();

  let res = await fetch(url, init);
  let retried = false;
  if (res.status === 429) {
    // Rate limited — wait longer and retry once
    console.log(` ⏳ Rate limited, waiting 5s...`);
    await new Promise((done) => setTimeout(done, 5000));
    // Count the retry as the latest request so the next call is still spaced.
    lastFetchTime = Date.now();
    res = await fetch(url, init);
    retried = true;
  }
  if (!res.ok) {
    throw new Error(`HTTP ${res.status} for ${page}${retried ? " (after retry)" : ""}`);
  }

  // Both fields are optional: the API sends `error` instead of `parse` on failure.
  const data = (await res.json()) as {
    parse?: { wikitext?: string };
    error?: { info: string };
  };
  if (data.error) throw new Error(`API error: ${data.error.info || JSON.stringify(data.error)}`);
  if (!data.parse) throw new Error(`Page "${page}" not found`);
  const wt = data.parse.wikitext;
  if (typeof wt !== "string") throw new Error(`Page "${page}" returned no wikitext`);
  cacheSet(key, wt);
  return wt;
}
// ─── Section → type ─────────────────────────────────────────────────────────
/**
 * Ordered heading → causal-agent rules; the first pattern that matches wins.
 * Phytoplasma headings are filed under "bacterial", oomycetes under "fungal",
 * and nematodes, pests and abiotic disorders under "environmental".
 */
const SECTION_RULES: [RegExp, CausalAgentType][] = [
  [/bacteri/i, "bacterial"],
  [/phytoplasma/i, "bacterial"],
  [/fungus|fungal|fungi/i, "fungal"],
  [/oomycete/i, "fungal"],
  // "virus" covers headings such as "Virus and virus-like diseases", which
  // contain neither "viral" nor "viroid".
  [/viral|virus|viroid/i, "viral"],
  [/nematode/i, "environmental"],
  [
    /miscellaneous|disorder|abiotic|nutrient|physiological|insect|pest|lepidoptera|mite|parasitic/i,
    "environmental",
  ],
];

/**
 * Classify a section heading or table caption.
 *
 * @param name Heading/caption text; surrounding whitespace is irrelevant.
 * @returns The causal-agent type of the first matching rule, or `null` when
 *   no rule matches (e.g. "References").
 */
function sectionType(name: string): CausalAgentType | null {
  const rule = SECTION_RULES.find(([pattern]) => pattern.test(name));
  return rule ? rule[1] : null;
}
// ─── Wikitable parser ────────────────────────────────────────────────────────
/** One disease row extracted from a wikitable by parseRows/mkRow. */
interface Row {
// Cleaned text of the row's first cell.
name: string;
// First later cell that survives mkRow's filter (cleaned, longer than two
// characters, not starting with an apostrophe); "" when no cell qualifies.
sci: string;
}
/**
 * Split one wikitable (`{| … |}`) into rows and convert each via mkRow.
 *
 * Parsing rules:
 * - A line starting with `|-` begins a new row, with or without attributes
 *   (`|- style="…"`). Anything before the first such line is ignored.
 * - A line starting with `|}` ends the table and is never treated as a cell.
 * - Lines starting with `|` or `!` add cells; `||` separates several cells on
 *   one line. `!!` separators and `attr | value` cell attributes are not
 *   interpreted and stay in the cell text.
 * - Any other non-empty line inside a row continues the previous cell.
 *
 * @param table Wikitext of a single table.
 * @returns The rows mkRow accepted, in document order.
 */
function parseRows(table: string): Row[] {
  const out: Row[] = [];
  let cells: string[] = [];
  let inRow = false;

  // Emit the row collected so far (if any) and start a fresh one.
  const flush = (): void => {
    if (cells.length) {
      const row = mkRow(cells);
      if (row) out.push(row);
    }
    cells = [];
  };

  for (const rawLine of table.split("\n")) {
    const line = rawLine.trim();
    if (line.startsWith("|}")) {
      // Table terminator: without this it would become a bogus "}" cell, and
      // a table ending in "|-" / "|}" would yield a row named "}".
      flush();
      inRow = false;
      continue;
    }
    if (line.startsWith("|-")) {
      flush();
      inRow = true;
      continue;
    }
    if (!inRow || !line) continue;

    if (line.startsWith("|") || line.startsWith("!")) {
      const parts = line.includes("||") ? line.split("||") : [line];
      for (const part of parts) cells.push(part.replace(/^[|!]+/, "").trim());
    } else if (cells.length) {
      cells[cells.length - 1] += " " + line;
    }
  }
  flush();
  return out;
}
/**
 * Build a Row from the raw cells of one table row.
 *
 * The first cell is the disease name. The scientific name is the first later
 * cell whose cleaned text is longer than two characters and does not start
 * with an apostrophe.
 *
 * @param c Raw wikitext cells, in column order.
 * @returns The row, or `null` for empty names and for header/spanning rows
 *   ("Common name…", "Scientific name…", "colspan…").
 */
function mkRow(c: string[]): Row | null {
  const name = clean(c[0] || "");
  // Match the header labels, not the bare word "Common": real disease names
  // such as "Common scab" or "Common rust" must not be discarded.
  if (!name || /^(common\s+name|scientific\s+name|colspan)/i.test(name)) return null;

  const sci =
    c
      .slice(1)
      .map((cell) => clean(cell))
      .find((text) => text.length > 2 && !text.startsWith("'")) ?? "";
  return { name, sci };
}
// ─── Fetch & parse one page ──────────────────────────────────────────────────
/** The rows of one parsed wikitable together with its causal-agent type. */
interface TableData {
// Type derived by sectionType from the section heading or table caption.
type: CausalAgentType;
// Rows returned by parseRows for the table (scrapePage only keeps non-empty tables).
rows: Row[];
}
/**
 * Fetch a "List of X diseases" page and extract its disease tables.
 *
 * Two strategies run in sequence and feed one de-duplicated result list:
 * 1. every wikitable inside a level-2 section whose heading sectionType
 *    recognises;
 * 2. every wikitable anywhere on the page whose `|+` caption sectionType
 *    recognises (for pages without section headings).
 *
 * Tables with identical type and row names are kept only once.
 *
 * @param page Wikipedia page title.
 * @returns Non-empty tables in discovery order.
 * @throws Whatever fetchWT throws.
 */
async function scrapePage(page: string): Promise<TableData[]> {
  const wt = await fetchWT(page);
  const tables: TableData[] = [];
  const seenKeys = new Set<string>();
  const tableRe = /\{\|[\s\S]*?\|\}/g;

  // Parse one table block and record it unless it is empty or already seen.
  const addTable = (type: CausalAgentType, block: string): void => {
    const rows = parseRows(block);
    if (!rows.length) return;
    const key = type + "|" + rows.map((r) => r.name).join(",");
    if (seenKeys.has(key)) return;
    seenKeys.add(key);
    tables.push({ type, rows });
  };

  // Strategy 1: section headers with embedded wikitable(s)
  for (const part of wt.split(/\n(?===)/)) {
    const heading = part.match(/^==([^=]+)==/);
    if (!heading) continue;
    const type = sectionType(heading[1]);
    if (!type) continue;
    // Walk every table in the section, not just the first one.
    for (const m of part.matchAll(tableRe)) addTable(type, m[0]);
  }

  // Strategy 2: tables with |+ caption (no section headers)
  for (const m of wt.matchAll(tableRe)) {
    const caption = m[0].match(/^\|\+(.+)/m);
    if (!caption) continue;
    const type = sectionType(caption[1]);
    if (type) addTable(type, m[0]);
  }
  return tables;
}
// ─── Disease templates (sourced from UW-Madison PDDC factsheets) ───────────
/**
 * Generic symptom/cause/treatment/prevention text and a default severity for
 * each causal-agent type. main() attaches the template matching a table's
 * type to every disease scraped from that table, so all diseases of one type
 * share the same text.
 */
const TEMPLATES: Record<
  CausalAgentType,
  {
    symptoms: string[];
    causes: string[];
    treatment: string[];
    prevention: string[];
    severity: Severity;
  }
> = {
  fungal: {
    severity: "moderate",
    symptoms: [
      "Leaf spots or lesions with concentric rings or characteristic fungal growth",
      "Yellowing and browning of infected plant tissue starting from lower leaves",
      "Wilting, stunting, or dieback of infected plants under favorable conditions",
      "Premature defoliation in moderate to severe cases",
      "Reduced yield, fruit rot, or poor fruit quality on affected plants",
    ],
    causes: [
      "Fungal pathogens surviving in soil, plant debris, or on infected seed material",
      "Warm humid conditions (60-85°F) with extended leaf wetness periods",
      "Spores spread by wind, rain splash, insects, or contaminated tools and hands",
      "Dense plantings with poor air circulation and frequent overhead irrigation",
    ],
    treatment: [
      "Remove and destroy all infected plant material — do not compost",
      "Apply appropriate fungicide (copper, sulfur, chlorothalonil) as directed on label",
      "Improve air circulation through proper plant spacing, pruning, and staking",
      "Water at soil level using drip irrigation or soaker hoses to keep foliage dry",
      "Apply 2-3 inches of organic mulch to reduce soil splash onto lower leaves",
    ],
    prevention: [
      "Plant resistant varieties when available",
      "Practice 2-3 year crop rotation with non-host plant families",
      "Space plants adequately for good air movement",
      "Avoid overhead watering; water early in the day",
      "Remove and dispose of all plant debris at end of growing season",
    ],
  },
  bacterial: {
    severity: "high",
    symptoms: [
      "Water-soaked lesions on leaves, stems, and fruit that turn brown or black",
      "Wilting of branches or entire plant despite adequate soil moisture",
      "Vascular discoloration visible when stems are cut crosswise near soil line",
      "Bacterial ooze or exudate from cut stems or infected tissue in humid weather",
      "Cankers on stems with associated gumming or branch dieback",
    ],
    causes: [
      "Bacterial pathogens entering through wounds, stomata, or other natural openings",
      "Spread by rain splash, irrigation water, insects, and contaminated pruning tools",
      "Warm humid conditions (75-90°F) favor rapid bacterial multiplication",
      "Bacteria survive in infected plant debris, soil, and on seed surfaces between seasons",
    ],
    treatment: [
      "Remove and destroy infected plants immediately — bag and remove from garden",
      "Prune infected branches at least 12 inches below visible symptoms",
      "Sterilize all pruning tools with 10% bleach or 70% alcohol between every cut",
      "No chemical cure exists once plants are infected; copper may slow early infections",
      "Disinfect hands, gloves, and clothing after handling infected plant material",
    ],
    prevention: [
      "Use certified disease-free seed and pathogen-free transplants",
      "Practice long crop rotation (3-5 years) with unrelated crop families",
      "Avoid overhead irrigation; use drip irrigation or soaker hoses instead",
      "Control insect vectors (cucumber beetles, flea beetles) that spread bacteria",
      "Sanitize garden tools, stakes, and cages regularly",
    ],
  },
  viral: {
    severity: "high",
    symptoms: [
      "Mottled mosaic pattern of light and dark green patches on leaf surfaces",
      "Leaf distortion, curling, puckering, or unusual narrowing of leaf blades",
      // Spelling corrected from "intervenal".
      "Yellowing along leaf veins (vein clearing) or interveinal chlorosis",
      "Reduced plant vigor, stunted growth, and poor fruit or flower set",
      "Discoloration, streaking, ringspots, or deformation on fruit and flowers",
    ],
    causes: [
      "Virus particles transmitted by insect vectors including aphids, thrips, and whiteflies",
      "Mechanical transmission through contaminated hands, pruning tools, or clothing",
      "Propagation from infected parent material (cuttings, tubers, bulbs, seeds)",
      "Virus overwintering in perennial weed hosts or wild reservoir plants near fields",
    ],
    treatment: [
      "No cure available — remove and destroy infected plants as soon as detected",
      "Decontaminate tools and work surfaces with 10% bleach or trisodium phosphate",
      "Wash hands thoroughly with soap and water after handling infected plants",
      "Control insect vectors using reflective mulches, row covers, and registered insecticides",
      "Remove weeds and alternate host plants that may harbor the virus",
    ],
    prevention: [
      "Purchase certified virus-free seed and transplants",
      "Use insect-proof floating row covers during early growth stages",
      "Isolate new plants for 2-3 weeks before introducing into the garden",
      "Remove and destroy infected plants promptly at first symptom appearance",
      "Rotate susceptible crops for 2-3 growing seasons",
    ],
  },
  environmental: {
    severity: "low",
    symptoms: [
      "Physiological symptoms resembling pathogen-caused disease without signs of infection",
      "Symptoms often appear uniformly across planting or follow a distinct pattern",
      "Tissue discoloration, necrosis, leaf margin scorch, or fruit deformation",
      "Symptoms correlate with recent weather events, irrigation changes, or chemical use",
      "No visible signs of fungal spores, bacterial ooze, or insect activity",
    ],
    causes: [
      "Environmental stress including drought, flooding, temperature extremes, or sunscald",
      "Nutrient deficiencies or toxicities in soil (calcium, boron, potassium, etc.)",
      "Poor soil conditions: compaction, pH imbalance, poor drainage, or salt buildup",
      "Chemical injury from pesticides, herbicides, fertilizers, or air pollutants",
    ],
    treatment: [
      "Identify and correct the underlying environmental or nutritional issue",
      "Test soil pH and nutrient levels; amend based on laboratory recommendations",
      "Establish and maintain a consistent watering schedule appropriate for the crop",
      "Provide shade, wind protection, or frost protection as needed for local conditions",
      "Adjust fertilizer program to address specific identified nutrient deficiencies",
    ],
    prevention: [
      "Test soil before planting and amend to recommended pH and nutrient levels",
      "Choose plant varieties well-suited to local climate and soil conditions",
      "Maintain consistent irrigation, especially during fruit development and hot weather",
      "Apply balanced fertilizer according to soil test recommendations",
      "Improve soil drainage with raised beds or incorporation of organic matter",
    ],
  },
};
/**
 * Compose the generic description paragraph for a disease.
 *
 * @param name Common name of the disease.
 * @param sci Scientific name of the causal agent; "a plant pathogen" is
 *   substituted when empty.
 * @param plant Common name of the host plant.
 * @param type Causal-agent type, inserted as an adjective.
 * @returns A three-sentence description.
 */
function makeDesc(name: string, sci: string, plant: string, type: string): string {
  // "an environmental disease", but "a fungal disease".
  const article = /^[aeiou]/i.test(type) ? "an" : "a";
  const cause = sci || "a plant pathogen";
  return `${name} is ${article} ${type} disease affecting ${plant}. Caused by ${cause}, this disease can significantly impact plant health under favorable environmental conditions. Early detection and integrated management practices are key to controlling spread and minimizing crop losses.`;
}
// ─── Source definitions ──────────────────────────────────────────────────────
/** One plant to seed, plus the Wikipedia list page its diseases are scraped from. */
interface Src {
// Plant id in the database; also the prefix of every disease id for this plant.
slug: string;
// Common name (stored as commonName and used in generated descriptions).
name: string;
// Scientific name of the plant (stored as scientificName).
sci: string;
// Botanical family (stored as family).
fam: string;
// Plant category (stored as category).
cat: string;
// Wikipedia page title passed to scrapePage; several entries may share one page.
page: string;
// Short care summary (stored as careSummary).
care: string;
// Image URL (stored as imageUrl).
img: string;
}
// Plants to seed and the Wikipedia "List of X diseases" page scraped for each.
// Several entries share a page (e.g. List_of_legume_diseases,
// List_of_cucurbit_diseases); main() scrapes each distinct page once and
// attributes its rows to every entry that names it.
// `img` is the empty string for every entry.
const SOURCES: Src[] = [
{
slug: "tomato",
name: "Tomato",
sci: "Solanum lycopersicum",
fam: "Solanaceae",
cat: "vegetable",
page: "List_of_tomato_diseases",
care: "Full sun (6-8h), consistent watering, well-drained soil pH 6.0-6.8.",
img: "",
},
{
slug: "potato",
name: "Potato",
sci: "Solanum tuberosum",
fam: "Solanaceae",
cat: "vegetable",
page: "List_of_potato_diseases",
care: "Full sun (6-8h), consistent watering, cool temps, loose soil pH 5.0-6.5.",
img: "",
},
{
slug: "apple",
name: "Apple",
sci: "Malus domestica",
fam: "Rosaceae",
cat: "tree",
page: "List_of_apple_diseases",
care: "Full sun (8h+), deep watering weekly, well-drained soil pH 6.0-7.0.",
img: "",
},
{
slug: "apricot",
name: "Apricot",
sci: "Prunus armeniaca",
fam: "Rosaceae",
cat: "tree",
page: "List_of_apricot_diseases",
care: "Full sun (8h+), moderate watering, well-drained soil pH 6.5-7.5.",
img: "",
},
{
slug: "avocado",
name: "Avocado",
sci: "Persea americana",
fam: "Lauraceae",
cat: "tree",
page: "List_of_avocado_diseases",
care: "Full sun (6-8h), moderate watering, well-drained soil pH 5.5-7.0.",
img: "",
},
{
slug: "banana",
name: "Banana",
sci: "Musa acuminata",
fam: "Musaceae",
cat: "fruit",
page: "List_of_banana_diseases",
care: "Full sun (8h+), consistent watering, warm temps 75-90°F.",
img: "",
},
{
slug: "barley",
name: "Barley",
sci: "Hordeum vulgare",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_barley_diseases",
care: "Full sun (8h+), moderate watering, cool temps 55-75°F.",
img: "",
},
{
slug: "bean",
name: "Green Bean",
sci: "Phaseolus vulgaris",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_legume_diseases",
care: "Full sun (6-8h), moderate watering, warm temps 65-80°F.",
img: "",
},
{
slug: "blueberry",
name: "Blueberry",
sci: "Vaccinium corymbosum",
fam: "Ericaceae",
cat: "fruit",
page: "List_of_blueberry_diseases",
care: "Full sun, consistent moisture, acidic soil pH 4.5-5.5.",
img: "",
},
{
slug: "cabbage",
name: "Cabbage",
sci: "Brassica oleracea var. capitata",
fam: "Brassicaceae",
cat: "vegetable",
page: "List_of_brassica_diseases",
care: "Full sun, consistent watering, cool temps 50-85°F.",
img: "",
},
{
slug: "carrot",
name: "Carrot",
sci: "Daucus carota subsp. sativus",
fam: "Apiaceae",
cat: "vegetable",
page: "List_of_carrot_diseases",
care: "Full sun, consistent moisture, cool temps, loose sandy soil.",
img: "",
},
{
slug: "cherry",
name: "Cherry",
sci: "Prunus avium",
fam: "Rosaceae",
cat: "tree",
page: "List_of_cherry_diseases",
care: "Full sun, moderate watering, well-drained loam pH 6.0-7.0.",
img: "",
},
{
slug: "citrus",
name: "Citrus (Orange)",
sci: "Citrus × sinensis",
fam: "Rutaceae",
cat: "tree",
page: "List_of_citrus_diseases",
care: "Full sun, consistent watering, acidic soil pH 5.5-6.5.",
img: "",
},
{
slug: "cocoa",
name: "Cocoa (Cacao)",
sci: "Theobroma cacao",
fam: "Malvaceae",
cat: "tree",
page: "List_of_cocoa_diseases",
care: "Partial shade, consistent rainfall, warm tropics 65-90°F.",
img: "",
},
{
slug: "coconut",
name: "Coconut",
sci: "Cocos nucifera",
fam: "Arecaceae",
cat: "tree",
page: "List_of_coconut_palm_diseases",
care: "Full sun, moderate watering, warm temps 70-95°F.",
img: "",
},
{
slug: "coffee",
name: "Coffee",
sci: "Coffea arabica",
fam: "Rubiaceae",
cat: "tree",
page: "List_of_coffee_diseases",
care: "Partial shade, consistent rainfall, moderate temps 60-70°F.",
img: "",
},
{
slug: "corn",
name: "Corn (Maize)",
sci: "Zea mays",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_maize_diseases",
care: "Full sun, consistent watering, warm temps 65-85°F.",
img: "",
},
{
slug: "cranberry",
name: "Cranberry",
sci: "Vaccinium macrocarpon",
fam: "Ericaceae",
cat: "fruit",
page: "List_of_cranberry_diseases",
care: "Full sun, constant moisture, acidic soil pH 4.5-5.5.",
img: "",
},
{
slug: "cucumber",
name: "Cucumber",
sci: "Cucumis sativus",
fam: "Cucurbitaceae",
cat: "vegetable",
page: "List_of_cucurbit_diseases",
care: "Full sun, consistent watering, warm temps 70-95°F.",
img: "",
},
{
slug: "grape",
name: "Grape",
sci: "Vitis vinifera",
fam: "Vitaceae",
cat: "fruit",
page: "List_of_grape_diseases",
care: "Full sun, moderate watering, well-drained soil pH 5.5-7.0.",
img: "",
},
{
slug: "hops",
name: "Hops",
sci: "Humulus lupulus",
fam: "Cannabaceae",
cat: "herb",
page: "List_of_hops_diseases",
care: "Full sun, consistent watering, well-drained soil pH 6.0-7.0.",
img: "",
},
{
slug: "lettuce",
name: "Lettuce",
sci: "Lactuca sativa",
fam: "Asteraceae",
cat: "vegetable",
page: "List_of_lettuce_diseases",
care: "Partial shade to full sun, consistent moisture, cool temps 55-75°F.",
img: "",
},
{
slug: "mango",
name: "Mango",
sci: "Mangifera indica",
fam: "Anacardiaceae",
cat: "tree",
page: "List_of_mango_diseases",
care: "Full sun, moderate watering, warm temps 70-100°F.",
img: "",
},
{
slug: "oats",
name: "Oats",
sci: "Avena sativa",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_oats_diseases",
care: "Full sun, moderate watering, cool temps 50-70°F.",
img: "",
},
{
slug: "onion",
name: "Onion",
sci: "Allium cepa",
fam: "Amaryllidaceae",
cat: "vegetable",
page: "List_of_onion_diseases",
care: "Full sun, consistent watering, cool to warm temps 55-75°F.",
img: "",
},
{
slug: "papaya",
name: "Papaya",
sci: "Carica papaya",
fam: "Caricaceae",
cat: "fruit",
page: "List_of_papaya_diseases",
care: "Full sun, consistent watering, warm temps 70-90°F.",
img: "",
},
{
slug: "peach",
name: "Peach",
sci: "Prunus persica",
fam: "Rosaceae",
cat: "tree",
page: "List_of_peach_diseases",
care: "Full sun, consistent watering, well-drained sandy loam pH 6.0-7.0.",
img: "",
},
{
slug: "peanut",
name: "Peanut (Groundnut)",
sci: "Arachis hypogaea",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_peanut_diseases",
care: "Full sun, moderate watering, warm temps 75-95°F.",
img: "",
},
{
slug: "pear",
name: "Pear",
sci: "Pyrus communis",
fam: "Rosaceae",
cat: "tree",
page: "List_of_pear_diseases",
care: "Full sun, consistent watering, well-drained loam pH 6.0-7.0.",
img: "",
},
// NOTE(review): this entry reuses the tomato list page, so every row scraped
// from List_of_tomato_diseases is also recorded under "pepper" — confirm that
// this is intended rather than a copy/paste of the tomato entry.
{
slug: "pepper",
name: "Bell Pepper",
sci: "Capsicum annuum",
fam: "Solanaceae",
cat: "vegetable",
page: "List_of_tomato_diseases",
care: "Full sun, consistent watering, warm soil 70-80°F.",
img: "",
},
{
slug: "pineapple",
name: "Pineapple",
sci: "Ananas comosus",
fam: "Bromeliaceae",
cat: "fruit",
page: "List_of_pineapple_diseases",
care: "Full sun, moderate watering, warm temps 65-95°F.",
img: "",
},
{
slug: "raspberry",
name: "Raspberry",
sci: "Rubus idaeus",
fam: "Rosaceae",
cat: "fruit",
page: "List_of_raspberry_diseases",
care: "Full sun, consistent watering, slightly acidic soil pH 5.5-6.5.",
img: "",
},
{
slug: "rice",
name: "Rice",
sci: "Oryza sativa",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_rice_diseases",
care: "Full sun, flooded field conditions, warm temps 70-95°F.",
img: "",
},
{
slug: "rose",
name: "Rose",
sci: "Rosa spp.",
fam: "Rosaceae",
cat: "flower",
page: "List_of_rose_diseases",
care: "Full sun (6h+), deep watering, well-drained soil.",
img: "",
},
{
slug: "sorghum",
name: "Sorghum",
sci: "Sorghum bicolor",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_sorghum_diseases",
care: "Full sun, moderate watering, warm temps 75-95°F.",
img: "",
},
{
slug: "soybean",
name: "Soybean",
sci: "Glycine max",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_soybean_diseases",
care: "Full sun, moderate watering, warm temps 60-85°F.",
img: "",
},
{
slug: "spinach",
name: "Spinach",
sci: "Spinacia oleracea",
fam: "Amaranthaceae",
cat: "vegetable",
page: "List_of_spinach_diseases",
care: "Partial shade to full sun, consistent moisture, cool temps 50-70°F.",
img: "",
},
{
slug: "strawberry",
name: "Strawberry",
sci: "Fragaria × ananassa",
fam: "Rosaceae",
cat: "fruit",
page: "List_of_strawberry_diseases",
care: "Full sun, consistent watering, acidic soil pH 5.5-6.5.",
img: "",
},
{
slug: "sugarcane",
name: "Sugarcane",
sci: "Saccharum officinarum",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_sugarcane_diseases",
care: "Full sun, heavy watering, warm temps 75-95°F.",
img: "",
},
{
slug: "sunflower",
name: "Sunflower",
sci: "Helianthus annuus",
fam: "Asteraceae",
cat: "flower",
page: "List_of_sunflower_diseases",
care: "Full sun (6-8h+), moderate watering, warm temps 70-78°F.",
img: "",
},
{
slug: "sweet-potato",
name: "Sweet Potato",
sci: "Ipomoea batatas",
fam: "Convolvulaceae",
cat: "vegetable",
page: "List_of_sweet_potato_diseases",
care: "Full sun, moderate watering, warm temps 65-95°F.",
img: "",
},
{
slug: "tobacco",
name: "Tobacco",
sci: "Nicotiana tabacum",
fam: "Solanaceae",
cat: "vegetable",
page: "List_of_tobacco_diseases",
care: "Full sun, moderate watering, warm temps 65-85°F.",
img: "",
},
{
slug: "watermelon",
name: "Watermelon",
sci: "Citrullus lanatus",
fam: "Cucurbitaceae",
cat: "vegetable",
page: "List_of_cucurbit_diseases",
care: "Full sun, consistent watering, warm temps 75-85°F.",
img: "",
},
{
slug: "wheat",
name: "Wheat",
sci: "Triticum aestivum",
fam: "Poaceae",
cat: "vegetable",
page: "List_of_wheat_diseases",
care: "Full sun, moderate watering, cool to warm temps 55-75°F.",
img: "",
},
{
slug: "alfalfa",
name: "Alfalfa",
sci: "Medicago sativa",
fam: "Fabaceae",
cat: "herb",
page: "List_of_alfalfa_diseases",
care: "Full sun, drought tolerant, deep well-drained soil pH 6.5-7.5.",
img: "",
},
{
slug: "asparagus",
name: "Asparagus",
sci: "Asparagus officinalis",
fam: "Asparagaceae",
cat: "vegetable",
page: "List_of_asparagus_diseases",
care: "Full sun, consistent watering, well-drained sandy soil pH 6.5-7.5.",
img: "",
},
{
slug: "celery",
name: "Celery",
sci: "Apium graveolens",
fam: "Apiaceae",
cat: "vegetable",
page: "List_of_celery_diseases",
care: "Full sun, consistent moisture, cool temps 55-70°F.",
img: "",
},
{
slug: "chickpea",
name: "Chickpea",
sci: "Cicer arietinum",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_legume_diseases",
care: "Full sun, drought tolerant, warm temps 65-85°F.",
img: "",
},
{
slug: "clover",
name: "Clover",
sci: "Trifolium repens",
fam: "Fabaceae",
cat: "herb",
page: "List_of_clover_diseases",
care: "Full sun to partial shade, moderate watering, cool temps.",
img: "",
},
{
slug: "cowpea",
name: "Cowpea",
sci: "Vigna unguiculata",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_legume_diseases",
care: "Full sun, drought tolerant, warm temps 65-95°F.",
img: "",
},
{
slug: "faba-bean",
name: "Faba Bean",
sci: "Vicia faba",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_legume_diseases",
care: "Full sun, consistent watering, cool temps 55-70°F.",
img: "",
},
{
slug: "lentil",
name: "Lentil",
sci: "Lens culinaris",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_legume_diseases",
care: "Full sun, drought tolerant, cool temps 50-80°F.",
img: "",
},
{
slug: "pigeon-pea",
name: "Pigeon Pea",
sci: "Cajanus cajan",
fam: "Fabaceae",
cat: "vegetable",
page: "List_of_legume_diseases",
care: "Full sun, drought tolerant, warm tropical temps.",
img: "",
},
{
slug: "tea",
name: "Tea (Camellia sinensis)",
sci: "Camellia sinensis",
fam: "Theaceae",
cat: "tree",
page: "List_of_tea_diseases",
care: "Partial shade, consistent moisture, acidic soil pH 4.5-6.0.",
img: "",
},
{
slug: "turfgrass",
name: "Turfgrass (Lawn)",
sci: "Multiple Poaceae spp.",
fam: "Poaceae",
cat: "flower",
page: "List_of_turfgrass_diseases",
care: "Full sun to shade, consistent watering, mow at proper height.",
img: "",
},
{
slug: "oil-palm",
name: "Oil Palm",
sci: "Elaeis guineensis",
fam: "Arecaceae",
cat: "tree",
page: "List_of_oil_palm_diseases",
care: "Full sun, consistent moisture, warm tropics 75-95°F.",
img: "",
},
];
// ─── Main ────────────────────────────────────────────────────────────────────
/**
 * Run the whole scrape: fetch every distinct Wikipedia page named in SOURCES,
 * upsert the plants, build one disease entry per scraped row and plant,
 * link look-alikes, bulk-insert the diseases, and record the scrape.
 *
 * A page that fails to scrape is logged and skipped; its plants are still
 * inserted but get no diseases. Database errors are not caught here and
 * reject the returned promise.
 *
 * Reads DATABASE_URL (required) and DATABASE_TOKEN from the environment for
 * the raw batch client.
 */
async function main() {
  console.log("🌿 Wikipedia Plant Disease Scraper\n");
  const db = getDb();
  let totalPlants = 0;
  const pageCache = new Map<string, TableData[]>(); // page → tables

  // Collect unique pages with their sources
  const pageToSources = new Map<string, Src[]>();
  for (const src of SOURCES) {
    const list = pageToSources.get(src.page) ?? [];
    list.push(src);
    pageToSources.set(src.page, list);
  }
  console.log(`🌱 ${SOURCES.length} plant entries, ${pageToSources.size} unique Wikipedia pages\n`);

  // Step 1: Scrape each unique page once
  for (const [page, srcList] of pageToSources) {
    const plantsForPage = srcList.map((s) => s.name).join(", ");
    // Separator keeps the page title and the plant list from running together.
    console.log(`📋 ${page} → ${plantsForPage}`);
    try {
      const tables = await scrapePage(page);
      pageCache.set(page, tables);
      const totalRows = tables.reduce((sum, t) => sum + t.rows.length, 0);
      console.log(`${tables.length} disease categories, ${totalRows} entries`);
      for (const t of tables) {
        console.log(` ${t.type}: ${t.rows.length} diseases`);
      }
    } catch (err) {
      // Best effort: a failed page leaves its plants without diseases.
      console.error(`${err instanceof Error ? err.message : err}`);
    }
  }

  // Step 2: Build all disease entries per plant
  interface DiseaseEntry {
    id: string;
    plantId: string;
    name: string;
    scientificName: string;
    causalAgentType: CausalAgentType;
    description: string;
    symptoms: string[];
    causes: string[];
    treatment: string[];
    prevention: string[];
    lookalikeIds: string[];
    severity: Severity;
    sourceUrl: string;
  }
  const allDiseases: DiseaseEntry[] = [];
  const insertedPlants = new Set<string>();
  const seenDiseaseIds = new Set<string>();
  for (const src of SOURCES) {
    // Insert plant if not already
    if (!insertedPlants.has(src.slug)) {
      insertedPlants.add(src.slug);
      totalPlants++;
      await db
        .insert(plants)
        .values({
          id: src.slug,
          commonName: src.name,
          scientificName: src.sci,
          family: src.fam,
          category: src.cat,
          careSummary: src.care,
          imageUrl: src.img,
        })
        .onConflictDoNothing();
    }

    // Get cached tables for this page
    const tables = pageCache.get(src.page);
    if (!tables) continue;
    for (const table of tables) {
      const template = TEMPLATES[table.type];
      for (const row of table.rows) {
        const nameSlug = slugify(row.name);
        // A name with no sluggable characters would produce the id "<plant>-".
        if (!nameSlug) continue;
        const diseaseId = `${src.slug}-${nameSlug}`;
        // The insert ignores repeated ids anyway; dropping them here keeps the
        // reported counts and the look-alike lists free of duplicates.
        if (seenDiseaseIds.has(diseaseId)) continue;
        seenDiseaseIds.add(diseaseId);
        allDiseases.push({
          id: diseaseId,
          plantId: src.slug,
          name: row.name,
          scientificName: row.sci,
          causalAgentType: table.type,
          description: makeDesc(row.name, row.sci, src.name, table.type),
          symptoms: template.symptoms,
          causes: template.causes,
          treatment: template.treatment,
          prevention: template.prevention,
          lookalikeIds: [],
          severity: template.severity,
          sourceUrl: `https://en.wikipedia.org/wiki/${src.page}`,
        });
      }
    }
  }

  // Step 3: Link lookalikes (same plant, same type)
  const byPlant = new Map<string, DiseaseEntry[]>();
  for (const d of allDiseases) {
    const list = byPlant.get(d.plantId) ?? [];
    list.push(d);
    byPlant.set(d.plantId, list);
  }
  for (const plantDiseases of byPlant.values()) {
    for (const d of plantDiseases) {
      // Low-severity entries get no look-alikes.
      if (d.severity === "low") continue;
      d.lookalikeIds = plantDiseases
        .filter((o) => o.causalAgentType === d.causalAgentType && o.id !== d.id)
        .slice(0, 3)
        .map((o) => o.id);
    }
  }
  console.log(
    `\n📊 Total: ${allDiseases.length} disease entries across ${totalPlants} plants`,
  );

  // Step 4: Bulk insert into Turso using raw SQL batches
  console.log("\n💾 Inserting into Turso via batch...");
  const BATCH_SIZE = 100;
  let inserted = 0;
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) throw new Error("DATABASE_URL is not set");
  // Use the raw libsql client for batch operations
  const { createClient } = await import("@libsql/client");
  const rawClient = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });
  try {
    for (let i = 0; i < allDiseases.length; i += BATCH_SIZE) {
      const chunk = allDiseases.slice(i, i + BATCH_SIZE);
      const stmts = chunk.map((d) => ({
        sql: `INSERT OR IGNORE INTO diseases (id, plant_id, name, scientific_name, causal_agent_type, description, symptoms, causes, treatment, prevention, lookalike_ids, severity, source_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
        args: [
          d.id,
          d.plantId,
          d.name,
          d.scientificName,
          d.causalAgentType,
          d.description,
          JSON.stringify(d.symptoms),
          JSON.stringify(d.causes),
          JSON.stringify(d.treatment),
          JSON.stringify(d.prevention),
          JSON.stringify(d.lookalikeIds),
          d.severity,
          d.sourceUrl,
        ],
      }));
      await rawClient.batch(stmts, "write");
      inserted += chunk.length;
      process.stdout.write(` ${inserted}/${allDiseases.length}\n`);
    }
  } finally {
    // Release the connection even when a batch fails.
    rawClient.close();
  }

  // Log scrape
  const scrapedAt = new Date().toISOString();
  await db
    .insert(scrapeSources)
    .values({
      id: "wikipedia-scrape",
      sourceType: "wikipedia",
      sourceUrl: "https://en.wikipedia.org/wiki/Category:Plant_pathogens_and_diseases",
      entriesCount: allDiseases.length,
      status: "success",
      lastScrapedAt: scrapedAt,
    })
    .onConflictDoUpdate({
      target: scrapeSources.id,
      set: {
        entriesCount: allDiseases.length,
        status: "success" as const,
        lastScrapedAt: scrapedAt,
      },
    });

  // Stats
  const [pc] = await db.select({ c: sql<number>`COUNT(*)` }).from(plants);
  const [dc] = await db.select({ c: sql<number>`COUNT(*)` }).from(diseases);
  console.log(`\n✅ Done! Database: ${pc.c} plants, ${dc.c} diseases`);
  closeDb();
}
// Entry point: run the scraper; on any unhandled failure print it and exit
// with status 1.
main().catch((failure) => {
  console.error("❌", failure);
  process.exit(1);
});