1141 lines
35 KiB
JavaScript
1141 lines
35 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
 * Wikipedia Plant Disease Scraper
 *
 * Fetches disease data from Wikipedia "List of X diseases" pages via
 * the MediaWiki API, parses wikitext tables, and stores in Turso.
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-wikipedia.ts
 */
|
||
|
||
import "dotenv/config";
|
||
import { sql } from "drizzle-orm";
|
||
import { getDb, closeDb } from "../src/lib/db/index";
|
||
import { plants, diseases, scrapeSources } from "../src/lib/db/schema";
|
||
import type { CausalAgentType, Severity } from "../src/lib/types";
|
||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||
import { resolve, dirname } from "path";
|
||
import { fileURLToPath } from "url";
|
||
|
||
// ─── Paths ───────────────────────────────────────────────────────────────────
|
||
|
||
// Directory that contains this script, derived from the module URL (ES modules
// have no built-in __dirname). Used as the base for the on-disk scraper cache.
const __filedir = dirname(fileURLToPath(import.meta.url));
|
||
|
||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Convert a display name into a lowercase, hyphen-separated ASCII slug.
 *
 * Accented letters are folded to their base letter ("é" → "e") instead of
 * being dropped; every other character outside `a-z`, `0-9`, whitespace and
 * `-` is removed. Runs of whitespace/hyphens collapse to a single hyphen and
 * leading/trailing hyphens are stripped.
 *
 * @param s - Arbitrary human-readable text (e.g. a disease name).
 * @returns The slug; an empty string when nothing sluggable remains.
 */
function slugify(s: string): string {
  return s
    .normalize("NFKD") // split accented letters into base letter + combining mark
    .replace(/[\u0300-\u036f]/g, "") // drop the combining marks
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
}
|
||
|
||
/**
 * Strip wikitext/HTML markup from a table cell and return plain text.
 *
 * Handles, in order: piped and plain wiki links, bold/italic quote runs,
 * `<ref>` footnotes (self-closing and paired, including multi-line bodies),
 * `<br>` tags, the `&nbsp;` / `&amp;` entities, and `{{templates}}` (one
 * level deep). Whitespace is collapsed and trimmed at the end.
 *
 * @param t - Raw wikitext of a single cell.
 * @returns Human-readable text with markup removed.
 */
function clean(t: string): string {
  return (
    t
      .replace(/\[\[[^\]]*?\|([^\]]*)\]\]/g, "$1") // [[target|label]] → label
      .replace(/\[\[([^\]]*)\]\]/g, "$1") // [[target]] → target
      .replace(/'''?/g, "") // bold / italic markers
      // Self-closing refs must go first; otherwise the paired pattern below
      // would treat them as an opening tag and eat text up to a later </ref>.
      .replace(/<ref\b[^>]*\/>/gi, "")
      .replace(/<ref\b[^>]*>[\s\S]*?<\/ref>/gi, "")
      .replace(/<br\s*\/?>/gi, " ")
      .replace(/&nbsp;|\u00a0/g, " ")
      .replace(/&amp;/g, "&") // decoded last so "&amp;nbsp;" is not double-decoded
      .replace(/{{[^}]*}}/g, "")
      .replace(/\s{2,}/g, " ")
      .trim()
  );
}
|
||
|
||
// ─── Cache ───────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Read a cached value from `<script dir>/.scraper-cache/<encoded key>.json`.
 *
 * @param k - Cache key; URI-encoded to form the file name.
 * @returns The file contents, or `null` when no cache file exists for `k`.
 * @throws Any read error other than "file not found" (e.g. permissions).
 */
function cacheGet(k: string): string | null {
  const p = resolve(__filedir, ".scraper-cache", encodeURIComponent(k) + ".json");
  try {
    // Read directly instead of exists-then-read so a file vanishing between
    // the two calls cannot crash the scraper.
    return readFileSync(p, "utf-8");
  } catch (err) {
    const notFound =
      typeof err === "object" && err !== null && (err as { code?: unknown }).code === "ENOENT";
    if (notFound) return null;
    throw err;
  }
}
|
||
/**
 * Write a value to `<script dir>/.scraper-cache/<encoded key>.json`,
 * creating the cache directory on first use.
 *
 * @param k - Cache key; URI-encoded to form the file name.
 * @param v - Text to store (written as UTF-8, overwriting any previous entry).
 */
function cacheSet(k: string, v: string): void {
  const d = resolve(__filedir, ".scraper-cache");
  if (!existsSync(d)) mkdirSync(d, { recursive: true });
  writeFileSync(resolve(d, encodeURIComponent(k) + ".json"), v, "utf-8");
}
|
||
|
||
// ─── Wikipedia API ───────────────────────────────────────────────────────────
|
||
|
||
let lastFetchTime = 0;
const MIN_DELAY_MS = 600; // Wait at least 600ms between requests
const RATE_LIMIT_BACKOFF_MS = 5000; // Pause before the single retry after an HTTP 429
const WIKI_USER_AGENT = "PlantDiseaseKB/1.0 (research)";

/** Resolve after `ms` milliseconds. */
function pause(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/**
 * Fetch the raw wikitext of an English-Wikipedia page through the MediaWiki
 * `action=parse` API, using the on-disk cache when possible.
 *
 * Requests are spaced at least `MIN_DELAY_MS` apart. An HTTP 429 response is
 * retried exactly once after `RATE_LIMIT_BACKOFF_MS`. Redirect pages are
 * resolved server-side (`redirects=1`) so the caller receives the target
 * page's wikitext rather than a `#REDIRECT` stub.
 *
 * @param page - Page title as used in the URL (e.g. `List_of_tomato_diseases`).
 * @returns The page's wikitext.
 * @throws Error on a non-OK HTTP status, an API-level error, a missing page,
 *         or a response without wikitext.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  if (cached) return cached; // an empty cache entry is treated as a miss

  // Rate limiting
  const wait = Math.max(0, MIN_DELAY_MS - (Date.now() - lastFetchTime));
  if (wait > 0) await pause(wait);
  lastFetchTime = Date.now();

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&redirects=1&format=json&formatversion=2`;
  const request = () => fetch(url, { headers: { "User-Agent": WIKI_USER_AGENT } });

  let res = await request();
  let retried = false;
  if (res.status === 429) {
    // Rate limited — wait longer and retry once
    console.log(` ⏳ Rate limited, waiting ${RATE_LIMIT_BACKOFF_MS / 1000}s...`);
    await pause(RATE_LIMIT_BACKOFF_MS);
    lastFetchTime = Date.now();
    res = await request();
    retried = true;
  }
  if (!res.ok) {
    throw new Error(`HTTP ${res.status} for ${page}${retried ? " (after retry)" : ""}`);
  }

  // The API reports most failures with HTTP 200 and an `error` object.
  const data = (await res.json()) as {
    parse?: { wikitext?: string };
    error?: { info?: string };
  };
  if (data.error) throw new Error(`API error: ${data.error.info || JSON.stringify(data.error)}`);
  if (!data.parse) throw new Error(`Page "${page}" not found`);

  const wt = data.parse.wikitext;
  if (typeof wt !== "string") throw new Error(`No wikitext returned for "${page}"`);

  cacheSet(key, wt);
  return wt;
}
|
||
|
||
// ─── Section → type ─────────────────────────────────────────────────────────
|
||
|
||
/**
 * Ordered heading-pattern → causal-agent mapping. The first matching rule
 * wins, so a heading such as "Bacterial and fungal diseases" is classified
 * by the earlier (bacterial) rule.
 */
const SECTION_RULES: [RegExp, CausalAgentType][] = [
  [/bacteri/i, "bacterial"],
  [/phytoplasma/i, "bacterial"],
  [/fungus|fungal|fungi/i, "fungal"],
  [/oomycete/i, "fungal"],
  // "virus" is required as well: headings like "Virus diseases" contain
  // neither "viral" nor "viroid".
  [/virus|viral|viroid/i, "viral"],
  [/nematode/i, "environmental"],
  [
    /miscellaneous|disorder|abiotic|nutrient|physiological|insect|pest|lepidoptera|mite|parasitic/i,
    "environmental",
  ],
];

/**
 * Classify a section heading or table caption by causal agent.
 *
 * @param name - Heading/caption text (matched case-insensitively).
 * @returns The type of the first matching rule, or `null` when none match.
 */
function sectionType(name: string): CausalAgentType | null {
  for (const [pattern, type] of SECTION_RULES) {
    if (pattern.test(name)) return type;
  }
  return null;
}
|
||
|
||
// ─── Wikitable parser ────────────────────────────────────────────────────────
|
||
|
||
/** One disease row extracted from a wikitable. */
interface Row {
  /** Common name of the disease (first cell, markup removed). */
  name: string;
  /** Scientific name / causal organism; empty string when none was found. */
  sci: string;
}

/**
 * Parse the wikitext of a single `{| … |}` table into disease rows.
 *
 * Cells are collected between row separators and handed to `mkRow`, which
 * decides whether they form a usable row. Content before the first row
 * separator (table attributes, caption) is ignored.
 *
 * @param table - Wikitext of one table, including its `{|` and `|}` markers.
 * @returns The rows `mkRow` accepted, in document order.
 */
function parseRows(table: string): Row[] {
  const out: Row[] = [];
  let cells: string[] = [];
  let inRow = false;

  // Turn the cells collected so far into a Row (if they form one) and reset.
  const flush = (): void => {
    if (cells.length) {
      const row = mkRow(cells);
      if (row) out.push(row);
    }
    cells = [];
  };

  for (const raw of table.split("\n")) {
    const line = raw.trim();

    // End of table: must not be mistaken for a cell containing "}".
    if (line.startsWith("|}")) break;

    // Row separator; may carry attributes, e.g. `|- style="..."`.
    if (line.startsWith("|-")) {
      flush();
      inRow = true;
      continue;
    }

    if (!inRow || !line || line.startsWith("|+")) continue;

    if (line.startsWith("|") || line.startsWith("!")) {
      // Several cells may share a line: `||` for data cells, and additionally
      // `!!` on header lines.
      const separator = line.startsWith("!") ? /\|\||!!/ : "||";
      for (const part of line.split(separator)) {
        cells.push(part.replace(/^[|!]+/, "").trim());
      }
    } else if (cells.length) {
      // Continuation line: append to the most recent cell.
      cells.push(`${cells.pop() ?? ""} ${line}`);
    }
  }

  flush();
  return out;
}
|
||
|
||
/**
 * Build a `Row` from the raw cells of one table row.
 *
 * The first cell is the disease name. Header rows are rejected: a name that
 * is exactly "Common"/"Scientific", starts with "Common name"/"Scientific
 * name", or starts with a `colspan` attribute. Real disease names that merely
 * begin with "Common" (e.g. "Common scab") are kept.
 *
 * The scientific name is the first later cell whose cleaned text is longer
 * than two characters and does not start with an apostrophe.
 *
 * @param c - Raw wikitext cells of the row, in column order.
 * @returns The row, or `null` when the cells are empty or form a header.
 */
function mkRow(c: string[]): Row | null {
  const name = clean(c[0] || "");
  const isHeader = /^(?:(?:common|scientific)(?:$|\s+names?\b)|colspan)/i.test(name);
  if (!name || isHeader) return null;

  // Find first usable cell after the name
  const sci =
    c
      .slice(1)
      .map((cell) => clean(cell))
      .find((text) => text.length > 2 && !text.startsWith("'")) ?? "";

  return { name, sci };
}
|
||
|
||
// ─── Fetch & parse one page ──────────────────────────────────────────────────
|
||
|
||
/** All disease rows of one table together with the causal-agent type assigned to it. */
interface TableData {
  type: CausalAgentType;
  rows: Row[];
}

/**
 * Fetch one Wikipedia list page and extract its classified disease tables.
 *
 * Two strategies are combined:
 *  1. Tables inside a section whose heading classifies via `sectionType`.
 *     Every table in the section is parsed, and a sub-heading that does not
 *     classify on its own inherits the type of its enclosing section.
 *  2. Tables anywhere on the page whose `|+` caption classifies.
 *
 * A table is recorded once per (type, ordered row names) combination, so a
 * table found by both strategies is not duplicated.
 *
 * @param page - Page title as used in the URL.
 * @returns Classified tables in discovery order (may be empty).
 * @throws Whatever `fetchWT` throws for the page.
 */
async function scrapePage(page: string): Promise<TableData[]> {
  const wt = await fetchWT(page);
  const tables: TableData[] = [];
  const seenKeys = new Set<string>();
  const TABLE_RE = /\{\|[\s\S]*?\|\}/g;

  // Parse one table block and record it unless it is empty or already seen.
  const addTable = (type: CausalAgentType, block: string): void => {
    const rows = parseRows(block);
    if (!rows.length) return;
    const key = type + "|" + rows.map((r) => r.name).join(",");
    if (seenKeys.has(key)) return;
    seenKeys.add(key);
    tables.push({ type, rows });
  };

  // Strategy 1: section headers with embedded wikitable(s)
  // `open` is the chain of currently enclosing headings, outermost first.
  const open: { level: number; type: CausalAgentType | null }[] = [];
  for (const part of wt.split(/\n(?===)/)) {
    const h = part.match(/^(={2,})([^=\n]+)\1/);
    if (!h) continue;
    const [, marks = "", title = ""] = h;
    const level = marks.length;

    // Close headings at the same or a deeper level than this one.
    while (open.length && (open[open.length - 1]?.level ?? 0) >= level) open.pop();
    const type = sectionType(title) ?? open[open.length - 1]?.type ?? null;
    open.push({ level, type });
    if (!type) continue;

    for (const m of part.matchAll(TABLE_RE)) addTable(type, m[0]);
  }

  // Strategy 2: tables with |+ caption (no section headers)
  for (const m of wt.matchAll(TABLE_RE)) {
    const cap = m[0].match(/^\|\+(.+)/m);
    if (!cap) continue;
    const type = sectionType(cap[1] ?? "");
    if (type) addTable(type, m[0]);
  }

  return tables;
}
|
||
|
||
// ─── Disease templates (sourced from UW-Madison PDDC factsheets) ───────────
|
||
|
||
/**
 * Generic symptom / cause / treatment / prevention text and a default
 * severity for each causal-agent type. The same arrays are attached to every
 * scraped disease of that type, so they must be treated as read-only.
 */
const TEMPLATES: Record<
  CausalAgentType,
  {
    symptoms: string[];
    causes: string[];
    treatment: string[];
    prevention: string[];
    severity: Severity;
  }
> = {
  fungal: {
    severity: "moderate",
    symptoms: [
      "Leaf spots or lesions with concentric rings or characteristic fungal growth",
      "Yellowing and browning of infected plant tissue starting from lower leaves",
      "Wilting, stunting, or dieback of infected plants under favorable conditions",
      "Premature defoliation in moderate to severe cases",
      "Reduced yield, fruit rot, or poor fruit quality on affected plants",
    ],
    causes: [
      "Fungal pathogens surviving in soil, plant debris, or on infected seed material",
      "Warm humid conditions (60-85°F) with extended leaf wetness periods",
      "Spores spread by wind, rain splash, insects, or contaminated tools and hands",
      "Dense plantings with poor air circulation and frequent overhead irrigation",
    ],
    treatment: [
      "Remove and destroy all infected plant material — do not compost",
      "Apply appropriate fungicide (copper, sulfur, chlorothalonil) as directed on label",
      "Improve air circulation through proper plant spacing, pruning, and staking",
      "Water at soil level using drip irrigation or soaker hoses to keep foliage dry",
      "Apply 2-3 inches of organic mulch to reduce soil splash onto lower leaves",
    ],
    prevention: [
      "Plant resistant varieties when available",
      "Practice 2-3 year crop rotation with non-host plant families",
      "Space plants adequately for good air movement",
      "Avoid overhead watering; water early in the day",
      "Remove and dispose of all plant debris at end of growing season",
    ],
  },
  bacterial: {
    severity: "high",
    symptoms: [
      "Water-soaked lesions on leaves, stems, and fruit that turn brown or black",
      "Wilting of branches or entire plant despite adequate soil moisture",
      "Vascular discoloration visible when stems are cut crosswise near soil line",
      "Bacterial ooze or exudate from cut stems or infected tissue in humid weather",
      "Cankers on stems with associated gumming or branch dieback",
    ],
    causes: [
      "Bacterial pathogens entering through wounds, stomata, or other natural openings",
      "Spread by rain splash, irrigation water, insects, and contaminated pruning tools",
      "Warm humid conditions (75-90°F) favor rapid bacterial multiplication",
      "Bacteria survive in infected plant debris, soil, and on seed surfaces between seasons",
    ],
    treatment: [
      "Remove and destroy infected plants immediately — bag and remove from garden",
      "Prune infected branches at least 12 inches below visible symptoms",
      "Sterilize all pruning tools with 10% bleach or 70% alcohol between every cut",
      "No chemical cure exists once plants are infected; copper may slow early infections",
      "Disinfect hands, gloves, and clothing after handling infected plant material",
    ],
    prevention: [
      "Use certified disease-free seed and pathogen-free transplants",
      "Practice long crop rotation (3-5 years) with unrelated crop families",
      "Avoid overhead irrigation; use drip irrigation or soaker hoses instead",
      "Control insect vectors (cucumber beetles, flea beetles) that spread bacteria",
      "Sanitize garden tools, stakes, and cages regularly",
    ],
  },
  viral: {
    severity: "high",
    symptoms: [
      "Mottled mosaic pattern of light and dark green patches on leaf surfaces",
      "Leaf distortion, curling, puckering, or unusual narrowing of leaf blades",
      "Yellowing along leaf veins (vein clearing) or interveinal chlorosis",
      "Reduced plant vigor, stunted growth, and poor fruit or flower set",
      "Discoloration, streaking, ringspots, or deformation on fruit and flowers",
    ],
    causes: [
      "Virus particles transmitted by insect vectors including aphids, thrips, and whiteflies",
      "Mechanical transmission through contaminated hands, pruning tools, or clothing",
      "Propagation from infected parent material (cuttings, tubers, bulbs, seeds)",
      "Virus overwintering in perennial weed hosts or wild reservoir plants near fields",
    ],
    treatment: [
      "No cure available — remove and destroy infected plants as soon as detected",
      "Decontaminate tools and work surfaces with 10% bleach or trisodium phosphate",
      "Wash hands thoroughly with soap and water after handling infected plants",
      "Control insect vectors using reflective mulches, row covers, and registered insecticides",
      "Remove weeds and alternate host plants that may harbor the virus",
    ],
    prevention: [
      "Purchase certified virus-free seed and transplants",
      "Use insect-proof floating row covers during early growth stages",
      "Isolate new plants for 2-3 weeks before introducing into the garden",
      "Remove and destroy infected plants promptly at first symptom appearance",
      "Rotate susceptible crops for 2-3 growing seasons",
    ],
  },
  environmental: {
    severity: "low",
    symptoms: [
      "Physiological symptoms resembling pathogen-caused disease without signs of infection",
      "Symptoms often appear uniformly across planting or follow a distinct pattern",
      "Tissue discoloration, necrosis, leaf margin scorch, or fruit deformation",
      "Symptoms correlate with recent weather events, irrigation changes, or chemical use",
      "No visible signs of fungal spores, bacterial ooze, or insect activity",
    ],
    causes: [
      "Environmental stress including drought, flooding, temperature extremes, or sunscald",
      "Nutrient deficiencies or toxicities in soil (calcium, boron, potassium, etc.)",
      "Poor soil conditions: compaction, pH imbalance, poor drainage, or salt buildup",
      "Chemical injury from pesticides, herbicides, fertilizers, or air pollutants",
    ],
    treatment: [
      "Identify and correct the underlying environmental or nutritional issue",
      "Test soil pH and nutrient levels; amend based on laboratory recommendations",
      "Establish and maintain a consistent watering schedule appropriate for the crop",
      "Provide shade, wind protection, or frost protection as needed for local conditions",
      "Adjust fertilizer program to address specific identified nutrient deficiencies",
    ],
    prevention: [
      "Test soil before planting and amend to recommended pH and nutrient levels",
      "Choose plant varieties well-suited to local climate and soil conditions",
      "Maintain consistent irrigation, especially during fruit development and hot weather",
      "Apply balanced fertilizer according to soil test recommendations",
      "Improve soil drainage with raised beds or incorporation of organic matter",
    ],
  },
};
|
||
|
||
/**
 * Build the generic one-paragraph description stored with a disease.
 *
 * @param name - Common name of the disease.
 * @param sci - Scientific name / causal organism; when empty the text falls
 *              back to "a plant pathogen".
 * @param plant - Display name of the host plant.
 * @param type - Causal-agent type inserted as an adjective ("fungal", …).
 * @returns The description text.
 */
function makeDesc(name: string, sci: string, plant: string, type: string): string {
  // "an environmental disease", but "a fungal disease".
  const article = /^[aeiou]/i.test(type) ? "an" : "a";
  return `${name} is ${article} ${type} disease affecting ${plant}. Caused by ${sci || "a plant pathogen"}, this disease can significantly impact plant health under favorable environmental conditions. Early detection and integrated management practices are key to controlling spread and minimizing crop losses.`;
}
|
||
|
||
// ─── Source definitions ──────────────────────────────────────────────────────
|
||
|
||
/** One plant to seed, together with the Wikipedia page its diseases are scraped from. */
interface Src {
  /** Stable identifier; used as the plant id and as the prefix of disease ids. */
  slug: string;
  /** Common (display) name. */
  name: string;
  /** Scientific name of the plant. */
  sci: string;
  /** Botanical family. */
  fam: string;
  /** Plant category (values used here: vegetable, fruit, tree, herb, flower). */
  cat: string;
  /** Wikipedia page title, underscore form. Several plants may share a page. */
  page: string;
  /** Short care summary. */
  care: string;
  /** Image URL; empty for every entry in this list. */
  img: string;
}

/** Plants to seed. Entries that share a `page` receive the same scraped disease list. */
const SOURCES: Src[] = [
  {
    slug: "tomato",
    name: "Tomato",
    sci: "Solanum lycopersicum",
    fam: "Solanaceae",
    cat: "vegetable",
    page: "List_of_tomato_diseases",
    care: "Full sun (6-8h), consistent watering, well-drained soil pH 6.0-6.8.",
    img: "",
  },
  {
    slug: "potato",
    name: "Potato",
    sci: "Solanum tuberosum",
    fam: "Solanaceae",
    cat: "vegetable",
    page: "List_of_potato_diseases",
    care: "Full sun (6-8h), consistent watering, cool temps, loose soil pH 5.0-6.5.",
    img: "",
  },
  {
    slug: "apple",
    name: "Apple",
    sci: "Malus domestica",
    fam: "Rosaceae",
    cat: "tree",
    page: "List_of_apple_diseases",
    care: "Full sun (8h+), deep watering weekly, well-drained soil pH 6.0-7.0.",
    img: "",
  },
  {
    slug: "apricot",
    name: "Apricot",
    sci: "Prunus armeniaca",
    fam: "Rosaceae",
    cat: "tree",
    page: "List_of_apricot_diseases",
    care: "Full sun (8h+), moderate watering, well-drained soil pH 6.5-7.5.",
    img: "",
  },
  {
    slug: "avocado",
    name: "Avocado",
    sci: "Persea americana",
    fam: "Lauraceae",
    cat: "tree",
    page: "List_of_avocado_diseases",
    care: "Full sun (6-8h), moderate watering, well-drained soil pH 5.5-7.0.",
    img: "",
  },
  {
    slug: "banana",
    name: "Banana",
    sci: "Musa acuminata",
    fam: "Musaceae",
    cat: "fruit",
    page: "List_of_banana_diseases",
    care: "Full sun (8h+), consistent watering, warm temps 75-90°F.",
    img: "",
  },
  {
    slug: "barley",
    name: "Barley",
    sci: "Hordeum vulgare",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_barley_diseases",
    care: "Full sun (8h+), moderate watering, cool temps 55-75°F.",
    img: "",
  },
  {
    slug: "bean",
    name: "Green Bean",
    sci: "Phaseolus vulgaris",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_legume_diseases",
    care: "Full sun (6-8h), moderate watering, warm temps 65-80°F.",
    img: "",
  },
  {
    slug: "blueberry",
    name: "Blueberry",
    sci: "Vaccinium corymbosum",
    fam: "Ericaceae",
    cat: "fruit",
    page: "List_of_blueberry_diseases",
    care: "Full sun, consistent moisture, acidic soil pH 4.5-5.5.",
    img: "",
  },
  {
    slug: "cabbage",
    name: "Cabbage",
    sci: "Brassica oleracea var. capitata",
    fam: "Brassicaceae",
    cat: "vegetable",
    page: "List_of_brassica_diseases",
    care: "Full sun, consistent watering, cool temps 50-85°F.",
    img: "",
  },
  {
    slug: "carrot",
    name: "Carrot",
    sci: "Daucus carota subsp. sativus",
    fam: "Apiaceae",
    cat: "vegetable",
    page: "List_of_carrot_diseases",
    care: "Full sun, consistent moisture, cool temps, loose sandy soil.",
    img: "",
  },
  {
    slug: "cherry",
    name: "Cherry",
    sci: "Prunus avium",
    fam: "Rosaceae",
    cat: "tree",
    page: "List_of_cherry_diseases",
    care: "Full sun, moderate watering, well-drained loam pH 6.0-7.0.",
    img: "",
  },
  {
    slug: "citrus",
    name: "Citrus (Orange)",
    sci: "Citrus × sinensis",
    fam: "Rutaceae",
    cat: "tree",
    page: "List_of_citrus_diseases",
    care: "Full sun, consistent watering, acidic soil pH 5.5-6.5.",
    img: "",
  },
  {
    slug: "cocoa",
    name: "Cocoa (Cacao)",
    sci: "Theobroma cacao",
    fam: "Malvaceae",
    cat: "tree",
    page: "List_of_cocoa_diseases",
    care: "Partial shade, consistent rainfall, warm tropics 65-90°F.",
    img: "",
  },
  {
    slug: "coconut",
    name: "Coconut",
    sci: "Cocos nucifera",
    fam: "Arecaceae",
    cat: "tree",
    page: "List_of_coconut_palm_diseases",
    care: "Full sun, moderate watering, warm temps 70-95°F.",
    img: "",
  },
  {
    slug: "coffee",
    name: "Coffee",
    sci: "Coffea arabica",
    fam: "Rubiaceae",
    cat: "tree",
    page: "List_of_coffee_diseases",
    care: "Partial shade, consistent rainfall, moderate temps 60-70°F.",
    img: "",
  },
  {
    slug: "corn",
    name: "Corn (Maize)",
    sci: "Zea mays",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_maize_diseases",
    care: "Full sun, consistent watering, warm temps 65-85°F.",
    img: "",
  },
  {
    slug: "cranberry",
    name: "Cranberry",
    sci: "Vaccinium macrocarpon",
    fam: "Ericaceae",
    cat: "fruit",
    page: "List_of_cranberry_diseases",
    care: "Full sun, constant moisture, acidic soil pH 4.5-5.5.",
    img: "",
  },
  {
    slug: "cucumber",
    name: "Cucumber",
    sci: "Cucumis sativus",
    fam: "Cucurbitaceae",
    cat: "vegetable",
    page: "List_of_cucurbit_diseases",
    care: "Full sun, consistent watering, warm temps 70-95°F.",
    img: "",
  },
  {
    slug: "grape",
    name: "Grape",
    sci: "Vitis vinifera",
    fam: "Vitaceae",
    cat: "fruit",
    page: "List_of_grape_diseases",
    care: "Full sun, moderate watering, well-drained soil pH 5.5-7.0.",
    img: "",
  },
  {
    slug: "hops",
    name: "Hops",
    sci: "Humulus lupulus",
    fam: "Cannabaceae",
    cat: "herb",
    page: "List_of_hops_diseases",
    care: "Full sun, consistent watering, well-drained soil pH 6.0-7.0.",
    img: "",
  },
  {
    slug: "lettuce",
    name: "Lettuce",
    sci: "Lactuca sativa",
    fam: "Asteraceae",
    cat: "vegetable",
    page: "List_of_lettuce_diseases",
    care: "Partial shade to full sun, consistent moisture, cool temps 55-75°F.",
    img: "",
  },
  {
    slug: "mango",
    name: "Mango",
    sci: "Mangifera indica",
    fam: "Anacardiaceae",
    cat: "tree",
    page: "List_of_mango_diseases",
    care: "Full sun, moderate watering, warm temps 70-100°F.",
    img: "",
  },
  {
    slug: "oats",
    name: "Oats",
    sci: "Avena sativa",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_oats_diseases",
    care: "Full sun, moderate watering, cool temps 50-70°F.",
    img: "",
  },
  {
    slug: "onion",
    name: "Onion",
    sci: "Allium cepa",
    fam: "Amaryllidaceae",
    cat: "vegetable",
    page: "List_of_onion_diseases",
    care: "Full sun, consistent watering, cool to warm temps 55-75°F.",
    img: "",
  },
  {
    slug: "papaya",
    name: "Papaya",
    sci: "Carica papaya",
    fam: "Caricaceae",
    cat: "fruit",
    page: "List_of_papaya_diseases",
    care: "Full sun, consistent watering, warm temps 70-90°F.",
    img: "",
  },
  {
    slug: "peach",
    name: "Peach",
    sci: "Prunus persica",
    fam: "Rosaceae",
    cat: "tree",
    page: "List_of_peach_diseases",
    care: "Full sun, consistent watering, well-drained sandy loam pH 6.0-7.0.",
    img: "",
  },
  {
    slug: "peanut",
    name: "Peanut (Groundnut)",
    sci: "Arachis hypogaea",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_peanut_diseases",
    care: "Full sun, moderate watering, warm temps 75-95°F.",
    img: "",
  },
  {
    slug: "pear",
    name: "Pear",
    sci: "Pyrus communis",
    fam: "Rosaceae",
    cat: "tree",
    page: "List_of_pear_diseases",
    care: "Full sun, consistent watering, well-drained loam pH 6.0-7.0.",
    img: "",
  },
  {
    slug: "pepper",
    name: "Bell Pepper",
    sci: "Capsicum annuum",
    fam: "Solanaceae",
    cat: "vegetable",
    // NOTE(review): this reuses the tomato list, so tomato diseases are stored
    // under pepper. Confirm this is intended; a dedicated Capsicum list may
    // exist on Wikipedia.
    page: "List_of_tomato_diseases",
    care: "Full sun, consistent watering, warm soil 70-80°F.",
    img: "",
  },
  {
    slug: "pineapple",
    name: "Pineapple",
    sci: "Ananas comosus",
    fam: "Bromeliaceae",
    cat: "fruit",
    page: "List_of_pineapple_diseases",
    care: "Full sun, moderate watering, warm temps 65-95°F.",
    img: "",
  },
  {
    slug: "raspberry",
    name: "Raspberry",
    sci: "Rubus idaeus",
    fam: "Rosaceae",
    cat: "fruit",
    page: "List_of_raspberry_diseases",
    care: "Full sun, consistent watering, slightly acidic soil pH 5.5-6.5.",
    img: "",
  },
  {
    slug: "rice",
    name: "Rice",
    sci: "Oryza sativa",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_rice_diseases",
    care: "Full sun, flooded field conditions, warm temps 70-95°F.",
    img: "",
  },
  {
    slug: "rose",
    name: "Rose",
    sci: "Rosa spp.",
    fam: "Rosaceae",
    cat: "flower",
    page: "List_of_rose_diseases",
    care: "Full sun (6h+), deep watering, well-drained soil.",
    img: "",
  },
  {
    slug: "sorghum",
    name: "Sorghum",
    sci: "Sorghum bicolor",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_sorghum_diseases",
    care: "Full sun, moderate watering, warm temps 75-95°F.",
    img: "",
  },
  {
    slug: "soybean",
    name: "Soybean",
    sci: "Glycine max",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_soybean_diseases",
    care: "Full sun, moderate watering, warm temps 60-85°F.",
    img: "",
  },
  {
    slug: "spinach",
    name: "Spinach",
    sci: "Spinacia oleracea",
    fam: "Amaranthaceae",
    cat: "vegetable",
    page: "List_of_spinach_diseases",
    care: "Partial shade to full sun, consistent moisture, cool temps 50-70°F.",
    img: "",
  },
  {
    slug: "strawberry",
    name: "Strawberry",
    sci: "Fragaria × ananassa",
    fam: "Rosaceae",
    cat: "fruit",
    page: "List_of_strawberry_diseases",
    care: "Full sun, consistent watering, acidic soil pH 5.5-6.5.",
    img: "",
  },
  {
    slug: "sugarcane",
    name: "Sugarcane",
    sci: "Saccharum officinarum",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_sugarcane_diseases",
    care: "Full sun, heavy watering, warm temps 75-95°F.",
    img: "",
  },
  {
    slug: "sunflower",
    name: "Sunflower",
    sci: "Helianthus annuus",
    fam: "Asteraceae",
    cat: "flower",
    page: "List_of_sunflower_diseases",
    care: "Full sun (6-8h+), moderate watering, warm temps 70-78°F.",
    img: "",
  },
  {
    slug: "sweet-potato",
    name: "Sweet Potato",
    sci: "Ipomoea batatas",
    fam: "Convolvulaceae",
    cat: "vegetable",
    page: "List_of_sweet_potato_diseases",
    care: "Full sun, moderate watering, warm temps 65-95°F.",
    img: "",
  },
  {
    slug: "tobacco",
    name: "Tobacco",
    sci: "Nicotiana tabacum",
    fam: "Solanaceae",
    cat: "vegetable",
    page: "List_of_tobacco_diseases",
    care: "Full sun, moderate watering, warm temps 65-85°F.",
    img: "",
  },
  {
    slug: "watermelon",
    name: "Watermelon",
    sci: "Citrullus lanatus",
    fam: "Cucurbitaceae",
    cat: "vegetable",
    page: "List_of_cucurbit_diseases",
    care: "Full sun, consistent watering, warm temps 75-85°F.",
    img: "",
  },
  {
    slug: "wheat",
    name: "Wheat",
    sci: "Triticum aestivum",
    fam: "Poaceae",
    cat: "vegetable",
    page: "List_of_wheat_diseases",
    care: "Full sun, moderate watering, cool to warm temps 55-75°F.",
    img: "",
  },
  {
    slug: "alfalfa",
    name: "Alfalfa",
    sci: "Medicago sativa",
    fam: "Fabaceae",
    cat: "herb",
    page: "List_of_alfalfa_diseases",
    care: "Full sun, drought tolerant, deep well-drained soil pH 6.5-7.5.",
    img: "",
  },
  {
    slug: "asparagus",
    name: "Asparagus",
    sci: "Asparagus officinalis",
    fam: "Asparagaceae",
    cat: "vegetable",
    page: "List_of_asparagus_diseases",
    care: "Full sun, consistent watering, well-drained sandy soil pH 6.5-7.5.",
    img: "",
  },
  {
    slug: "celery",
    name: "Celery",
    sci: "Apium graveolens",
    fam: "Apiaceae",
    cat: "vegetable",
    page: "List_of_celery_diseases",
    care: "Full sun, consistent moisture, cool temps 55-70°F.",
    img: "",
  },
  {
    slug: "chickpea",
    name: "Chickpea",
    sci: "Cicer arietinum",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_legume_diseases",
    care: "Full sun, drought tolerant, warm temps 65-85°F.",
    img: "",
  },
  {
    slug: "clover",
    name: "Clover",
    sci: "Trifolium repens",
    fam: "Fabaceae",
    cat: "herb",
    page: "List_of_clover_diseases",
    care: "Full sun to partial shade, moderate watering, cool temps.",
    img: "",
  },
  {
    slug: "cowpea",
    name: "Cowpea",
    sci: "Vigna unguiculata",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_legume_diseases",
    care: "Full sun, drought tolerant, warm temps 65-95°F.",
    img: "",
  },
  {
    slug: "faba-bean",
    name: "Faba Bean",
    sci: "Vicia faba",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_legume_diseases",
    care: "Full sun, consistent watering, cool temps 55-70°F.",
    img: "",
  },
  {
    slug: "lentil",
    name: "Lentil",
    sci: "Lens culinaris",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_legume_diseases",
    care: "Full sun, drought tolerant, cool temps 50-80°F.",
    img: "",
  },
  {
    slug: "pigeon-pea",
    name: "Pigeon Pea",
    sci: "Cajanus cajan",
    fam: "Fabaceae",
    cat: "vegetable",
    page: "List_of_legume_diseases",
    care: "Full sun, drought tolerant, warm tropical temps.",
    img: "",
  },
  {
    slug: "tea",
    name: "Tea (Camellia sinensis)",
    sci: "Camellia sinensis",
    fam: "Theaceae",
    cat: "tree",
    page: "List_of_tea_diseases",
    care: "Partial shade, consistent moisture, acidic soil pH 4.5-6.0.",
    img: "",
  },
  {
    slug: "turfgrass",
    name: "Turfgrass (Lawn)",
    sci: "Multiple Poaceae spp.",
    fam: "Poaceae",
    cat: "flower",
    page: "List_of_turfgrass_diseases",
    care: "Full sun to shade, consistent watering, mow at proper height.",
    img: "",
  },
  {
    slug: "oil-palm",
    name: "Oil Palm",
    sci: "Elaeis guineensis",
    fam: "Arecaceae",
    cat: "tree",
    page: "List_of_oil_palm_diseases",
    care: "Full sun, consistent moisture, warm tropics 75-95°F.",
    img: "",
  },
];
|
||
|
||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Scraper entry point.
 *
 * 1. Scrapes every unique Wikipedia page referenced by `SOURCES` exactly once.
 * 2. Inserts each plant (first occurrence of a slug wins) and builds one
 *    disease entry per scraped table row, filled from the per-type template.
 * 3. Links up to three "lookalike" diseases (same plant, same causal agent
 *    type) for every disease whose severity is not "low".
 * 4. Bulk-inserts the diseases through the raw libsql client in batches.
 * 5. Upserts the scrape log row and prints final table counts.
 *
 * A page that fails to scrape is logged and skipped; its plants are still
 * inserted but receive no diseases. Database handles are released even when
 * a later step throws.
 *
 * @throws Error when `DATABASE_URL` is not set, or when any database
 *         operation fails (the error propagates to the caller).
 */
async function main() {
  console.log("🌿 Wikipedia Plant Disease Scraper\n");

  // Fail fast: the batch client below needs this, so check before spending
  // time on network scraping and partial inserts.
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set; cannot open the libsql batch client");
  }

  const db = getDb();

  try {
    let totalPlants = 0;
    const pageCache = new Map<string, TableData[]>(); // page → tables
    const failedPages: string[] = [];

    // Collect unique pages with their sources
    const pageToSources = new Map<string, Src[]>();
    for (const src of SOURCES) {
      const list = pageToSources.get(src.page) ?? [];
      list.push(src);
      pageToSources.set(src.page, list);
    }

    console.log(`🌱 ${SOURCES.length} plant entries, ${pageToSources.size} unique Wikipedia pages\n`);

    // Step 1: Scrape each unique page once
    for (const [page, srcList] of pageToSources) {
      const plantsForPage = srcList.map((s) => s.name).join(", ");
      console.log(`📋 ${page} → ${plantsForPage}`);

      try {
        const tables = await scrapePage(page);
        pageCache.set(page, tables);
        const totalRows = tables.reduce((s, t) => s + t.rows.length, 0);
        console.log(`   → ${tables.length} disease categories, ${totalRows} entries`);

        for (const t of tables) {
          console.log(`     ${t.type}: ${t.rows.length} diseases`);
        }
      } catch (err) {
        // Best-effort: one bad page must not abort the whole run, but it is
        // remembered so the final summary can report it.
        failedPages.push(page);
        console.error(`   ❌ ${err instanceof Error ? err.message : err}`);
      }
    }

    // Step 2: Build all disease entries per plant
    interface DiseaseEntry {
      id: string;
      plantId: string;
      name: string;
      scientificName: string;
      causalAgentType: CausalAgentType;
      description: string;
      symptoms: string[];
      causes: string[];
      treatment: string[];
      prevention: string[];
      lookalikeIds: string[];
      severity: Severity;
      sourceUrl: string;
    }

    const allDiseases: DiseaseEntry[] = [];
    const insertedPlants = new Set<string>();

    for (const src of SOURCES) {
      // Insert plant if not already
      if (!insertedPlants.has(src.slug)) {
        insertedPlants.add(src.slug);
        totalPlants++;
        await db
          .insert(plants)
          .values({
            id: src.slug,
            commonName: src.name,
            scientificName: src.sci,
            family: src.fam,
            category: src.cat,
            careSummary: src.care,
            imageUrl: src.img,
          })
          .onConflictDoNothing();
      }

      // Get cached tables for this page (absent when the scrape failed)
      const tables = pageCache.get(src.page);
      if (!tables) continue;

      for (const table of tables) {
        const template = TEMPLATES[table.type];
        for (const row of table.rows) {
          const diseaseId = `${src.slug}-${slugify(row.name)}`;

          allDiseases.push({
            id: diseaseId,
            plantId: src.slug,
            name: row.name,
            scientificName: row.sci,
            causalAgentType: table.type,
            description: makeDesc(row.name, row.sci, src.name, table.type),
            symptoms: template.symptoms,
            causes: template.causes,
            treatment: template.treatment,
            prevention: template.prevention,
            lookalikeIds: [],
            severity: template.severity,
            sourceUrl: `https://en.wikipedia.org/wiki/${src.page}`,
          });
        }
      }
    }

    // Step 3: Link lookalikes (same plant, same type)
    const byPlant = new Map<string, DiseaseEntry[]>();
    for (const d of allDiseases) {
      const list = byPlant.get(d.plantId) ?? [];
      list.push(d);
      byPlant.set(d.plantId, list);
    }
    for (const [, di] of byPlant) {
      for (const d of di) {
        if (d.severity === "low") continue;
        const sameType = di.filter((o) => o.causalAgentType === d.causalAgentType && o.id !== d.id);
        d.lookalikeIds = sameType.slice(0, 3).map((o) => o.id);
      }
    }

    console.log(
      `\n📊 Total: ${allDiseases.length} disease entries across ${totalPlants} plants`,
    );
    if (failedPages.length > 0) {
      console.warn(
        `⚠️  ${failedPages.length} page(s) failed to scrape and contributed no diseases: ${failedPages.join(", ")}`,
      );
    }

    // Step 4: Bulk insert into Turso using raw SQL batches
    console.log("\n💾 Inserting into Turso via batch...");
    const BATCH_SIZE = 100;
    let inserted = 0;

    // Use the raw libsql client for batch operations
    const { createClient } = await import("@libsql/client");
    const rawClient = createClient({
      url: databaseUrl,
      authToken: process.env.DATABASE_TOKEN,
    });

    try {
      for (let i = 0; i < allDiseases.length; i += BATCH_SIZE) {
        const chunk = allDiseases.slice(i, i + BATCH_SIZE);
        const stmts = chunk.map((d) => ({
          sql: `INSERT OR IGNORE INTO diseases (id, plant_id, name, scientific_name, causal_agent_type, description, symptoms, causes, treatment, prevention, lookalike_ids, severity, source_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
          args: [
            d.id,
            d.plantId,
            d.name,
            d.scientificName,
            d.causalAgentType,
            d.description,
            // List-valued columns are stored as JSON text.
            JSON.stringify(d.symptoms),
            JSON.stringify(d.causes),
            JSON.stringify(d.treatment),
            JSON.stringify(d.prevention),
            JSON.stringify(d.lookalikeIds),
            d.severity,
            d.sourceUrl,
          ],
        }));

        await rawClient.batch(stmts, "write");
        inserted += chunk.length;
        process.stdout.write(`   ${inserted}/${allDiseases.length}\n`);
      }
    } finally {
      // Release the batch connection even if a batch was rejected.
      rawClient.close();
    }

    // Log scrape
    // NOTE(review): status is always recorded as "success", even when some
    // pages failed above; the other allowed status values are not visible
    // from here — confirm against the scrapeSources schema before changing.
    const scrapedAt = new Date().toISOString(); // one timestamp for both insert and update paths
    await db
      .insert(scrapeSources)
      .values({
        id: "wikipedia-scrape",
        sourceType: "wikipedia",
        sourceUrl: "https://en.wikipedia.org/wiki/Category:Plant_pathogens_and_diseases",
        entriesCount: allDiseases.length,
        status: "success",
        lastScrapedAt: scrapedAt,
      })
      .onConflictDoUpdate({
        target: scrapeSources.id,
        set: {
          entriesCount: allDiseases.length,
          status: "success" as const,
          lastScrapedAt: scrapedAt,
        },
      });

    // Stats
    const [pc] = await db.select({ c: sql<number>`COUNT(*)` }).from(plants);
    const [dc] = await db.select({ c: sql<number>`COUNT(*)` }).from(diseases);
    console.log(`\n✅ Done! Database: ${pc.c} plants, ${dc.c} diseases`);
  } finally {
    // Always release the shared Drizzle connection, on success or failure.
    closeDb();
  }
}
|
||
|
||
/**
 * Last-resort handler for the script: prints whatever `main` rejected with
 * and terminates the process with exit status 1.
 */
function exitWithFailure(failure: unknown): never {
  console.error("❌", failure);
  process.exit(1);
}

// Kick off the scraper; any rejection ends the process with a failure code.
main().catch(exitWithFailure);
|