re-init
This commit is contained in:
53
scripts/apply-flag-migration.ts
Normal file
53
scripts/apply-flag-migration.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
/**
|
||||
* apply-flag-migration.ts
|
||||
*
|
||||
* Applies the flagged_content table migration to Turso.
|
||||
* Run with: npx tsx scripts/apply-flag-migration.ts
|
||||
*/
|
||||
|
||||
import dotenv from "dotenv";
|
||||
import path from "node:path";
|
||||
|
||||
// Choose the env file that matches NODE_ENV and load it, so that
// process.env.DATABASE_URL / DATABASE_TOKEN are populated before main() runs.
const isProduction = process.env.NODE_ENV === "production";
const envFile = isProduction ? "../.env.production" : "../.env.development";
dotenv.config({ path: path.resolve(__dirname, envFile) });
|
||||
|
||||
import { createClient } from "@libsql/client";
|
||||
|
||||
/**
 * Creates the `flagged_content` table and its two lookup indexes.
 *
 * Every statement uses `IF NOT EXISTS`, so running the script again against a
 * database that already has these objects changes nothing.
 *
 * @throws Error when DATABASE_URL is not set; otherwise rethrows whatever the
 *   database client raises for a failed statement.
 */
async function main() {
  const url = process.env.DATABASE_URL;
  if (!url) {
    throw new Error("DATABASE_URL is not set; cannot apply migration.");
  }

  const db = createClient({
    url,
    authToken: process.env.DATABASE_TOKEN,
  });

  // Close the client on both success and failure.
  try {
    console.log("Applying migration: create flagged_content table...");

    await db.execute(`
      CREATE TABLE IF NOT EXISTS flagged_content (
        id text PRIMARY KEY NOT NULL,
        content_type text NOT NULL,
        content_id text NOT NULL,
        field_name text NOT NULL,
        notes text DEFAULT '',
        flag_count integer DEFAULT 1 NOT NULL,
        created_at text DEFAULT (datetime('now')) NOT NULL,
        updated_at text DEFAULT (datetime('now')) NOT NULL
      )
    `);

    await db.execute(`
      CREATE INDEX IF NOT EXISTS idx_flagged_content_type ON flagged_content (content_type)
    `);

    await db.execute(`
      CREATE INDEX IF NOT EXISTS idx_flagged_content_id ON flagged_content (content_id)
    `);

    console.log("Migration applied successfully.");
  } finally {
    db.close();
  }
}
|
||||
|
||||
// Entry point: report any failure and exit non-zero.
main().catch((error) => {
  console.error("Migration failed:", error);
  process.exit(1);
});
|
||||
23
scripts/apply-migration.ts
Normal file
23
scripts/apply-migration.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import "dotenv/config";
|
||||
import { createClient } from "@libsql/client";
|
||||
|
||||
/**
 * Adds the `image_url` column to `diseases`, backfills NULLs with '' and
 * records the migration in `__drizzle_migrations`.
 *
 * NOTE: the ALTER TABLE statement is not guarded, so running this twice fails
 * on the first statement once the column exists.
 *
 * @throws Error when DATABASE_URL is not set; otherwise rethrows whatever the
 *   database client raises for a failed statement.
 */
async function main() {
  const url = process.env.DATABASE_URL;
  if (!url) {
    throw new Error("DATABASE_URL is not set; cannot apply migration.");
  }

  const db = createClient({
    url,
    authToken: process.env.DATABASE_TOKEN,
  });

  // Close the client on both success and failure.
  try {
    console.log("Applying migration: add image_url to diseases...");
    await db.execute("ALTER TABLE diseases ADD COLUMN image_url TEXT DEFAULT ''");
    await db.execute("UPDATE diseases SET image_url = '' WHERE image_url IS NULL");

    // Mark migration as applied.
    // NOTE(review): presumably drizzle's own migrator writes a content hash and
    // a numeric timestamp into this table — confirm this hand-written row is
    // what downstream tooling expects.
    await db.execute(
      "INSERT INTO __drizzle_migrations (hash, created_at) VALUES ('0001_add-disease-images', datetime('now'))",
    );

    console.log("Migration applied successfully.");
  } finally {
    db.close();
  }
}
|
||||
|
||||
// Entry point: log the failure and exit non-zero so shells and CI do not treat
// a failed migration as a success.
main().catch((err) => {
  console.error("Migration failed:", err);
  process.exit(1);
});
|
||||
19
scripts/check-progress.mjs
Normal file
19
scripts/check-progress.mjs
Normal file
@@ -0,0 +1,19 @@
|
||||
import { createClient } from "@libsql/client";
|
||||
// Prints how many rows in `diseases` have an image_url, overall and per severity.
const c = createClient({
  url: process.env.DATABASE_URL,
  authToken: process.env.DATABASE_TOKEN,
});

// Close the client even when one of the queries throws.
try {
  const r = await c.execute("SELECT COUNT(*) as cnt FROM diseases");
  const r2 = await c.execute(
    `SELECT SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has, SUM(CASE WHEN image_url IS NULL OR image_url = '' THEN 1 ELSE 0 END) as miss FROM diseases`,
  );
  const r3 = await c.execute(
    `SELECT severity, COUNT(*) as total, SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has FROM diseases GROUP BY severity ORDER BY severity`,
  );

  // SUM() over zero rows is NULL, so show 0 instead of "null" for an empty table.
  console.log(
    `Total: ${r.rows[0].cnt} | With images: ${r2.rows[0].has ?? 0} | Missing: ${r2.rows[0].miss ?? 0}`,
  );

  for (const row of r3.rows) {
    // A NULL severity would otherwise print as "undefined" and break the alignment.
    const label = String(row.severity ?? "(none)").padEnd(10);
    console.log(` ${label}: ${row.has}/${row.total}`);
  }
} finally {
  c.close();
}
|
||||
296
scripts/convert-keras-to-tfjs.py
Normal file
296
scripts/convert-keras-to-tfjs.py
Normal file
@@ -0,0 +1,296 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Inspect and convert a .keras plant disease model to TF.js GraphModel format.
|
||||
|
||||
Uses tensorflowjs_converter CLI to avoid Keras version deserialization issues.
|
||||
|
||||
Usage:
|
||||
pip3 install tensorflowjs # also pulls tensorflow as dependency
|
||||
python3 scripts/convert-keras-to-tfjs.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# Project root: the parent of the scripts/ directory that holds this file.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Source model (.keras archive) to inspect and convert.
MODEL_PATH = os.path.join(
    _PROJECT_ROOT,
    "public",
    "models",
    "plant-disease-classifier",
    "best_mnv2_pv_original.keras",
)

# Converted TF.js files are written to a sibling "tfjs_model" directory.
OUTPUT_DIR = os.path.join(os.path.dirname(MODEL_PATH), "tfjs_model")
|
||||
|
||||
|
||||
def inspect_keras_metadata(model_path=None):
    """Print a summary of a .keras archive without deserializing the model.

    Lists the archive entries and, when an entry ending in ``config.json`` is
    present, prints the model class, the layer list, the last Dense layer, the
    compile settings and the build input shape found in it.

    Args:
        model_path: Path to the .keras file. Defaults to the module-level
            ``MODEL_PATH`` when omitted.

    Exits the process with status 1 if the file does not exist or is not a
    ZIP archive.
    """
    import zipfile  # stdlib; imported locally as in the original script

    if model_path is None:
        model_path = MODEL_PATH

    print("=" * 60)
    print("MODEL INSPECTION (metadata only)")
    print("=" * 60)

    if not os.path.exists(model_path):
        print(f"ERROR: Model not found at {model_path}")
        sys.exit(1)

    size_bytes = os.path.getsize(model_path)
    print(f"\nModel file: {model_path}")
    print(f"File size: {size_bytes:,} bytes ({size_bytes / 1024 / 1024:.1f} MB)")

    # .keras files are ZIP archives; report anything else in the same style as
    # the other errors instead of crashing with a traceback.
    try:
        archive = zipfile.ZipFile(model_path)
    except zipfile.BadZipFile:
        print(f"ERROR: {model_path} is not a valid ZIP archive")
        sys.exit(1)

    with archive as zf:
        names = zf.namelist()
        print(f"\nArchive contents ({len(names)} entries):")
        for name in names:
            info = zf.getinfo(name)
            print(f" {name:<40s} {info.file_size:>10,} bytes")

        # Read config.json for model architecture info (first matching entry).
        config_path = next((n for n in names if n.endswith("config.json")), None)
        if not config_path:
            return

        print(f"\nReading {config_path}...")
        with zf.open(config_path) as f:
            config = json.load(f)

    # Extract key info
    model_type = config.get("class_name", "unknown")
    print(f"Model class: {model_type}")

    # Try to find output layer info
    if "config" in config:
        inner_config = config["config"]

        # Look for output shape in config
        if "output_shape" in inner_config:
            print(f"Output shape: {inner_config['output_shape']}")

        # Look through layers for the final dense layer
        if "layers" in inner_config:
            layers = inner_config["layers"]
            print(f"\nLayers ({len(layers)} total):")
            for layer in layers:
                layer_config = layer.get("config", {})
                layer_name = layer_config.get("name", "?")
                layer_class = layer.get("class_name", "?")

                # Show units/activation only for layers that define them.
                units = layer_config.get("units")
                activation = layer_config.get("activation")
                detail = ""
                if units:
                    detail = f" units={units}"
                if activation:
                    detail += f" activation={activation}"

                print(f" {layer_name:<30s} {layer_class:<20s}{detail}")

            # Find last dense layer for class count
            for layer in reversed(layers):
                if layer.get("class_name") == "Dense":
                    head_config = layer.get("config", {})
                    print("\nClassification head:")
                    print(f" Units (classes): {head_config.get('units')}")
                    print(f" Activation: {head_config.get('activation')}")
                    print(f" Layer name: {head_config.get('name', '?')}")
                    break

    # Check compile config
    if "compile_config" in config:
        compile_cfg = config["compile_config"]
        optimizer = compile_cfg.get("optimizer", {})
        if isinstance(optimizer, dict):
            opt_name = optimizer.get("class_name", "?")
            lr = optimizer.get("config", {}).get("learning_rate")
            print("\nTraining config:")
            print(f" Optimizer: {opt_name}")
            if lr:
                print(f" Learning rate: {lr}")
        loss = compile_cfg.get("loss", "?")
        metrics = compile_cfg.get("metrics", [])
        print(f" Loss: {loss}")
        print(f" Metrics: {metrics}")

    # Check input shape
    if "build_config" in config:
        build_cfg = config["build_config"]
        if "input_shape" in build_cfg:
            print(f"\nInput shape: {build_cfg['input_shape']}")
|
||||
|
||||
|
||||
def convert_to_tfjs():
    """Convert MODEL_PATH to a TF.js graph model in OUTPUT_DIR and summarize it.

    Runs ``python -m tensorflowjs.converters.converter`` with the interpreter
    that is executing this script, then prints the files produced and selected
    fields of the generated ``model.json``. Any existing OUTPUT_DIR is deleted
    before the conversion starts.

    Returns:
        OUTPUT_DIR, the directory holding the converted model.

    Exits the process with status 1 if tensorflowjs is not importable, the
    converter fails or times out, or no ``model.json`` is produced.
    """
    import importlib.util  # local import: only this function needs it

    timeout_seconds = 300

    print("\n" + "=" * 60)
    print("CONVERTING TO TF.JS GRAPH MODEL")
    print("=" * 60)

    # The converter is launched below as a module under sys.executable, so the
    # pre-flight check must test importability in this interpreter; a
    # `tensorflowjs_converter` script on PATH may belong to another environment.
    if importlib.util.find_spec("tensorflowjs") is None:
        print(f"ERROR: tensorflowjs is not installed for {sys.executable}.")
        print(" pip3 install tensorflowjs")
        sys.exit(1)

    # Clean output dir
    if os.path.exists(OUTPUT_DIR):
        print(f"Removing existing output dir: {OUTPUT_DIR}")
        shutil.rmtree(OUTPUT_DIR)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print(f"\nConverting {MODEL_PATH} -> {OUTPUT_DIR}/")
    print("(this may take a minute...)")

    # Use the python running this script to run the converter (avoids import issues)
    command = [
        sys.executable,
        "-m",
        "tensorflowjs.converters.converter",
        "--input_format=keras",
        "--output_format=tfjs_graph_model",
        MODEL_PATH,
        OUTPUT_DIR,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired:
        print(f"\nERROR: Conversion timed out after {timeout_seconds} seconds.")
        sys.exit(1)

    if result.returncode != 0:
        print("\nERROR: Conversion failed!")
        print(f"stdout: {result.stdout}")
        print(f"stderr: {result.stderr}")
        sys.exit(1)

    if result.stdout:
        print(result.stdout)
    if result.stderr:
        # Some warnings are normal
        print(f"Converter output: {result.stderr}")

    # Verify output
    model_json_path = os.path.join(OUTPUT_DIR, "model.json")
    if not os.path.exists(model_json_path):
        print("ERROR: Conversion did not produce model.json")
        sys.exit(1)

    # List output files
    files = os.listdir(OUTPUT_DIR)
    total_size = sum(
        os.path.getsize(os.path.join(OUTPUT_DIR, f))
        for f in files
        if os.path.isfile(os.path.join(OUTPUT_DIR, f))
    )

    print("\nConversion complete!")
    print(f"Output directory: {OUTPUT_DIR}/")
    print(f"Files: {len(files)}")
    for f in sorted(files):
        fpath = os.path.join(OUTPUT_DIR, f)
        if os.path.isfile(fpath):
            size = os.path.getsize(fpath)
            print(f" {f:<30s} {size:>10,} bytes")
    print(f"Total size: {total_size:,} bytes ({total_size / 1024 / 1024:.1f} MB)")

    # Read model.json to check config
    with open(model_json_path) as f:
        model_json = json.load(f)

    print(f"\nTF.js model format: {model_json.get('format', 'unknown')}")
    print(f"Generated by: {model_json.get('generatedBy', 'unknown')}")

    # Inspect model topology
    if "modelTopology" in model_json:
        topology = model_json["modelTopology"]
        print("\nModel topology:")
        print(f" Name: {topology.get('model_name', 'unnamed')}")
        print(f" Ops: {len(topology.get('node', []))} nodes")

        # Input/output nodes.
        # NOTE(review): graph-model files appear to keep inputs/outputs under a
        # top-level "signature" key rather than inside modelTopology — confirm
        # against real converter output. Falls back to the original lookup.
        signature = model_json.get("signature") or topology
        inputs = signature.get("inputs", {})
        outputs = signature.get("outputs", {})
        print(f" Inputs: {list(inputs.keys())}")
        for name, info in inputs.items():
            shape = info.get("tensorShape", {})
            print(f" {name}: shape={shape.get('dim', 'unknown')}")
        print(f" Outputs: {list(outputs.keys())}")
        for name, info in outputs.items():
            shape = info.get("tensorShape", {})
            print(f" {name}: shape={shape.get('dim', 'unknown')}")

    # Check weights specification
    if "weightsManifest" in model_json:
        manifest = model_json["weightsManifest"]
        print(f"\nWeight manifests: {len(manifest)}")
        for i, m in enumerate(manifest):
            shards = m.get("shards", [])
            print(f" Manifest {i}: {len(shards)} shard(s)")

    return OUTPUT_DIR
|
||||
|
||||
|
||||
def main():
    """Inspect the Keras model, convert it to TF.js, then print follow-up steps.

    Exits with status 1 when the model file at MODEL_PATH does not exist.
    """
    model_present = os.path.exists(MODEL_PATH)
    if not model_present:
        print(f"ERROR: Model not found at {MODEL_PATH}")
        sys.exit(1)

    # Step 1: metadata report (does not load the model)
    inspect_keras_metadata()

    # Step 2: conversion; returns the directory holding the TF.js files
    tfjs_dir = convert_to_tfjs()

    # Step 3: tell the operator what to do with the result
    rule = "=" * 60
    print("\n" + rule)
    print("NEXT STEPS")
    print(rule)
    print(f"""
1. Move the TF.js model to the expected location:
The model-loader expects model.json at:
public/models/plant-disease-classifier/model.json

Move files:
mv {tfjs_dir}/model.json public/models/plant-disease-classifier/
mv {tfjs_dir}/group1-shard* public/models/plant-disease-classifier/

2. IMPORTANT: This model has 38 output classes (original PlantVillage).
Your labels.ts expects 95 classes (93 diseases + healthy + unknown).
You'll need to either:
a) Fine-tune the model with your 95-class dataset, OR
b) Map the 38 PlantVillage classes to your disease IDs

3. Install @tensorflow/tfjs in your project:
npm install @tensorflow/tfjs

4. Test with your API:
npm run dev
POST /api/identify with an uploaded image
""")
|
||||
|
||||
|
||||
# Run the inspect-and-convert workflow only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
2337
scripts/disease-templates.ts
Normal file
2337
scripts/disease-templates.ts
Normal file
File diff suppressed because it is too large
Load Diff
691
scripts/expand-diseases.ts
Normal file
691
scripts/expand-diseases.ts
Normal file
@@ -0,0 +1,691 @@
|
||||
/**
|
||||
* Expand DB with comprehensive plant disease list from Wikipedia.
|
||||
*
|
||||
* Reads /tmp/plant_diseases/plant_diseases_comprehensive.txt,
|
||||
* compares against existing DB entries (by name, case-insensitive),
|
||||
* and inserts new entries with reasonable defaults.
|
||||
*
|
||||
* Usage:
|
||||
* cd apps/web && export $(grep -v '^#' .env.development | xargs) && npx tsx scripts/expand-diseases.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync } from "fs";
|
||||
import { eq, sql } from "drizzle-orm";
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { plants, diseases } from "../src/lib/db/schema";
|
||||
import type { CausalAgentType, Severity } from "../src/lib/types";
|
||||
|
||||
// ─── Parse the comprehensive list ─────────────────────────────────────────────
|
||||
|
||||
/** One disease parsed from the comprehensive list file. */
interface DiseaseEntry {
  /** Disease name: the text after the "N." numbering, trimmed. */
  name: string;
  /** URL taken from the line after the name, or "" when that line is not a URL. */
  sourceUrl: string;
}
|
||||
|
||||
/**
 * Parses the numbered disease list into name/URL pairs.
 *
 * A line of the form `N. <name>` starts an entry. If the following line
 * (trimmed) starts with "http" it is taken as the entry's source URL and
 * consumed; otherwise the entry gets an empty `sourceUrl`. All other lines
 * are ignored.
 *
 * @param filePath - Path of the UTF-8 text file to read.
 * @returns Entries in file order (duplicates are not removed here).
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function parseComprehensiveList(filePath: string): DiseaseEntry[] {
  // Drop a leading BOM so the first line can still match `^\d+`.
  const content = readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");

  // Accept both LF and CRLF. With a plain "\n" split, the trailing "\r" of a
  // CRLF file stops `(.+)$` from matching and every entry is silently lost.
  const lines = content.split(/\r?\n/);
  const nameRe = /^\d+\.\s+(.+)$/;
  const entries: DiseaseEntry[] = [];

  for (let i = 0; i < lines.length; i++) {
    const nameMatch = nameRe.exec(lines[i]);
    if (!nameMatch) continue;

    const name = nameMatch[1].trim();
    const nextLine = lines[i + 1]?.trim() ?? "";

    // Only treat the next line as the URL if it looks like one.
    if (nextLine.startsWith("http")) {
      entries.push({ name, sourceUrl: nextLine });
      i++; // skip the URL line
    } else {
      entries.push({ name, sourceUrl: "" });
    }
  }

  return entries;
}
|
||||
|
||||
// ─── Infer causal agent type from disease name ────────────────────────────────
|
||||
|
||||
// Keyword tables for inferCausalAgent. Every keyword starts with a space so it
// only matches at the start of a word ("carrot" must not hit " rot"). Keywords
// already implied by a shorter entry in the same table (e.g. " root rot" by
// " rot") are omitted because they can never change the outcome.

/** Bacterial genus names. */
const BACTERIAL_KEYWORDS: readonly string[] = [
  " xanthomonas",
  " pseudomonas",
  " erwinia",
  " ralstonia",
  " clavibacter",
  " streptomyces",
  " agrobacterium",
  " corynebacterium",
  " pectobacterium",
  " dickeya",
];

/** Symptom words and terms treated as signs of a viral disease. */
const VIRAL_KEYWORDS: readonly string[] = [
  " mosaic",
  " yellows",
  " leaf roll",
  " leafroll",
  " ringspot",
  " ring spot",
  " enation",
  " phyllody",
  " witches",
  " crinkle",
  " rosette",
  " shoestring",
  " tristeza",
  " psorosis",
  " stubborn",
  " greening",
  " vein banding",
  " vein mottle",
  " vein clearing",
  " leaf pucker",
  " pucker leaf",
  " latent",
  " motley",
  " rugose",
  " virus",
  " viroid",
  " virosis",
];

/** Words that hint at a nematode; only honoured when "nematode" is also present. */
const NEMATODE_HINTS: readonly string[] = [
  " nematode",
  " eelworm",
  " root knot",
  " root-knot",
  " cyst ",
  " dagger ",
  " lance ",
  " lesion ",
  " ring ",
  " spiral ",
  " sting ",
  " stubby ",
  " needle ",
  " foliar ",
  " bulb ",
  " reniform ",
  " burrowing ",
];

/** Environmental phrases containing " rot"; must be tested before the fungal table. */
const ABIOTIC_ROT_KEYWORDS: readonly string[] = [" blossom end rot", " transit rot"];

/** Symptom words, genus names and phrases treated as signs of a fungal disease. */
const FUNGAL_KEYWORDS: readonly string[] = [
  // generic symptom words
  " mildew",
  " rust",
  " smut",
  " blight",
  " canker",
  " rot",
  " scab",
  " mold",
  " anthracnose",
  " bunt",
  " ergot",
  " dieback",
  " scald",
  " blotch",
  " speckle",
  " sooty",
  " flyspeck",
  // genus names
  " fusarium",
  " alternaria",
  " botrytis",
  " rhizoctonia",
  " pythium",
  " phytophthora",
  " sclerotinia",
  " verticillium",
  " ascochyta",
  " cercospora",
  " septoria",
  " colletotrichum",
  " phomopsis",
  " diaporthe",
  " diplodia",
  " macrophomina",
  " cylindrocladium",
  " mycosphaerella",
  " helminthosporium",
  " curvularia",
  " bipolaris",
  " exserohilum",
  " dothiorella",
  " fusicoccum",
  " pestalotia",
  " glomerella",
  " nectria",
  " eutypa",
  " armillaria",
  " ganoderma",
  " phoma",
  " cladosporium",
  " penicillium",
  " aspergillus",
  " rhizopus",
  " mucor",
  " exobasidium",
  // specific symptom phrases
  " pink root",
  " leaf spot",
  " brown spot",
  " black spot",
  " black leg",
  " blackleg",
  " black foot",
  " tar spot",
  " target spot",
  " dollar spot",
  " fairy ring",
  " pink disease",
  " sclerotial",
  " sore shin",
  " wart",
  " scurf",
  " shot hole",
  " seepage",
  " coral spot",
  " collar crack",
  " fasciation",
  " mycorrhiza",
  " lichen",
  " algal",
  " chlorosis",
  " leaf blister",
  " leaf curl",
];

/** Physiological / environmental disorder terms. */
const ENVIRONMENTAL_KEYWORDS: readonly string[] = [
  " sunscald",
  " sunburn",
  " chilling",
  " edema",
  " deficiency",
  " toxicity",
  " ozone",
  " drought",
  " frost",
  " herbicide",
  " pesticide",
  " phytotoxicity",
  " catface",
  " fruit cracking",
  " russeting",
  " growth crack",
  " mealiness",
  " wind scar",
  " hail",
  " salt ",
  " nutritional",
  " mineral",
  " overwatering",
  " under watering",
  " waterlogging",
  " chemical injury",
  " spray injury",
  " fertilizer burn",
  " lightning",
  " bruising",
  " pressure bruise",
  " impact damage",
];

/** Insect, mite and other pest terms; these are classified as "environmental". */
const PEST_KEYWORDS: readonly string[] = [
  " mite",
  " beetle",
  " weevil",
  " aphid",
  " bollworm",
  " leaf miner",
  " mealybug",
  " thrips",
  " whitefly",
  " caterpillar",
  " sawfly",
  " scale ",
  " leafhopper",
  " psylla",
  " slug",
  " snail",
  " borer",
  " maggot",
  " grub",
  " earwig",
  " grasshopper",
];

/**
 * Guesses a disease's causal-agent category from keywords in its name.
 *
 * Rules are applied in order and the first hit wins:
 *   1. name starts with "bacterial " or contains a bacterial genus → "bacterial"
 *   2. viral keyword → "viral"
 *   3. nematode hint together with the word "nematode" → "environmental"
 *   4. abiotic rot phrase (blossom end rot, transit rot) → "environmental"
 *   5. fungal keyword → "fungal"
 *   6. environmental or pest keyword → "environmental"
 *   7. anything else → "fungal"
 *
 * @param name - Disease name; matching is case-insensitive.
 * @returns The inferred category. This is a heuristic and can be wrong.
 */
function inferCausalAgent(name: string): CausalAgentType {
  const lower = name.toLowerCase();
  // Pad with a leading space so the word-start keywords also match the first
  // word of the name (e.g. "Xanthomonas leaf spot", "Mosaic").
  const padded = ` ${lower}`;
  const hasAny = (keywords: readonly string[]): boolean =>
    keywords.some((keyword) => padded.includes(keyword));

  if (lower.startsWith("bacterial ") || hasAny(BACTERIAL_KEYWORDS)) {
    return "bacterial";
  }

  if (hasAny(VIRAL_KEYWORDS)) {
    return "viral";
  }

  // Hints such as " ring " or " bulb " are ambiguous on their own, so they only
  // count when the name really mentions a nematode.
  if (hasAny(NEMATODE_HINTS) && lower.includes("nematode")) {
    return "environmental";
  }

  // Checked before the fungal table, whose generic " rot" would otherwise
  // claim these names first.
  if (hasAny(ABIOTIC_ROT_KEYWORDS)) {
    return "environmental";
  }

  if (hasAny(FUNGAL_KEYWORDS)) {
    return "fungal";
  }

  if (hasAny(ENVIRONMENTAL_KEYWORDS) || hasAny(PEST_KEYWORDS)) {
    return "environmental";
  }

  // Default to fungal (most plant diseases are fungal). A separate genus list
  // is unnecessary here because it could only return this same value.
  return "fungal";
}
|
||||
|
||||
// ─── Infer severity ───────────────────────────────────────────────────────────
|
||||
|
||||
// Keyword tables for inferSeverity. Each keyword starts with a space so it only
// matches at the start of a word.

/** Words that mark a disease as "high" severity. */
const HIGH_SEVERITY_KEYWORDS: readonly string[] = [
  " lethal",
  " devastating",
  " destructive",
  " fatal",
  " severe",
  " blight",
  " wilt",
  " canker",
  " dieback",
  " decline",
  " rot",
  " gall",
  " gummosis",
  " necrosis",
  " erwinia",
];

/** Words that mark a disease as "low" severity ("mild" is handled separately). */
const LOW_SEVERITY_KEYWORDS: readonly string[] = [
  " minor",
  " slight",
  " speckle",
  " fleck",
  " freckle",
  " chlorosis",
  " translucence",
  " superficial",
];

/** Matches "mild" as a whole word, so that "mildew" is not rated as mild. */
const MILD_WORD = / mild(?![a-z])/;

/**
 * Guesses a disease's severity from keywords in its name.
 *
 * High-severity keywords are checked first, so a name containing both kinds
 * (e.g. "Minor leaf blight") is rated "high". Names matching neither table
 * are rated "moderate".
 *
 * @param name - Disease name; matching is case-insensitive.
 * @returns "high", "low" or "moderate". This is a heuristic and can be wrong.
 */
function inferSeverity(name: string): Severity {
  // Pad with a leading space so keywords also match the first word of the
  // name (e.g. "Wilt", "Chlorosis").
  const padded = ` ${name.toLowerCase()}`;
  const hasAny = (keywords: readonly string[]): boolean =>
    keywords.some((keyword) => padded.includes(keyword));

  if (hasAny(HIGH_SEVERITY_KEYWORDS)) {
    return "high";
  }

  if (hasAny(LOW_SEVERITY_KEYWORDS) || MILD_WORD.test(padded)) {
    return "low";
  }

  return "moderate";
}
|
||||
|
||||
// ─── Generate a deterministic slug ────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds the deterministic id for a disease name.
 *
 * The name is lower-cased, cut into runs of `a-z0-9`, and the runs are joined
 * with single hyphens behind a "wiki-" prefix. Characters outside `a-z0-9`
 * only act as separators, so a name without any such characters yields
 * "wiki-".
 */
function toSlug(name: string): string {
  const words = name
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((word) => word.length > 0);
  return `wiki-${words.join("-")}`;
}
|
||||
|
||||
// ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Imports new diseases from the comprehensive list file into the database.
 *
 * Loads the names and ids already stored, parses the list, makes sure the
 * "general" / "unknown" catch-all plants exist, filters out entries that
 * already exist or whose generated id collides with another entry, inserts the
 * remainder in batches (retrying a failed batch row by row) and prints a
 * summary.
 *
 * The row id is the slug from `toSlug`, so two different names can map to the
 * same primary key. Those collisions are detected up front and reported;
 * otherwise `onConflictDoNothing` would drop them silently while they were
 * still counted as inserted.
 */
async function main() {
  const db = getDb();

  // 1. Get existing disease names and ids from DB
  const existingDiseases = await db
    .select({ id: diseases.id, name: diseases.name })
    .from(diseases);
  const existingNames = new Set(existingDiseases.map((d) => d.name.toLowerCase().trim()));
  const existingIds = new Set(existingDiseases.map((d) => d.id));

  console.log(`Existing diseases in DB: ${existingNames.size}`);

  // 2. Parse the comprehensive list
  const entries = parseComprehensiveList("/tmp/plant_diseases/plant_diseases_comprehensive.txt");
  console.log(`Total entries in comprehensive file: ${entries.length}`);

  // 3. Find or create catch-all plants
  for (const plantId of ["general", "unknown"]) {
    const existing = await db.select().from(plants).where(eq(plants.id, plantId)).get();

    if (!existing) {
      console.log(`Creating '${plantId}' plant for catch-all diseases...`);
      await db.insert(plants).values({
        id: plantId,
        commonName: plantId === "general" ? "General (Multiple Plants)" : "Unknown Plant",
        scientificName: "Various",
        family: "Various",
        category: "houseplant",
        careSummary:
          plantId === "general"
            ? "General plant diseases affecting multiple species."
            : "Plant disease with unknown host plant.",
        imageUrl: "",
      });
      console.log(`Created '${plantId}' plant.`);
    }
  }

  // 4. Filter new entries (deduplicate within file + against DB, by name AND by id)
  const newEntries: DiseaseEntry[] = [];
  const skipped: string[] = [];
  const slugCollisions: string[] = [];
  const seenNames = new Set<string>();
  const seenSlugs = new Set<string>();

  for (const entry of entries) {
    const key = entry.name.toLowerCase().trim();
    if (seenNames.has(key)) continue;
    seenNames.add(key);

    const slug = toSlug(entry.name);
    if (existingNames.has(key) || existingIds.has(slug)) {
      skipped.push(entry.name);
    } else if (seenSlugs.has(slug)) {
      // Different name, same primary key as an earlier entry in this file.
      slugCollisions.push(entry.name);
    } else {
      seenSlugs.add(slug);
      newEntries.push(entry);
    }
  }

  console.log(`\nNew entries to insert: ${newEntries.length}`);
  console.log(`Already existing (skipped): ${skipped.length}`);
  if (slugCollisions.length > 0) {
    console.log(`Skipped because the generated id collides: ${slugCollisions.length}`);
  }

  if (skipped.length > 0) {
    console.log(`\nFirst 10 skipped (of ${skipped.length}):`);
    skipped.slice(0, 10).forEach((s) => console.log(` - ${s}`));
  }

  // 5. Insert new entries in batches
  if (newEntries.length === 0) {
    console.log("\n✅ No new diseases to insert.");
    closeDb();
    return;
  }

  const BATCH_SIZE = 50;
  let inserted = 0;
  let conflicts = 0;
  let errors = 0;

  for (let i = 0; i < newEntries.length; i += BATCH_SIZE) {
    const batch = newEntries.slice(i, i + BATCH_SIZE);
    const values = batch.map((entry) => {
      const causalAgent = inferCausalAgent(entry.name);
      const severity = inferSeverity(entry.name);
      return {
        id: toSlug(entry.name),
        plantId: "general",
        name: entry.name,
        scientificName: "",
        causalAgentType: causalAgent,
        description: `A plant disease known as "${entry.name}". Source: Wikipedia.`,
        symptoms: [],
        causes: [],
        treatment: [],
        prevention: [],
        lookalikeIds: [],
        severity,
        sourceUrl: entry.sourceUrl,
        imageUrl: "",
      };
    });

    try {
      await db.insert(diseases).values(values).onConflictDoNothing();
      inserted += values.length;
    } catch {
      // Fall back to individual inserts for this batch if batch fails
      console.log(` Batch failed, trying individually...`);
      for (const val of values) {
        try {
          await db.insert(diseases).values(val).onConflictDoNothing();
          inserted++;
        } catch (e2) {
          if (String(e2).includes("UNIQUE") || String(e2).includes("duplicate")) {
            // The row already exists: nothing was written, so it is not an insert.
            conflicts++;
          } else {
            console.error(` Error inserting "${val.name}":`, e2);
            errors++;
          }
        }
      }
    }

    if ((i + BATCH_SIZE) % 200 === 0 || i + BATCH_SIZE >= newEntries.length) {
      console.log(
        ` Progress: ${Math.min(i + BATCH_SIZE, newEntries.length)}/${newEntries.length} (${inserted} inserted, ${errors} errors)`,
      );
    }
  }

  // 6. Summary
  const totalDiseases = await db
    .select({ count: sql<number>`COUNT(*)` })
    .from(diseases)
    .get();
  const totalPlants = await db
    .select({ count: sql<number>`COUNT(*)` })
    .from(plants)
    .get();

  console.log(`\n📊 Results:`);
  console.log(` Inserted: ${inserted}`);
  console.log(` Errors: ${errors}`);
  console.log(` Skipped (already existed): ${skipped.length}`);
  console.log(` Skipped (id collision / duplicate key): ${slugCollisions.length + conflicts}`);
  console.log(`\n📊 Database now has:`);
  console.log(` ${totalPlants?.count ?? 0} plants`);
  console.log(` ${totalDiseases?.count ?? 0} diseases`);

  closeDb();
}
|
||||
|
||||
// Entry point: run the import; on any unhandled rejection, log it and exit non-zero.
main().catch((failure) => {
  console.error("❌ Failed:", failure);
  process.exit(1);
});
|
||||
414
scripts/fill-brave-images-v2.ts
Normal file
414
scripts/fill-brave-images-v2.ts
Normal file
@@ -0,0 +1,414 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-brave-images-v2.ts — Brave Image Search for remaining disease images.
|
||||
*
|
||||
* Prioritizes by severity (critical → high → moderate → low).
|
||||
* Runs at 1 request/sec (Brave free tier rate limit).
|
||||
* Updates Turso DB directly with found images.
|
||||
* When the current key is exhausted, rotates to the next key supplied via BRAVE_API_KEY / CLI arguments.
|
||||
* Falls back to duckduckgo-images-api when all keys are spent.
|
||||
*
|
||||
* Usage:
|
||||
* cd apps/web && npx tsx scripts/fill-brave-images-v2.ts
|
||||
*
|
||||
* Pass additional API keys as args:
|
||||
* npx tsx scripts/fill-brave-images-v2.ts KEY2 KEY3
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Best-effort env bootstrap. KEY=VALUE lines from .env.development are copied
// into process.env unless the variable already holds a non-empty value.
const envPath = resolve(__dirname, "../.env.development");
try {
  const assignments = readFileSync(envPath, "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry !== "" && !entry.startsWith("#"));

  for (const entry of assignments) {
    const separator = entry.indexOf("=");
    if (separator <= 0) continue; // no key before "=", or no "=" at all
    const name = entry.slice(0, separator).trim();
    if (process.env[name]) continue; // never override an existing value
    process.env[name] = entry.slice(separator + 1).trim();
  }
} catch {
  // File missing or unreadable: rely on the ambient environment.
}

// .env.local is consulted only for BRAVE_API_KEY, again without overriding.
try {
  const prefix = "BRAVE_API_KEY=";
  const keyLines = readFileSync(resolve(__dirname, "../.env.local"), "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry.startsWith(prefix));

  for (const entry of keyLines) {
    if (process.env.BRAVE_API_KEY) continue;
    process.env.BRAVE_API_KEY = entry.slice(prefix.length).trim();
  }
} catch {
  // File missing or unreadable: the key may still come from env or CLI args.
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
/** Columns selected from the diseases table for each row that still lacks an image. */
interface DiseaseRow {
  /** Primary key; used in the UPDATE statements and in the progress file. */
  id: string;
  /** Display name; first term of the search query. */
  name: string;
  /** Scientific name; interpolated into the search query as-is. */
  scientificName: string;
  /** Severity label; rows are ordered critical, high, moderate, low. */
  severity: string;
  /** Plant slug; hyphens become spaces to form the plant name in the query. */
  plantId: string;
}
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Pause between Brave requests in ms (1 req/sec). */
const BRAVE_DELAY = 1100;
/** Number of found image URLs buffered before they are written to the DB. */
const DB_FLUSH_BATCH = 50;
/** Per-key call budget; leaves a 200 buffer of the 2000/mo limit. */
const MAX_PER_KEY = 1800;
/** Progress file that makes the run resumable. */
const STATE_FILE = resolve(__dirname, ".brave-progress.json");

// Mutable run state shared by the search helper, the state file and main().
/** Index into `braveKeys` of the key currently in use. */
let currentKeyIndex = 0;
/** All available Brave API keys (filled in by main()). */
let braveKeys: string[] = [];
/** Successful calls made with the current key. */
let callsThisKey = 0;
/** Running total of images found, as persisted in the state file. */
let totalFound = 0;
|
||||
// totalSkipped tracking removed — not needed for v2
|
||||
|
||||
// ─── State persistence ───────────────────────────────────────────────────────
|
||||
|
||||
/** Shape of the JSON progress file written by saveState() and read by loadState(). */
interface RunState {
  /** Ids of diseases already attempted, whether or not an image was found. */
  processedIds: string[];
  /** Index of the Brave key in use when the state was saved. */
  currentKeyIndex: number;
  /** Calls recorded against that key when the state was saved. */
  callsThisKey: number;
  /** Value of the found-images counter when the state was saved. */
  totalFound: number;
}
|
||||
|
||||
/**
 * Reads the progress file left by a previous run.
 *
 * @returns The parsed file contents, or `null` when the file cannot be read or
 *   is not valid JSON. The parsed value is returned without shape validation.
 */
function loadState(): RunState | null {
  let serialized: string;
  try {
    serialized = readFileSync(STATE_FILE, "utf-8");
  } catch {
    return null; // no previous run, or the file is unreadable
  }

  try {
    return JSON.parse(serialized);
  } catch {
    return null; // corrupt progress file
  }
}
|
||||
|
||||
/**
 * Writes the progress file: the given processed ids plus the module-level key
 * index, per-key call count and found counter, as 2-space indented JSON.
 *
 * @param processedIds Ids of every disease attempted so far.
 */
function saveState(processedIds: string[]) {
  const snapshot = { processedIds, currentKeyIndex, callsThisKey, totalFound };
  const serialized = JSON.stringify(snapshot, null, 2);
  writeFileSync(STATE_FILE, serialized, "utf-8");
}
|
||||
|
||||
// ─── Brave API ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Queries the Brave image search endpoint with the currently selected key.
 *
 * @param query Free-text search query.
 * @returns An image URL (thumbnail source preferred over the result URL, and
 *   non-stock hosts preferred over stock-photo hosts); the sentinel string
 *   `"RATE_LIMITED"` on HTTP 429; or `null` when there is no key, the response
 *   is not OK, there are no results, or three attempts threw.
 */
async function braveImageSearch(query: string): Promise<string | null> {
  const token = braveKeys[currentKeyIndex];
  if (!token) return null;

  const endpoint = new URL("https://api.search.brave.com/res/v1/images/search");
  endpoint.searchParams.set("q", query);
  endpoint.searchParams.set("count", "3");

  const stockHosts = /(dreamstime|shutterstock|alamy|istock|123rf)/i;

  let attemptsLeft = 3;
  while (attemptsLeft-- > 0) {
    try {
      const response = await fetch(endpoint.toString(), {
        headers: { "X-Subscription-Token": token, Accept: "application/json" },
      });

      if (response.status === 429) {
        console.log("\n [RATE LIMITED] Key " + (currentKeyIndex + 1) + " exhausted!");
        return "RATE_LIMITED";
      }
      if (!response.ok) return null;

      // Only successful responses are counted against the per-key budget.
      callsThisKey++;
      const payload = (await response.json()) as {
        results?: Array<{ url: string; thumbnail?: { src?: string } }>;
      };
      const candidates = (payload?.results ?? []).map((hit) => hit.thumbnail?.src ?? hit.url);
      if (candidates.length === 0) return null;

      // Prefer non-stock images; otherwise fall back to the first hit.
      const preferred = candidates.find((src) => src && !stockHosts.test(src));
      return preferred ?? candidates[0];
    } catch {
      // Network or parse failure: wait briefly, then retry.
      await new Promise((wake) => setTimeout(wake, 2000));
    }
  }
  return null;
}
|
||||
|
||||
// ─── DuckDuckGo fallback ────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Image lookup through the optional `duckduckgo-images-api` package.
 *
 * @param query Free-text search query.
 * @returns The first non-stock image URL, else the first result's image, else
 *   `null` (package not importable, no results, or the search threw).
 */
async function ddgFallbackSearch(query: string): Promise<string | null> {
  try {
    // The package is optional; a failed import simply disables the fallback.
    const ddg = await import("duckduckgo-images-api").catch(() => null);
    if (!ddg) return null;

    const hits = await ddg.image_search({ query, moderate: true });
    if (!hits || hits.length === 0) return null;

    for (const hit of hits) {
      const isStock = /(dreamstime|shutterstock|alamy|istock|123rf)/i.test(hit.image);
      if (hit.image && !isStock) return hit.image;
    }
    return hits[0].image || null;
  } catch {
    // Search failed; treat as "no image".
    return null;
  }
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Finds images for every disease whose image_url is empty, highest severity
 * first, using Brave image search and falling back to DuckDuckGo once all
 * Brave keys are spent. Found URLs are written to the DB in batches, progress
 * is checkpointed to the state file, and a Markdown report of still-missing
 * diseases is written at the end.
 *
 * Fixes over the previous version:
 *  - severity ordering used `|| 99`, which turned critical (rank 0) into 99 and
 *    sorted critical diseases last; it now uses `??`.
 *  - the "RATE_LIMITED" sentinel could be stored as an image URL when the key
 *    rotated to was rate limited too; it is now never treated as a URL.
 *  - the persisted `totalFound` was never incremented.
 *  - state is only saved after pending updates are flushed, so a resumed run
 *    cannot skip ids whose URLs never reached the DB.
 *  - a saved key index that does not exist in this run's key list is ignored.
 *  - a failed runtime install of the DuckDuckGo package no longer aborts the run.
 */
async function main() {
  console.log("\n🔍 Brave Disease Image Filler v2\n");

  // Parse keys from args + env (de-duplicated, env key first)
  const argsKeys = process.argv.slice(2).filter((a) => !a.startsWith("-"));
  const envKey = process.env.BRAVE_API_KEY;
  braveKeys = [...new Set([envKey, ...argsKeys].filter((k): k is string => Boolean(k)))];

  if (braveKeys.length === 0) {
    console.log("❌ No Brave API keys found.");
    console.log(" Set BRAVE_API_KEY in .env.local or pass as argument.\n");
    process.exit(1);
  }
  console.log(`🔑 ${braveKeys.length} Brave API key(s) available\n`);

  // Load state
  const state = loadState();
  const savedIds = state?.processedIds;
  const previouslyProcessed: string[] = Array.isArray(savedIds) ? savedIds : [];
  if (state) {
    if (state.currentKeyIndex >= 0 && state.currentKeyIndex < braveKeys.length) {
      currentKeyIndex = state.currentKeyIndex;
      callsThisKey = state.callsThisKey;
    } else {
      // Fewer keys than last time: the saved index would select no key at all.
      console.log(" ⚠️ Saved key index is not available in this run; starting from key 1.\n");
    }
    totalFound = state.totalFound;
    console.log(
      `📋 Resuming from previous run (${previouslyProcessed.length} processed, ${totalFound} found)\n`,
    );
  }
  const foundBeforeRun = totalFound;

  // Get diseases from DB
  const db = getDb();
  const allDiseases = (await db
    .select({
      id: diseases.id,
      name: diseases.name,
      scientificName: diseases.scientificName,
      severity: diseases.severity,
      plantId: diseases.plantId,
    })
    .from(diseases)
    .where(sql`(image_url IS NULL OR image_url = '')`)
    .all()) as DiseaseRow[];

  console.log(`📋 ${allDiseases.length} diseases need images\n`);

  if (allDiseases.length === 0) {
    console.log("✅ All diseases already have images!\n");
    closeDb();
    return;
  }

  // Sort by severity priority. `??` (not `||`) so that rank 0 stays 0.
  const severityOrder: Record<string, number> = { critical: 0, high: 1, moderate: 2, low: 3 };
  allDiseases.sort(
    (a, b) => (severityOrder[a.severity] ?? 99) - (severityOrder[b.severity] ?? 99),
  );

  // Filter out already-processed from state
  const processedSet = new Set(previouslyProcessed);
  const pending = allDiseases.filter((d) => !processedSet.has(d.id));
  const pendingWith = (severity: string) => pending.filter((d) => d.severity === severity).length;

  console.log(
    `📊 Prioritization: critical=${pendingWith("critical")}, high=${pendingWith("high")}, moderate=${pendingWith("moderate")}, low=${pendingWith("low")}\n`,
  );

  if (pending.length === 0) {
    console.log("✅ All remaining diseases already attempted\n");
    closeDb();
    return;
  }

  const raw = createClient({
    url: process.env.DATABASE_URL!,
    authToken: process.env.DATABASE_TOKEN!,
  });

  const updates: Array<{ id: string; url: string }> = [];
  const processedIds: string[] = [...previouslyProcessed];
  let ddgMode = false;

  /** Writes all buffered image URLs to the DB and empties the buffer. */
  const flushUpdates = async (): Promise<void> => {
    if (updates.length === 0) return;
    const batch = updates.splice(0);
    await raw.batch(
      batch.map((u) => ({
        sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
        args: [u.url, u.id],
      })),
      "write",
    );
    console.log(` → Flushed ${batch.length} to DB`);
  };

  /** Best-effort install of the DuckDuckGo package when it cannot be imported. */
  const ensureDdgPackage = async (): Promise<void> => {
    try {
      await import("duckduckgo-images-api");
    } catch {
      console.log(" Installing duckduckgo-images-api...");
      try {
        const { execSync } = await import("child_process");
        execSync("npm install duckduckgo-images-api", {
          cwd: resolve(__dirname, ".."),
          stdio: "pipe",
        });
        console.log(" Done.\n");
      } catch (installError) {
        // Without the package ddgFallbackSearch just returns null; keep going.
        console.log(
          ` ⚠️ Install failed: ${installError instanceof Error ? installError.message : String(installError)}\n`,
        );
      }
    }
  };

  for (let i = 0; i < pending.length; i++) {
    const d = pending[i];

    // Check if current key needs rotating
    if (!ddgMode && callsThisKey >= MAX_PER_KEY) {
      if (currentKeyIndex < braveKeys.length - 1) {
        currentKeyIndex++;
        callsThisKey = 0;
        console.log(`\n 🔄 Rotating to key ${currentKeyIndex + 1}/${braveKeys.length}\n`);
      } else {
        console.log(
          `\n ⚠️ All ${braveKeys.length} Brave keys exhausted. Switching to DuckDuckGo fallback.\n`,
        );
        ddgMode = true;
        await ensureDdgPackage();
      }
    }

    // Build search query
    const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
    const query = `${d.name} ${d.scientificName} ${plantName} plant disease`;
    const sev = d.severity.padEnd(8);

    process.stdout.write(
      ` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 40).padEnd(42)} `,
    );

    let url: string | null = null;

    if (!ddgMode) {
      url = await braveImageSearch(query);
      // A 429 means the key is spent: walk through the remaining keys.
      while (url === "RATE_LIMITED" && currentKeyIndex < braveKeys.length - 1) {
        currentKeyIndex++;
        callsThisKey = 0;
        console.log("\n 🔄 Rotating key...");
        url = await braveImageSearch(query);
      }
      if (url === "RATE_LIMITED") {
        // The sentinel must never be stored as an image URL.
        console.log("\n ⚠️ All keys exhausted mid-batch!");
        url = null;
        ddgMode = true;
        await ensureDdgPackage();
      }
    }

    if (ddgMode) {
      url = await ddgFallbackSearch(query);
      if (!url) {
        // Try a simpler query
        url = await ddgFallbackSearch(`${d.name} disease`);
      }
    }

    processedIds.push(d.id); // Mark as attempted even if not found
    if (url) {
      updates.push({ id: d.id, url });
      totalFound++;
      console.log("✅");
    } else {
      console.log("❌");
    }

    // Flush to DB
    if (updates.length >= DB_FLUSH_BATCH) {
      await flushUpdates();
    }

    // Save state every 50, after persisting whatever the state will claim as done.
    if ((i + 1) % 50 === 0) {
      await flushUpdates();
      saveState(processedIds);
    }

    // Rate limit (even for DDG to be polite)
    await new Promise((r) => setTimeout(r, ddgMode ? 500 : BRAVE_DELAY));
  }

  // Final flush
  await flushUpdates();
  saveState(processedIds);
  raw.close();

  // Final report
  const finalList = await db
    .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
    .from(diseases)
    .all();
  const w = finalList.filter((d) => d.imageUrl);
  const wo = finalList.filter((d) => !d.imageUrl);

  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 BRAVE IMAGE SEARCH COMPLETE`);
  console.log(`${"═".repeat(50)}`);
  console.log(` Processed: ${pending.length}`);
  console.log(` Found this run: ${totalFound - foundBeforeRun}`);
  console.log(` Total with images: ${w.length}/${finalList.length}`);
  console.log(` Still missing: ${wo.length}`);
  console.log(` Brave keys used: ${currentKeyIndex + 1}`);
  console.log(` Calls on current key: ${callsThisKey}`);
  console.log(` DuckDuckGo mode: ${ddgMode}`);

  if (wo.length > 0) {
    const rp = resolve(__dirname, ".disease-image-review-needed.md");
    let report = "# Disease Images - Still Missing\n\n";
    report += `Generated: ${new Date().toISOString()}\n\n`;
    report += `## Summary\n\n`;
    report += `- Total: ${finalList.length}\n`;
    report += `- With images: ${w.length}\n`;
    report += `- Still missing: ${wo.length}\n\n`;
    report += `## Missing Diseases\n\n`;
    for (const d of wo) {
      report += `- ${d.name} (\`${d.id}\`)\n`;
    }
    writeFileSync(rp, report, "utf-8");
    console.log(`\n📝 Report: ${rp}`);
  } else {
    console.log("\n✅ ALL diseases now have images!");
  }

  closeDb();
  console.log("\n");
}
|
||||
|
||||
// Entry point: on any unhandled rejection, log it and exit non-zero.
main().catch((failure) => {
  console.error("\n❌", failure);
  process.exit(1);
});
|
||||
152
scripts/fill-brave-images.ts
Normal file
152
scripts/fill-brave-images.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-brave-images.ts — Brave-only pass for remaining disease images.
|
||||
*
|
||||
* Runs at 1 request/sec (Brave rate limit).
|
||||
* Updates diseases.json and Turso DB.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-brave-images.ts
|
||||
*/
|
||||
|
||||
import dotenv from "dotenv"; dotenv.config({ path: resolve(__dirname, "../.env.local") });
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { closeDb } from "../src/lib/db/index";
|
||||
|
||||
/** Seed file that is read, updated in memory and rewritten by this script. */
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
/** Brave subscription token; an empty string means "not configured". */
const BRAVE_KEY = process.env.BRAVE_API_KEY ?? "";
|
||||
|
||||
/** One entry of diseases.json; only the fields this script touches are typed. */
interface DiseaseSeed {
  /** Disease id; used as the key of the DB UPDATE. */
  id: string;
  /** Id of the host plant. */
  plantId: string;
  /** Display name; used in the search query and in the console output. */
  name: string;
  scientificName: string;
  /** Absent or empty when the disease still needs an image. */
  imageUrl?: string;
  /** Any other seed fields are carried through unchanged. */
  [key: string]: unknown;
}
|
||||
|
||||
/**
 * Reads and parses the disease seed file.
 *
 * @returns The parsed array; the contents are cast, not validated.
 * @throws If the file cannot be read or is not valid JSON.
 */
function load(): DiseaseSeed[] {
  const serialized = readFileSync(DISEASES_JSON, "utf-8");
  return JSON.parse(serialized) as DiseaseSeed[];
}
|
||||
|
||||
/**
 * Queries the Brave image search endpoint with BRAVE_KEY.
 *
 * Makes up to three attempts. HTTP 429 waits 5 s, 10 s, then 20 s before the
 * next attempt; a thrown error waits 2 s.
 *
 * @param query Free-text search query.
 * @returns An image URL (thumbnail source preferred over the result URL, and
 *   non-stock hosts preferred over stock-photo hosts), or `null` when the
 *   response is not OK, there are no results, or all attempts are used up.
 */
async function searchBraveImage(query: string): Promise<string | null> {
  const endpoint = new URL("https://api.search.brave.com/res/v1/images/search");
  endpoint.searchParams.set("q", query);
  endpoint.searchParams.set("count", "3");

  const stockHosts = /(dreamstime|shutterstock|alamy|istock|123rf)/i;
  const pause = (ms: number) => new Promise((wake) => setTimeout(wake, ms));

  let attempt = 0;
  while (attempt < 3) {
    try {
      const response = await fetch(endpoint.toString(), {
        headers: { "X-Subscription-Token": BRAVE_KEY, Accept: "application/json" },
      });

      if (response.status === 429) {
        // Exponential back-off, then retry.
        await pause(5000 * 2 ** attempt);
      } else {
        if (!response.ok) return null;

        const payload = (await response.json()) as {
          results?: Array<{ url: string; thumbnail?: { src?: string } }>;
        };
        const candidates = (payload?.results ?? []).map((hit) => hit.thumbnail?.src ?? hit.url);
        if (candidates.length === 0) return null;

        // Prefer non-stock direct-looking images
        const preferred = candidates.find((src) => src && !stockHosts.test(src));
        return preferred ?? candidates[0];
      }
    } catch {
      await pause(2000);
    }
    attempt++;
  }
  return null;
}
|
||||
|
||||
/**
 * Looks up an image for every seed disease without one, rewrites
 * diseases.json, mirrors the new URLs into the DB when credentials are
 * present, and prints a summary.
 *
 * Fixes over the previous version:
 *  - the plant name was looked up by searching the disease list for an entry
 *    whose id equals the plantId, which is a different id namespace; the name
 *    is now derived from the plantId slug itself.
 *  - the DB client is closed even when a batch update throws.
 *  - the summary uses the in-memory list instead of re-reading the file that
 *    was just written.
 */
async function main() {
  console.log("\n🔍 Brave Image Search — remaining disease images\n");

  if (!BRAVE_KEY) {
    console.log("❌ No BRAVE_API_KEY in .env.local\n");
    process.exit(1);
  }

  const diseases = load();
  const pending = diseases.filter((d) => !d.imageUrl);
  console.log(`📋 ${pending.length} diseases need images\n`);

  let found = 0;

  for (let i = 0; i < pending.length; i++) {
    const d = pending[i];
    // "sweet-potato" becomes "Sweet Potato"
    const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
    const query = `${d.name} ${plantName} plant disease symptom`;

    process.stdout.write(` [${String(i + 1).padStart(2, " ")}/${pending.length}] ${d.name.padEnd(35)} `);

    const url = await searchBraveImage(query);
    if (url) {
      d.imageUrl = url; // mutates the entry inside `diseases`, which is written below
      found++;
      console.log(`✅`);
    } else {
      console.log(`❌`);
    }

    // 1 req/sec rate limit
    await new Promise((r) => setTimeout(r, 1100));
  }

  // Write updated JSON
  writeFileSync(DISEASES_JSON, JSON.stringify(diseases, null, 2) + "\n", "utf-8");
  console.log(`\n✅ diseases.json updated: ${found}/${pending.length} images found\n`);

  // Update DB (best effort: a DB failure is reported but does not fail the run)
  try {
    const dbUrl = process.env.DATABASE_URL;
    const dbToken = process.env.DATABASE_TOKEN;
    if (dbUrl && dbToken) {
      const raw = createClient({ url: dbUrl, authToken: dbToken });
      const updates = pending.filter((d) => d.imageUrl);
      try {
        for (let i = 0; i < updates.length; i += 50) {
          await raw.batch(
            updates.slice(i, i + 50).map((d) => ({
              sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
              args: [d.imageUrl!, d.id],
            })),
            "write",
          );
        }
      } finally {
        raw.close();
      }
      console.log(`✅ Turso DB updated: ${updates.length} rows`);
    } else {
      console.log("⏭️ Skipping DB — no DATABASE_URL/TOKEN");
    }
  } catch (err) {
    console.log(` ⚠️ DB: ${err instanceof Error ? err.message : err}`);
  }

  // Summary
  const stillMissing = diseases.filter((d) => !d.imageUrl);
  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 FINAL: ${diseases.length} total`);
  console.log(` With images: ${diseases.length - stillMissing.length}`);
  console.log(` Still missing: ${stillMissing.length}`);
  if (stillMissing.length > 0) {
    console.log(`\nStill need human curation:`);
    for (const d of stillMissing) {
      console.log(` ❌ ${d.name} (${d.id})`);
    }
  }
  console.log(`${"═".repeat(50)}\n`);

  closeDb();
}
|
||||
|
||||
// Entry point: on any unhandled rejection, log it and exit non-zero.
main().catch((failure) => {
  console.error("\n❌ Fatal:", failure);
  process.exit(1);
});
|
||||
268
scripts/fill-ddg-images.ts
Normal file
268
scripts/fill-ddg-images.ts
Normal file
@@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-ddg-images.ts — DuckDuckGo Image Search for remaining disease images.
|
||||
*
|
||||
* No API key needed. Searches DuckDuckGo Images API for each disease
|
||||
* without an image and updates the Turso DB.
|
||||
*
|
||||
* Prioritizes by severity (critical → high → moderate → low).
|
||||
* Pauses about 0.8 s (POLITE_DELAY) between diseases to be polite to DuckDuckGo.
|
||||
* Resumable via state file (scripts/.ddg-progress.json).
|
||||
*
|
||||
* Usage:
|
||||
* cd apps/web && npx tsx scripts/fill-ddg-images.ts
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Best-effort load of .env.development for DB creds. KEY=VALUE lines are
// copied into process.env unless the variable already holds a non-empty value.
const envPath = resolve(__dirname, "../.env.development");
try {
  const assignments = readFileSync(envPath, "utf-8")
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry !== "" && !entry.startsWith("#"));

  for (const entry of assignments) {
    const separator = entry.indexOf("=");
    if (separator <= 0) continue; // no key before "=", or no "=" at all
    const name = entry.slice(0, separator).trim();
    if (process.env[name]) continue; // never override an existing value
    process.env[name] = entry.slice(separator + 1).trim();
  }
} catch {
  // File missing or unreadable: rely on the ambient environment.
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// DuckDuckGo
|
||||
import { imageSearch } from "@mudbill/duckduckgo-images-api";
|
||||
|
||||
/** Columns selected from the diseases table for each row that still lacks an image. */
interface DiseaseRow {
  /** Primary key; used in the UPDATE statements and in the progress file. */
  id: string;
  /** Display name; part of every search query. */
  name: string;
  /** Scientific name; used in one query variant, falling back to `name` when empty. */
  scientificName: string;
  /** Severity label; rows are ordered critical, high, moderate, low. */
  severity: string;
  /** Plant slug; hyphens become spaces to form the plant name in the query. */
  plantId: string;
}
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Pause in ms between calls. */
const POLITE_DELAY = 800;
/** Number of found image URLs buffered before they are written to the DB. */
const DB_FLUSH_BATCH = 50;
/** Progress file that makes the run resumable. */
const STATE_FILE = resolve(__dirname, ".ddg-progress.json");
|
||||
|
||||
/** Shape of the JSON progress file written by saveState() and read by loadState(). */
interface RunState {
  /** Ids of diseases already attempted, whether or not an image was found. */
  processedIds: string[];
  /** Cumulative number of images found across runs. */
  totalFound: number;
}
|
||||
|
||||
/**
 * Reads the progress file left by a previous run.
 *
 * @returns The parsed file contents, or `null` when the file cannot be read or
 *   is not valid JSON. The parsed value is returned without shape validation.
 */
function loadState(): RunState | null {
  let serialized: string;
  try {
    serialized = readFileSync(STATE_FILE, "utf-8");
  } catch {
    return null; // no previous run, or the file is unreadable
  }

  try {
    return JSON.parse(serialized);
  } catch {
    return null; // corrupt progress file
  }
}
|
||||
|
||||
/**
 * Writes the progress file as 2-space indented JSON.
 *
 * @param processedIds Ids of every disease attempted so far.
 * @param totalFound Cumulative number of images found.
 */
function saveState(processedIds: string[], totalFound: number) {
  const snapshot = { processedIds, totalFound };
  const serialized = JSON.stringify(snapshot, null, 2);
  writeFileSync(STATE_FILE, serialized, "utf-8");
}
|
||||
|
||||
// ─── DuckDuckGo Search ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Runs one DuckDuckGo image search.
 *
 * @param query Free-text search query.
 * @returns The first non-stock image URL; otherwise the first result's image
 *   or thumbnail; `null` when there are no results or the search throws.
 */
async function searchImage(query: string): Promise<string | null> {
  try {
    const hits = await imageSearch({ query, safe: true, iterations: 1, retries: 2 });
    if (!hits || hits.length === 0) return null;

    // Prefer non-stock images
    const stockHosts = /(dreamstime|shutterstock|alamy|istock|123rf)/i;
    const nonStock = hits.find((hit) => hit.image && !stockHosts.test(hit.image));
    if (nonStock) return nonStock.image;

    const [first] = hits;
    return first.image || first.thumbnail || null;
  } catch {
    // DuckDuckGo may block or timeout; silently skip
    return null;
  }
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Finds images via DuckDuckGo for every disease whose image_url is empty,
 * highest severity first. Found URLs are written to the DB in batches,
 * progress is checkpointed to the state file, and a Markdown report of
 * still-missing diseases is written at the end.
 *
 * Fixes over the previous version:
 *  - state is only saved after pending updates are flushed, so a resumed run
 *    cannot skip ids whose URLs never reached the DB.
 *  - the fallback queries for one disease are spaced by POLITE_DELAY instead
 *    of being fired back-to-back.
 */
async function main() {
  console.log("\n🦆 DuckDuckGo Disease Image Filler\n");

  const db = getDb();

  // Load state
  const state = loadState();
  const savedIds = state?.processedIds;
  const previouslyProcessed: string[] = Array.isArray(savedIds) ? savedIds : [];
  const processedSet = new Set(previouslyProcessed);
  const totalFoundPrev = state?.totalFound ?? 0;

  // Get all diseases that still need images
  const allDiseases = (await db
    .select({
      id: diseases.id,
      name: diseases.name,
      scientificName: diseases.scientificName,
      severity: diseases.severity,
      plantId: diseases.plantId,
    })
    .from(diseases)
    .where(sql`(image_url IS NULL OR image_url = '')`)
    .all()) as DiseaseRow[];

  console.log(`📋 ${allDiseases.length} diseases need images\n`);

  if (allDiseases.length === 0) {
    console.log("✅ All diseases already have images!\n");
    closeDb();
    return;
  }

  // Sort by severity: critical > high > moderate > low
  const severityOrder: Record<string, number> = { critical: 0, high: 1, moderate: 2, low: 3 };
  allDiseases.sort((a, b) => (severityOrder[a.severity] ?? 99) - (severityOrder[b.severity] ?? 99));

  // Filter out already-processed
  const pending = allDiseases.filter((d) => !processedSet.has(d.id));
  const pendingWith = (severity: string) => pending.filter((d) => d.severity === severity).length;

  console.log(
    `📊 Remaining: critical=${pendingWith("critical")}, ` +
      `high=${pendingWith("high")}, ` +
      `moderate=${pendingWith("moderate")}, ` +
      `low=${pendingWith("low")}\n`,
  );

  if (pending.length === 0) {
    console.log("✅ All remaining diseases already attempted\n");
    closeDb();
    return;
  }

  const raw = createClient({
    url: process.env.DATABASE_URL!,
    authToken: process.env.DATABASE_TOKEN!,
  });

  const processedIds: string[] = [...previouslyProcessed];
  let found = totalFoundPrev;
  const updates: Array<{ id: string; url: string }> = [];
  const pause = () => new Promise((r) => setTimeout(r, POLITE_DELAY));

  /** Writes all buffered image URLs to the DB and empties the buffer. */
  const flushUpdates = async (): Promise<void> => {
    if (updates.length === 0) return;
    const batch = updates.splice(0);
    await raw.batch(
      batch.map((u) => ({
        sql: "UPDATE diseases SET image_url = ?, updated_at = datetime() WHERE id = ?",
        args: [u.url, u.id],
      })),
      "write",
    );
    console.log(` → Flushed ${batch.length} to DB`);
  };

  for (let i = 0; i < pending.length; i++) {
    const d = pending[i];
    const sev = d.severity.padEnd(8);

    // Build search queries — "[disease] on [plant]" phrasing for better specificity
    const plantName = d.plantId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
    const queries = [
      `${d.name} on ${plantName} plant disease`,
      `${d.scientificName || d.name} on ${plantName} disease`,
      `${d.name} plant disease ${plantName}`,
      `${d.name} plant`,
      `${d.name} symptom`,
    ];

    process.stdout.write(
      ` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
    );

    // Try queries in order until we get a result, pausing between attempts.
    let url: string | null = null;
    for (let q = 0; q < queries.length && !url; q++) {
      if (q > 0) await pause();
      url = await searchImage(queries[q]);
    }

    processedIds.push(d.id); // Mark as attempted even if not found
    if (url) {
      updates.push({ id: d.id, url });
      found++;
      console.log("✅");
    } else {
      console.log("❌");
    }

    // Flush to DB in batches
    if (updates.length >= DB_FLUSH_BATCH) {
      await flushUpdates();
    }

    // Save state every 50, after persisting whatever the state will claim as done.
    if ((i + 1) % 50 === 0) {
      await flushUpdates();
      saveState(processedIds, found);
    }

    // Be polite between diseases
    await pause();
  }

  // Final flush
  await flushUpdates();
  saveState(processedIds, found);
  raw.close();

  // Final report
  const finalList = await db
    .select({ id: diseases.id, name: diseases.name, imageUrl: diseases.imageUrl })
    .from(diseases)
    .all();
  const w = finalList.filter((d) => d.imageUrl);
  const wo = finalList.filter((d) => !d.imageUrl);

  console.log(`\n${"═".repeat(50)}`);
  console.log(`🦆 DUCKDUCKGO SEARCH COMPLETE`);
  console.log(`${"═".repeat(50)}`);
  console.log(` Processed: ${pending.length}`);
  console.log(` Found this run: ${found - totalFoundPrev}`);
  console.log(` Total with images: ${w.length}/${finalList.length}`);
  console.log(` Still missing: ${wo.length}`);

  if (wo.length > 0) {
    const reportPath = resolve(__dirname, ".ddg-image-review-needed.md");
    let report = "# Disease Images - Still Missing (DDG)\n\n";
    report += `Generated: ${new Date().toISOString()}\n\n`;
    report += `## Summary\n\n`;
    report += `- Total: ${finalList.length}\n`;
    report += `- With images: ${w.length}\n`;
    report += `- Still missing: ${wo.length}\n\n`;
    report += `## Missing Diseases\n\n`;
    for (const d of wo) {
      report += `- ${d.name} (\`${d.id}\`)\n`;
    }
    writeFileSync(reportPath, report, "utf-8");
    console.log(`\n📝 Missing report: ${reportPath}`);
  } else {
    console.log("\n✅ ALL diseases now have images!");
  }

  closeDb();
  console.log();
}
|
||||
|
||||
// Script entry point: any rejection from main() is reported and mapped to exit code 1.
main().catch((error) => {
  console.error("\n❌ Fatal:", error);
  process.exit(1);
});
|
||||
440
scripts/fill-disease-images.ts
Normal file
440
scripts/fill-disease-images.ts
Normal file
@@ -0,0 +1,440 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-disease-images.ts — Three-stage disease image pipeline
|
||||
*
|
||||
* For every disease without an imageUrl, tries:
|
||||
* Stage 1 — Wikipedia search → pageimages
|
||||
* Stage 2 — Wikimedia Commons search
|
||||
* Stage 3 — Brave Image Search API (fallback, 1 req/sec, 2000/mo)
|
||||
*
|
||||
* Updates both diseases.json (seed) and the Turso DB.
|
||||
* Flags anything found only via Brave for human review.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-disease-images.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync, writeFileSync, existsSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { closeDb } from "../src/lib/db/index";
|
||||
|
||||
// ─── Types & Config ──────────────────────────────────────────────────────────
|
||||
|
||||
/** One entry of the diseases.json seed file, as consumed by this script. */
interface DiseaseSeed {
  /** Key used for the results map and for the `WHERE id = ?` DB update. */
  id: string;
  /** Passed to getPlantName() to build the search queries. */
  plantId: string;
  /** Display name; used in search queries and progress output. */
  name: string;
  /** Used for the Commons query unless it contains "spp." or "/". */
  scientificName: string;
  commonName?: string;
  /** Remaining seed fields (e.g. `imageUrl`) are read loosely and written back untouched. */
  [key: string]: unknown;
}
|
||||
|
||||
/** Outcome of the image lookup for one disease, persisted in the results file. */
interface ImageResult {
  /** Image URL; the empty string when nothing was found. */
  url: string;
  /** Which stage produced the URL ("missing" when none did). */
  source: "wikipedia" | "commons" | "brave" | "missing";
  /** "good" for Wikipedia/Commons hits, "fallback" for Brave hits, "missing" otherwise. */
  quality: "good" | "fallback" | "missing";
}
|
||||
|
||||
// Files read and written by this script (all resolved relative to the script directory).
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json"); // seed file, updated in place
const RESULTS_FILE = resolve(__dirname, ".image-results.json"); // lookup results, reused across runs
const REPORT_FILE = resolve(__dirname, ".image-review-needed.md"); // human-review report

// MediaWiki endpoints.
const WIKI_API = "https://en.wikipedia.org/w/api.php";
const COMMONS_API = "https://commons.wikimedia.org/w/api.php";

// Brave Image Search settings; the Brave stage is skipped when the key is empty.
const BRAVE_KEY = process.env.BRAVE_API_KEY ?? "";
const BRAVE_DELAY = 1100; // ms to wait between Brave requests
const MAX_BRAVE = 2000; // upper bound on Brave calls per run

// Sent as the User-Agent header on MediaWiki requests.
const UA = "PlantHealthKB/1.0 (plant-disease-id)";
// Value of the `origin` query parameter on MediaWiki requests.
const ORIGIN = "*";

// Number of successful Brave responses received so far in this run.
let braveCount = 0;
|
||||
|
||||
// ─── Wikipedia Stage ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Search Wikipedia and get thumbnails in ONE API call using generator=search
 * combined with prop=pageimages.
 *
 * A 429 response is retried with exponential backoff and a thrown error
 * (network failure, timeout, bad JSON) is retried after 2 s, up to three
 * attempts in total. Any other non-OK status gives up immediately.
 *
 * @param query Full-text search string sent as `gsrsearch`.
 * @returns Thumbnail URL of the best-ranked search hit that has one, or null.
 */
async function wikiSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "5",
    prop: "pageimages",
    pithumbsize: "600",
    format: "json",
    origin: ORIGIN,
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        // Rate limited: wait 3 s, 6 s, 12 s before the next attempt.
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: {
          pages?: Record<string, { index?: number; thumbnail?: { source: string } }>;
        };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;

      // `pages` is keyed by page ID, so plain iteration visits pages in page-ID
      // order and loses the search ranking. The search generator reports each
      // hit's rank in `index`; sort by it so the most relevant hit wins.
      // Pages without an index keep their relative order at the end.
      const ranked = Object.values(pages).sort(
        (a, b) =>
          (a?.index ?? Number.MAX_SAFE_INTEGER) - (b?.index ?? Number.MAX_SAFE_INTEGER),
      );
      for (const page of ranked) {
        const src = page?.thumbnail?.source;
        if (src) return src;
      }
      return null;
    } catch {
      await delay(2000);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Stage 1: look for a Wikipedia thumbnail for a disease.
 *
 * Issues a single combined search + thumbnail request via wikiSearchAndThumb().
 *
 * @param d Disease seed entry; only `name` is used.
 * @param plantName Plant name appended to the query.
 * @returns Thumbnail URL, or null when the search produced none.
 */
async function wikiStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  // Most specific form: the disease name as an exact phrase, plus the plant name.
  const query = `"${d.name}" ${plantName}`;
  const thumbnail = await wikiSearchAndThumb(query);
  return thumbnail;
}
|
||||
|
||||
// ─── Commons Stage ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * fetch() wrapper that aborts the request when no response arrives within `ms` milliseconds.
 *
 * The abort timer is cancelled as soon as fetch() settles, whether it resolved
 * with a Response or rejected.
 *
 * @param url Request URL.
 * @param opts fetch options; a `signal` in here is replaced by the timeout signal.
 * @param ms Timeout in milliseconds (default 15000).
 * @returns The Response from fetch().
 */
async function fetchWithTimeout(url: string, opts: RequestInit, ms = 15000): Promise<Response> {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), ms);
  const request = fetch(url, { ...opts, signal: controller.signal });
  return request.finally(() => clearTimeout(abortTimer));
}
|
||||
|
||||
/**
 * Search Wikimedia Commons (File namespace) and return an image URL for the
 * best-ranked hit.
 *
 * Two requests are made: a `list=search` query for up to five file pages, then
 * one `prop=imageinfo` query for all of their page IDs. A 429 on the search
 * request is retried with exponential backoff and thrown errors are retried
 * after 2 s, up to three attempts. Any other non-OK response gives up.
 *
 * @param query Full-text search string sent as `srsearch`.
 * @returns The 600px-wide thumbnail URL (or the original file URL when no
 *   thumbnail URL is present) of the first hit with image info, or null.
 */
async function commonsSearchAndThumb(query: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    list: "search",
    srsearch: query,
    srnamespace: "6",
    srlimit: "5",
    format: "json",
    origin: ORIGIN,
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(`${COMMONS_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        await delay(3000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { search?: Array<{ pageid: number; title: string }> };
      };
      const hits = data?.query?.search ?? [];
      if (hits.length === 0) return null;

      // Batch-fetch imageinfo for all found page IDs
      const pageids = hits.map((h) => h.pageid).join("|");
      const imgParams = new URLSearchParams({
        action: "query",
        pageids,
        prop: "imageinfo",
        iiprop: "url",
        iiurlwidth: "600",
        format: "json",
        origin: ORIGIN,
      });

      const imgRes = await fetchWithTimeout(`${COMMONS_API}?${imgParams}`, {
        headers: { "User-Agent": UA },
      });
      if (!imgRes.ok) return null;

      const imgData = (await imgRes.json()) as {
        query?: {
          pages?: Record<string, { imageinfo?: Array<{ thumburl?: string; url?: string }> }>;
        };
      };
      const imgPages = imgData?.query?.pages;
      if (!imgPages) return null;

      // `pages` is keyed by page ID, so iterating it directly would visit files
      // in page-ID order. Walk the search hits instead to keep relevance order.
      for (const hit of hits) {
        const info = imgPages[String(hit.pageid)]?.imageinfo?.[0];
        if (info?.thumburl) return info.thumburl;
        if (info?.url) return info.url;
      }
      return null;
    } catch {
      await delay(2000);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Stage 2: look for an image on Wikimedia Commons.
 *
 * Searches by scientific name plus plant name when the scientific name is set
 * and contains neither "spp." nor "/"; otherwise by disease name, plant name
 * and the word "disease".
 *
 * @returns Image URL, or null when nothing was found.
 */
async function commonsStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  const sci = d.scientificName;
  const sciIsUsable = Boolean(sci) && !sci.includes("spp.") && !sci.includes("/");
  const query = sciIsUsable ? `${sci} ${plantName}` : `${d.name} ${plantName} disease`;

  const found = await commonsSearchAndThumb(query);
  return found ?? null;
}
|
||||
|
||||
// ─── Brave Stage ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Stage 3: fall back to the Brave Image Search API.
 *
 * Does nothing when no API key is configured or the per-run call budget
 * (MAX_BRAVE) is used up. A 429 is retried with exponential backoff and thrown
 * errors are retried after 2 s, up to three attempts. `braveCount` is
 * incremented once per OK response.
 *
 * @returns URL of the first result that is not from a known stock-photo site,
 *   else the first result's URL, else null.
 */
async function braveStage(d: DiseaseSeed, plantName: string): Promise<string | null> {
  if (!BRAVE_KEY || braveCount >= MAX_BRAVE) return null;

  // Substrings that identify stock-photo sites we would rather not link to.
  const stockMarkers = ["dreamstime", "shutterstock", "alamy", "istock", "123rf"];
  const looksLikeStock = (value: string | undefined): boolean =>
    value !== undefined && stockMarkers.some((marker) => value.includes(marker));

  const url = new URL("https://api.search.brave.com/res/v1/images/search");
  url.searchParams.set("q", `${d.name} ${plantName} plant disease symptom`);
  url.searchParams.set("count", "5");

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetchWithTimeout(url.toString(), {
        headers: { "X-Subscription-Token": BRAVE_KEY, Accept: "application/json" },
      });
      if (res.status === 429) {
        await delay(5000 * 2 ** attempt);
        continue;
      }
      if (!res.ok) return null;
      braveCount++;

      const data = (await res.json()) as {
        results?: Array<{ url: string; thumbnail?: { src?: string } }>;
      };
      const results = data?.results ?? [];
      if (results.length === 0) return null;

      // Prefer non-stock thumbnails. The thumbnail may be served from a
      // different host than the page the image came from, so the result's own
      // URL is checked as well as the thumbnail URL.
      for (const r of results) {
        const src = r.thumbnail?.src ?? r.url;
        if (src && !looksLikeStock(src) && !looksLikeStock(r.url)) {
          return src;
        }
      }
      // Every result looked like stock imagery: settle for the first one.
      return results[0].thumbnail?.src ?? results[0].url;
    } catch {
      await delay(2000);
    }
  }
  return null;
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Promise-based sleep.
 *
 * @param ms Milliseconds to wait.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Read and parse the diseases.json seed file.
 *
 * The parsed JSON is cast to DiseaseSeed[] without validation.
 * Throws if the file cannot be read or is not valid JSON.
 */
function loadDiseases(): DiseaseSeed[] {
  const rawJson = readFileSync(DISEASES_JSON, "utf-8");
  return JSON.parse(rawJson) as DiseaseSeed[];
}
|
||||
|
||||
/**
 * Resolve a human-readable plant name for use in search queries.
 *
 * The list is searched for an entry whose `id` equals the given identifier and
 * its `commonName` (or `name`) is returned. When there is no such entry the
 * identifier itself is turned into a readable name: dashes become spaces and
 * each word is capitalised ("sweet-potato" → "Sweet Potato").
 *
 * NOTE(review): callers in this file pass a plant ID together with the disease
 * list, so the lookup is not expected to match and the fallback is the normal
 * path. Previously the fallback returned the raw slug, which put dashed,
 * lower-case IDs straight into the search queries.
 *
 * @param diseases Entries to search by `id`.
 * @param diseaseId Identifier to look up (in practice a plant ID).
 * @returns A display name suitable for a search query.
 */
function getPlantName(diseases: DiseaseSeed[], diseaseId: string): string {
  const plant = diseases.find((p) => p.id === diseaseId);
  const known = plant?.commonName ?? plant?.name;
  if (known !== undefined) return known;

  return diseaseId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
}
|
||||
|
||||
// ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Runs the image pipeline end to end.
 *
 * 1. Loads the seed diseases and any results saved by a previous run.
 * 2. For every disease that has neither an imageUrl nor a saved result, tries
 *    Wikipedia, then Wikimedia Commons, then Brave (when a key is configured).
 * 3. Applies the collected URLs to diseases.json and the DB via applyResults().
 * 4. Writes a Markdown report listing Brave fallbacks and misses, and prints a summary.
 *
 * Results are written to RESULTS_FILE periodically during stages 1 and 2 and
 * after each stage, so an interrupted run can resume.
 */
async function main() {
  console.log("\n🔍 Plant Disease Image Filler\n");

  const diseases = loadDiseases();
  console.log(`📋 ${diseases.length} diseases loaded\n`);

  // Load existing results
  let results: Record<string, ImageResult> = {};
  if (existsSync(RESULTS_FILE)) {
    try { results = JSON.parse(readFileSync(RESULTS_FILE, "utf-8")); } catch { /* fresh */ }
  }

  // Pending = no image in the seed file and no saved result (hit or miss).
  const pending = diseases.filter((d) => {
    if ((d.imageUrl as string)?.length) return false;
    return !results[d.id];
  });

  if (pending.length === 0) {
    console.log("✅ All done\n");
    await applyResults(diseases, results);
    return;
  }

  console.log(`⏳ ${pending.length} need images\n`);

  // ── Stage 1: Wikipedia ──────────────────────────────────────────────
  const s1 = pending.filter((d) => !results[d.id]);
  let s1ok = 0;
  console.log("─── Wikipedia ───\n");

  for (let i = 0; i < s1.length; i++) {
    const d = s1[i];
    const plantName = getPlantName(diseases, d.plantId);
    const url = await wikiStage(d, plantName);
    if (url) {
      results[d.id] = { url, source: "wikipedia", quality: "good" };
      s1ok++;
    }
    const pct = ((i + 1) / s1.length * 100).toFixed(0);
    process.stdout.write(` [${pct}% ${i + 1}/${s1.length}] ${d.name.substring(0, 40).padEnd(42)} ${url ? "✅" : "⏭️"}\n`);
    // Checkpoint every 25 diseases.
    if ((i + 1) % 25 === 0) writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
  }

  writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
  console.log(`\n → ${s1ok}/${s1.length} found\n`);

  // ── Stage 2: Commons ─────────────────────────────────────────────────
  const s2 = pending.filter((d) => !results[d.id]);
  let s2ok = 0;

  if (s2.length > 0) {
    console.log("─── Wikimedia Commons ───\n");
    for (let i = 0; i < s2.length; i++) {
      const d = s2[i];
      const plantName = getPlantName(diseases, d.plantId);
      let url: string | null = null;
      // Hard 25 s cap per disease on top of the per-request timeouts. The
      // timer is cleared once the race settles so that finished lookups do
      // not leave live timers behind.
      let capTimer: ReturnType<typeof setTimeout> | undefined;
      try {
        const result = await Promise.race([
          commonsStage(d, plantName),
          new Promise<null>((_, reject) => {
            capTimer = setTimeout(() => reject(new Error("timeout")), 25000);
          }),
        ]);
        url = result;
      } catch {
        /* timeout */
      } finally {
        clearTimeout(capTimer);
      }
      if (url) {
        results[d.id] = { url, source: "commons", quality: "good" };
        s2ok++;
      }
      const pct = ((i + 1) / s2.length * 100).toFixed(0);
      process.stdout.write(` [${pct}% ${i + 1}/${s2.length}] ${d.name.substring(0, 40).padEnd(42)} ${url ? "✅" : "⏭️"}\n`);

      // Checkpoint every 10 diseases.
      if ((i + 1) % 10 === 0) writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
    }
    writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
    console.log(`\n → ${s2ok}/${s2.length} found\n`);
  }

  // ── Stage 3: Brave ───────────────────────────────────────────────────
  const s3 = pending.filter((d) => !results[d.id]);
  let s3ok = 0;

  if (s3.length > 0 && BRAVE_KEY) {
    console.log("─── Brave Image Search ───\n");
    for (const d of s3) {
      // Budget exhausted: record the rest as missing without calling the API.
      if (braveCount >= MAX_BRAVE) {
        results[d.id] = { url: "", source: "missing", quality: "missing" };
        continue;
      }
      const plantName = getPlantName(diseases, d.plantId);
      const url = await braveStage(d, plantName);
      if (url) {
        results[d.id] = { url, source: "brave", quality: "fallback" };
        s3ok++;
        process.stdout.write(` ✅ ${d.name}\n`);
      } else {
        results[d.id] = { url: "", source: "missing", quality: "missing" };
        process.stdout.write(` ❌ ${d.name}\n`);
      }
      await delay(BRAVE_DELAY);
    }
    writeFileSync(RESULTS_FILE, JSON.stringify(results, null, 2));
    console.log(`\n → ${s3ok}/${s3.length} found via Brave\n`);
  } else if (s3.length > 0) {
    console.log("─── Brave Image Search ─── → skipped (no key)\n");
    // Marked missing in memory only; the results file is not rewritten here.
    for (const d of s3) results[d.id] = { url: "", source: "missing", quality: "missing" };
  }

  // ── Apply ───────────────────────────────────────────────────────────
  await applyResults(diseases, results);

  // ── Report ──────────────────────────────────────────────────────────
  const good = Object.values(results).filter((r) => r.quality === "good").length;
  const fallback = Object.values(results).filter((r) => r.quality === "fallback").length;
  const missing = Object.values(results).filter((r) => r.quality === "missing").length;

  let report = `# Disease Images — Human Review Needed\n\n`;
  report += `Generated: ${new Date().toISOString()}\n\n`;

  // One report section per category that needs a human look.
  for (const [label, ids, type] of [
    ["Fallback (Brave)", Object.entries(results).filter(([, r]) => r.quality === "fallback").map(([id]) => id), "fallback"],
    ["Missing", Object.entries(results).filter(([, r]) => r.quality === "missing").map(([id]) => id), "missing"],
  ] as const) {
    if (ids.length === 0) continue;
    report += `## ${type === "fallback" ? "⚠️" : "🚫"} ${label}\n\n`;
    for (const id of ids) {
      const d = diseases.find((x) => x.id === id);
      const r = results[id];
      report += `- **${d?.name ?? id}** (${d?.scientificName ?? ""}) on \`${d?.plantId ?? ""}\``;
      if (r?.url) report += `\n ${r.url}`;
      report += `\n\n`;
    }
  }

  if (good === diseases.length) report += `## ✅ All images found!\n`;
  writeFileSync(REPORT_FILE, report, "utf-8");
  console.log(`📝 Review report: ${REPORT_FILE}`);

  console.log(`\n${"═".repeat(50)}`);
  console.log(`📊 Total: ${diseases.length} Good: ${good} Fallback: ${fallback} Missing: ${missing}`);
  console.log(` Brave calls: ${braveCount}`);
  console.log(`${"═".repeat(50)}\n`);

  closeDb();
}
|
||||
|
||||
// ─── Apply results to JSON + DB ──────────────────────────────────────────────
|
||||
|
||||
/**
 * Write the found image URLs to diseases.json and to the database.
 *
 * Only results with a non-empty URL whose ID exists in `diseases` are applied.
 * The JSON file is rewritten with `imageUrl` and `imageQuality` set on matching
 * entries. The DB step is skipped when DATABASE_URL or DATABASE_TOKEN is unset;
 * DB errors are logged rather than thrown, and the client is always closed.
 *
 * @param diseases Seed entries as loaded from diseases.json.
 * @param results Lookup results keyed by disease ID.
 */
async function applyResults(diseases: DiseaseSeed[], results: Record<string, ImageResult>) {
  // Set lookup avoids rescanning the disease list for every result.
  const knownIds = new Set(diseases.map((d) => d.id));
  const urlMap = new Map(
    Object.entries(results).filter(([id, r]) => r.url.length > 0 && knownIds.has(id)),
  );
  if (urlMap.size === 0) return console.log("⏭️ No images to apply");

  // JSON
  let n = 0;
  const updated = diseases.map((d) => {
    const img = urlMap.get(d.id);
    if (img) { n++; return { ...d, imageUrl: img.url, imageQuality: img.quality }; }
    return d;
  });
  writeFileSync(DISEASES_JSON, JSON.stringify(updated, null, 2) + "\n");
  console.log(`✅ diseases.json: ${n} images`);

  // DB
  let raw: ReturnType<typeof createClient> | undefined;
  try {
    const dbUrl = process.env.DATABASE_URL;
    const dbToken = process.env.DATABASE_TOKEN;
    if (!dbUrl || !dbToken) return console.log(" ⏭️ DB: no DATABASE_URL/TOKEN");
    raw = createClient({ url: dbUrl, authToken: dbToken });
    const entries = Array.from(urlMap.entries());
    // Send the updates in write batches of 50 statements.
    for (let i = 0; i < entries.length; i += 50) {
      await raw.batch(
        entries.slice(i, i + 50).map(([id, img]) => ({
          sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
          args: [img.url, id],
        })),
        "write",
      );
    }
    console.log(`✅ Turso DB: ${entries.length} rows`);
  } catch (err) {
    console.log(` ⚠️ DB: ${err instanceof Error ? err.message : err}`);
  } finally {
    // Release the connection even when a batch failed part-way through.
    raw?.close();
  }
}
|
||||
|
||||
// Script entry point: log any rejection from main() and exit with status 1.
main().catch((error) => {
  console.error("\n❌", error);
  process.exit(1);
});
|
||||
301
scripts/fill-plant-images-v2.ts
Normal file
301
scripts/fill-plant-images-v2.ts
Normal file
@@ -0,0 +1,301 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-plant-images-v2.ts — Batch Wikipedia image fetch for remaining plants.
|
||||
*
|
||||
* Phase 1: Query 50 scientific names at a time via pageimages.
|
||||
* Phase 2: Query 50 common names at a time.
|
||||
* Phase 3: Search individually for stragglers.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-plant-images-v2.ts
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Load KEY=VALUE pairs from .env.development into process.env.
// Blank lines, lines starting with "#", and lines without a key before "="
// are ignored. Variables that already hold a non-empty value are not overwritten.
const envPath = resolve(__dirname, "../.env.development");
try {
  const envLines = readFileSync(envPath, "utf-8").split("\n");
  for (const rawLine of envLines) {
    const entry = rawLine.trim();
    if (!entry || entry.startsWith("#")) continue;

    const sep = entry.indexOf("=");
    if (sep <= 0) continue;

    const name = entry.slice(0, sep).trim();
    if (process.env[name]) continue;
    process.env[name] = entry.slice(sep + 1).trim();
  }
} catch {
  // Best effort: an unreadable env file is ignored.
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { plants } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// Wikipedia Action API endpoint used for every lookup in this script.
const API = "https://en.wikipedia.org/w/api.php";
// Sent as the User-Agent header on API requests.
const UA = "PlantHealthKB/1.0";
// Number of plants (and therefore titles) handled per API request.
const BATCH = 50;
|
||||
|
||||
/** The columns this script selects for each plant that still needs an image. */
interface PlantRow {
  /** Primary key, used in the `WHERE id = ?` update. */
  id: string;
  /** Used as a page title in phase 2 and in the phase 3 search query. */
  commonName: string;
  /** Cleaned with clean() and used as a page title in phase 1. */
  scientificName: string;
}
|
||||
|
||||
/**
 * Normalise a scientific name before using it as a Wikipedia title or query.
 *
 * In order: every "x"/"X" becomes a lower-case "x"; each "spp" / "spp." is
 * removed together with the whitespace around it; dots, multiplication signs
 * (U+00D7) and apostrophes are removed; the result is trimmed.
 */
function clean(s: string): string {
  const anyX = /[xX]/g;
  const sppMarker = /\s*spp\.?\s*/gi;
  const punctuation = /[.\u00d7']/g;

  const lowerX = s.replace(anyX, "x");
  const withoutSpp = lowerX.replace(sppMarker, "");
  const stripped = withoutSpp.replace(punctuation, "");
  return stripped.trim();
}
|
||||
|
||||
/**
 * Fetch page thumbnails for a batch of Wikipedia titles in one request.
 *
 * A 429 is retried with exponential backoff and thrown errors are retried
 * after 2 s, up to three attempts. Any other failure yields an empty map.
 *
 * The request follows redirects, so the API reports pages under their
 * canonical titles. To let callers look results up by the title they asked
 * for, the returned map also contains an entry for each requested title that
 * the API normalised or redirected to a page with a thumbnail.
 *
 * @param titles Page titles to look up.
 * @returns Map from lower-cased title (canonical page titles plus the
 *   requested titles that resolved to them) to thumbnail URL.
 */
async function fetchThumbs(titles: string[]): Promise<Map<string, string>> {
  if (titles.length === 0) {
    return new Map();
  }
  const p = new URLSearchParams({
    action: "query",
    titles: titles.join("|"),
    prop: "pageimages",
    pithumbsize: "400",
    redirects: "1",
    format: "json",
  });
  for (let a = 0; a < 3; a++) {
    try {
      const r = await fetch(API + "?" + p.toString(), {
        headers: { "User-Agent": UA },
      });
      if (r.status === 429) {
        await new Promise((rr) => setTimeout(rr, 5000 * Math.pow(2, a)));
        continue;
      }
      if (!r.ok) {
        return new Map();
      }
      const d = (await r.json()) as {
        query?: {
          normalized?: Array<{ from: string; to: string }>;
          redirects?: Array<{ from: string; to: string }>;
          pages?: Record<
            string,
            { title: string; missing?: string | boolean; thumbnail?: { source?: string } }
          >;
        };
      };
      const pages = d?.query?.pages;
      if (!pages) {
        return new Map();
      }

      // Thumbnails keyed by the (lower-cased) title the API reports for the page.
      const byPageTitle = new Map<string, string>();
      for (const pg of Object.values(pages)) {
        if (!pg.missing && pg.thumbnail?.source) {
          byPageTitle.set(pg.title.toLowerCase(), pg.thumbnail.source);
        }
      }

      // Requested title → normalised title → redirect target(s).
      const normalized = new Map(
        (d.query?.normalized ?? []).map((n) => [n.from, n.to] as const),
      );
      const redirects = new Map(
        (d.query?.redirects ?? []).map((x) => [x.from, x.to] as const),
      );

      const m = new Map(byPageTitle);
      for (const requested of titles) {
        const key = requested.toLowerCase();
        if (m.has(key)) continue;

        let target = normalized.get(requested) ?? requested;
        // Follow redirects, bounded so a malformed chain cannot loop forever.
        for (let hop = 0; hop < 5; hop++) {
          const next = redirects.get(target);
          if (next === undefined) break;
          target = next;
        }
        const thumb = byPageTitle.get(target.toLowerCase());
        if (thumb) m.set(key, thumb);
      }
      return m;
    } catch (e) {
      await new Promise((rr) => setTimeout(rr, 2000));
    }
  }
  return new Map();
}
|
||||
|
||||
/**
 * Full-text search Wikipedia and return the thumbnail of the best-ranked hit
 * that has one.
 *
 * Uses generator=search with prop=pageimages (up to three hits). A 429 is
 * retried with exponential backoff and thrown errors are retried after 2 s,
 * up to three attempts. Any other failure yields null.
 *
 * @param query Search string sent as `gsrsearch`.
 * @returns Thumbnail URL, or null when nothing was found.
 */
async function searchOne(query: string): Promise<string | null> {
  const p = new URLSearchParams({
    action: "query",
    generator: "search",
    gsrsearch: query,
    gsrlimit: "3",
    prop: "pageimages",
    pithumbsize: "400",
    format: "json",
  });
  for (let a = 0; a < 3; a++) {
    try {
      const r = await fetch(API + "?" + p.toString(), {
        headers: { "User-Agent": UA },
      });
      if (r.status === 429) {
        await new Promise((rr) => setTimeout(rr, 5000 * Math.pow(2, a)));
        continue;
      }
      if (!r.ok) {
        return null;
      }
      const d = (await r.json()) as {
        query?: {
          pages?: Record<string, { index?: number; thumbnail?: { source?: string } }>;
        };
      };
      const pages = d?.query?.pages;
      if (!pages) {
        return null;
      }
      // `pages` is keyed by page ID, so plain iteration ignores the search
      // ranking. The search generator reports each hit's rank in `index`;
      // sort by it so the most relevant hit with a thumbnail is returned.
      const ranked = Object.values(pages).sort(
        (x, y) =>
          (x?.index ?? Number.MAX_SAFE_INTEGER) - (y?.index ?? Number.MAX_SAFE_INTEGER),
      );
      for (const pg of ranked) {
        if (pg.thumbnail?.source) {
          return pg.thumbnail.source;
        }
      }
      return null;
    } catch (e) {
      await new Promise((rr) => setTimeout(rr, 2000));
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Run one batched lookup phase over a list of plants.
 *
 * Plants are processed BATCH at a time: a title is derived for each plant,
 * thumbnails are fetched for the whole chunk, and matches are queued as DB
 * updates. Queued updates are written once at least 100 have accumulated and
 * again at the end. The function waits 1.5 s after every chunk.
 *
 * @param plants Plants still lacking an image.
 * @param titleFn Derives the Wikipedia title to try for a plant.
 * @param label Short phase name shown in the progress output.
 * @param dbClient Client whose `batch(statements, "write")` method performs the updates.
 * @returns The plants for which no thumbnail was found.
 */
async function batchPhase(
  plants: PlantRow[],
  titleFn: (p: PlantRow) => string,
  label: string,
  dbClient: any,
): Promise<PlantRow[]> {
  const remaining: PlantRow[] = [];
  const updates: Array<{ id: string; url: string }> = [];

  // Writes every queued update in one batch, then empties the queue.
  const flush = async (): Promise<void> => {
    const statements = updates.map((u) => ({
      sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
      args: [u.url, u.id],
    }));
    await dbClient.batch(statements, "write");
    updates.length = 0;
  };

  for (let start = 0; start < plants.length; start += BATCH) {
    const chunk = plants.slice(start, start + BATCH);
    // Titles of two characters or fewer are not sent to the API.
    const titles = chunk.map(titleFn).filter((t) => t.length > 2);
    const lastInChunk = Math.min(start + BATCH, plants.length);
    console.log(` [${label}] ${start + 1}-${lastInChunk}/${plants.length} `);

    const imageMap = await fetchThumbs(titles);
    let hits = 0;
    for (const row of chunk) {
      const thumb = imageMap.get(titleFn(row).toLowerCase());
      if (!thumb) {
        remaining.push(row);
        continue;
      }
      updates.push({ id: row.id, url: thumb });
      hits += 1;
    }
    console.log(` found: ${hits}`);

    if (updates.length >= 100) {
      await flush();
    }
    await new Promise((r) => setTimeout(r, 1500));
  }

  if (updates.length > 0) {
    await flush();
  }

  return remaining;
}
|
||||
|
||||
/**
 * Fill in missing plant images in three phases:
 * batched title lookups by cleaned scientific name, batched title lookups by
 * common name, then one search per remaining plant. Finally prints a summary
 * and, when plants are still missing images, writes a Markdown report next to
 * the script.
 */
async function main() {
  console.log("\nPlant Image Filler v2\n");
  const db = getDb();
  const allPlants = (await db
    .select({
      id: plants.id,
      commonName: plants.commonName,
      scientificName: plants.scientificName,
    })
    .from(plants)
    .where(sql`(image_url IS NULL OR image_url = '')`)
    .all()) as PlantRow[];

  console.log(`Plants needing images: ${allPlants.length}\n`);
  if (allPlants.length === 0) {
    console.log("All plants have images!\n");
    closeDb();
    return;
  }

  // Separate raw client used for the UPDATE statements.
  const raw = createClient({
    url: process.env.DATABASE_URL!,
    authToken: process.env.DATABASE_TOKEN!,
  });
  let found = 0;

  // Phase 1: Scientific name
  console.log("--- Phase 1: Scientific names ---\n");
  let remaining = await batchPhase(allPlants, (p) => clean(p.scientificName), "sci", raw);

  // Phase 2: Common name
  if (remaining.length > 0) {
    console.log(`\n--- Phase 2: Common names (${remaining.length}) ---\n`);
    remaining = await batchPhase(remaining, (p) => p.commonName, "common", raw);
  }

  // Phase 3: Search
  if (remaining.length > 0) {
    console.log(`\n--- Phase 3: Search (${remaining.length}) ---\n`);
    for (const [idx, pl] of remaining.entries()) {
      const q = `${clean(pl.scientificName)} ${pl.commonName}`;
      console.log(` [${idx + 1}/${remaining.length}] ${pl.commonName}`);
      const img = await searchOne(q);
      if (img) {
        await raw.execute({
          sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
          args: [img, pl.id],
        });
        found++;
        console.log(" OK");
      } else {
        console.log(" MISS");
      }
      // Pause 500 ms between searches.
      await new Promise((r) => setTimeout(r, 500));
    }
  }

  raw.close();

  // Report: re-read every plant to count what the DB now holds.
  const finalList = await db
    .select({
      id: plants.id,
      commonName: plants.commonName,
      imageUrl: plants.imageUrl,
    })
    .from(plants)
    .all();
  const w = finalList.filter((p) => p.imageUrl);
  const wo = finalList.filter((p) => !p.imageUrl);

  console.log(`\n${"=".repeat(50)}`);
  console.log(`FINAL: ${finalList.length} plants`);
  console.log(` With images: ${w.length}`);
  console.log(` Missing: ${wo.length}`);

  if (wo.length === 0) {
    console.log("\nALL PLANTS HAVE IMAGES!");
  } else {
    const rp = resolve(__dirname, ".plant-image-review-needed.md");
    const report = [
      "# Plant Images - Still Missing\n\n",
      `Generated: ${new Date().toISOString()}\n\n`,
      `## Missing (${wo.length})\n\n`,
      ...wo.map((p) => `- ${p.commonName} (${p.id})\n`),
    ].join("");
    writeFileSync(rp, report, "utf-8");
    console.log(`Report: ${rp}`);
  }

  closeDb();
}
|
||||
|
||||
// Script entry point: report any rejection from main() and exit with status 1.
main().catch((failure: any) => {
  console.error("Error:", failure);
  process.exit(1);
});
|
||||
308
scripts/fill-plant-images.ts
Normal file
308
scripts/fill-plant-images.ts
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-plant-images.ts — Fetch plant images from Wikipedia for plants missing them.
|
||||
*
|
||||
* Uses the Wikipedia API to search for the plant's scientific name
|
||||
* and grab the page thumbnail.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-plant-images.ts
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Load KEY=VALUE pairs from .env.development into process.env.
// Blank lines, "#" comment lines, and lines with no key before "=" are skipped;
// variables that already hold a non-empty value are left alone.
const envPath = resolve(__dirname, "../.env.development");
try {
  const envLines = readFileSync(envPath, "utf-8").split("\n");
  for (const rawLine of envLines) {
    const entry = rawLine.trim();
    if (!entry || entry.startsWith("#")) continue;

    const sep = entry.indexOf("=");
    if (sep <= 0) continue;

    const name = entry.slice(0, sep).trim();
    const value = entry.slice(sep + 1).trim();
    if (!process.env[name]) process.env[name] = value;
  }
} catch {
  // Best effort: an unreadable env file is ignored.
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { plants } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// Wikipedia Action API endpoint.
const WIKI_API = "https://en.wikipedia.org/w/api.php";
// Sent as the User-Agent header on API requests.
const UA = "PlantHealthKB/1.0 (plant-images)";
// Pause between plants, in milliseconds.
const DELAY_MS = 500;
// Number of queued updates that triggers a DB flush.
const BATCH_SIZE = 50;
|
||||
|
||||
/**
 * Direct page lookup by title — more reliable for known scientific names.
 *
 * Redirects are followed, so a title that is only a redirect page (for
 * example a scientific name pointing at the common-name article) yields the
 * target article's thumbnail instead of a miss.
 *
 * A 429 is retried with exponential backoff and thrown errors are retried
 * after 2 s, up to three attempts. Any other failure yields null.
 *
 * @param title Wikipedia page title to look up.
 * @returns Thumbnail URL of the page, or null when there is none.
 */
async function directPageLookup(title: string): Promise<string | null> {
  const params = new URLSearchParams({
    action: "query",
    titles: title,
    prop: "pageimages",
    pithumbsize: "400",
    redirects: "1",
    format: "json",
    origin: "*",
  });

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const res = await fetch(`${WIKI_API}?${params}`, {
        headers: { "User-Agent": UA },
      });
      if (res.status === 429) {
        // Rate limited: wait 3 s, 6 s, 12 s before the next attempt.
        await new Promise((r) => setTimeout(r, 3000 * 2 ** attempt));
        continue;
      }
      if (!res.ok) return null;

      const data = (await res.json()) as {
        query?: { pages?: Record<string, { thumbnail?: { source: string }; missing?: boolean }> };
      };
      const pages = data?.query?.pages;
      if (!pages) return null;

      for (const page of Object.values(pages)) {
        if (!page.missing && page.thumbnail?.source) return page.thumbnail.source;
      }
      return null;
    } catch {
      await new Promise((r) => setTimeout(r, 2000));
    }
  }
  return null;
}
|
||||
|
||||
async function main() {
|
||||
console.log("\n🌿 Fetching plant images from Wikipedia\n");
|
||||
|
||||
const db = getDb();
|
||||
const allPlants = await db
|
||||
.select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
|
||||
.from(plants)
|
||||
.where(sql`(image_url IS NULL OR image_url = '')`)
|
||||
.all();
|
||||
|
||||
console.log(`📋 ${allPlants.length} plants need images\n`);
|
||||
|
||||
if (allPlants.length === 0) {
|
||||
console.log("✅ All plants already have images!\n");
|
||||
closeDb();
|
||||
return;
|
||||
}
|
||||
|
||||
const rawClient = createClient({
|
||||
url: process.env.DATABASE_URL!,
|
||||
authToken: process.env.DATABASE_TOKEN!,
|
||||
});
|
||||
|
||||
let found = 0;
|
||||
const updates: { id: string; url: string }[] = [];
|
||||
|
||||
// Phase 1: Try direct page lookup by scientific name (most accurate)
|
||||
console.log("─── Phase 1: Direct page lookup ───\n");
|
||||
|
||||
for (let i = 0; i < allPlants.length; i++) {
|
||||
const plant = allPlants[i];
|
||||
const sciName = plant.scientificName
|
||||
.replace(/[×'"]/g, "")
|
||||
.replace(/\s*spp\.?\s*/i, "")
|
||||
.trim();
|
||||
|
||||
process.stdout.write(
|
||||
` [${String(i + 1).padStart(3)}/${allPlants.length}] ${plant.commonName.padEnd(30)} `,
|
||||
);
|
||||
|
||||
let url: string | null = null;
|
||||
|
||||
// Try scientific name first
|
||||
if (sciName && sciName !== "Unknown" && sciName !== "Various") {
|
||||
url = await directPageLookup(sciName);
|
||||
}
|
||||
|
||||
// Try common name if scientific name didn't work
|
||||
if (!url) {
|
||||
url = await directPageLookup(plant.commonName);
|
||||
}
|
||||
|
||||
// Try genus name
|
||||
if (!url && sciName) {
|
||||
const genus = sciName.split(/\s+/)[0];
|
||||
if (genus && genus.length > 3) {
|
||||
url = await directPageLookup(genus);
|
||||
}
|
||||
}
|
||||
|
||||
if (url) {
|
||||
updates.push({ id: plant.id, url });
|
||||
found++;
|
||||
process.stdout.write("✅\n");
|
||||
} else {
|
||||
process.stdout.write("⏭️\n");
|
||||
}
|
||||
|
||||
// Flush to DB in batches
|
||||
if (updates.length >= BATCH_SIZE) {
|
||||
await rawClient.batch(
|
||||
updates.map((u) => ({
|
||||
sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
|
||||
args: [u.url, u.id],
|
||||
})),
|
||||
"write",
|
||||
);
|
||||
console.log(` → Flushed ${updates.length} to DB`);
|
||||
updates.length = 0;
|
||||
}
|
||||
|
||||
await new Promise((r) => setTimeout(r, DELAY_MS));
|
||||
}
|
||||
|
||||
// Flush remaining
|
||||
if (updates.length > 0) {
|
||||
await rawClient.batch(
|
||||
updates.map((u) => ({
|
||||
sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
|
||||
args: [u.url, u.id],
|
||||
})),
|
||||
"write",
|
||||
);
|
||||
console.log(` → Flushed ${updates.length} to DB`);
|
||||
updates.length = 0;
|
||||
}
|
||||
|
||||
console.log(`\n✅ Phase 1 done: ${found}/${allPlants.length} plants got images\n`);
|
||||
|
||||
// Phase 2: Try remaining via search API
|
||||
const stillMissing = await db
|
||||
.select({ id: plants.id, commonName: plants.commonName, scientificName: plants.scientificName })
|
||||
.from(plants)
|
||||
.where(sql`(image_url IS NULL OR image_url = '')`)
|
||||
.all();
|
||||
|
||||
if (stillMissing.length > 0) {
|
||||
console.log(`─── Phase 2: Search API for ${stillMissing.length} remaining ───\n`);
|
||||
|
||||
for (let i = 0; i < stillMissing.length; i++) {
|
||||
const plant = stillMissing[i];
|
||||
const sciName = plant.scientificName.replace(/[×'"]/g, "").trim();
|
||||
|
||||
process.stdout.write(
|
||||
` [${String(i + 1).padStart(3)}/${stillMissing.length}] ${plant.commonName.padEnd(30)} `,
|
||||
);
|
||||
|
||||
// Search with scientific name
|
||||
const searchTerm = `${sciName} ${plant.commonName}`;
|
||||
const params = new URLSearchParams({
|
||||
action: "query",
|
||||
list: "search",
|
||||
srsearch: searchTerm,
|
||||
srlimit: "3",
|
||||
format: "json",
|
||||
origin: "*",
|
||||
});
|
||||
|
||||
let url: string | null = null;
|
||||
for (let attempt = 0; attempt < 3; attempt++) {
|
||||
try {
|
||||
const res = await fetch(`${WIKI_API}?${params}`, {
|
||||
headers: { "User-Agent": UA },
|
||||
});
|
||||
if (res.status === 429) {
|
||||
await new Promise((r) => setTimeout(r, 3000 * 2 ** attempt));
|
||||
continue;
|
||||
}
|
||||
if (!res.ok) break;
|
||||
const data = (await res.json()) as {
|
||||
query?: { search?: Array<{ title: string; pageid: number }> };
|
||||
};
|
||||
const hits = data?.query?.search ?? [];
|
||||
if (hits.length === 0) break;
|
||||
|
||||
// Get thumbnail for first result
|
||||
for (const hit of hits) {
|
||||
const pageParams = new URLSearchParams({
|
||||
action: "query",
|
||||
pageids: String(hit.pageid),
|
||||
prop: "pageimages",
|
||||
pithumbsize: "400",
|
||||
format: "json",
|
||||
origin: "*",
|
||||
});
|
||||
const pageRes = await fetch(`${WIKI_API}?${pageParams}`, {
|
||||
headers: { "User-Agent": UA },
|
||||
});
|
||||
if (!pageRes.ok) continue;
|
||||
const pageData = (await pageRes.json()) as {
|
||||
query?: { pages?: Record<string, { thumbnail?: { source: string } }> };
|
||||
};
|
||||
const pages = pageData?.query?.pages;
|
||||
if (!pages) continue;
|
||||
for (const [, p] of Object.entries(pages)) {
|
||||
if (p.thumbnail?.source) {
|
||||
url = p.thumbnail.source;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (url) break;
|
||||
}
|
||||
break;
|
||||
} catch {
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
}
|
||||
}
|
||||
|
||||
if (url) {
|
||||
await rawClient.execute({
|
||||
sql: "UPDATE plants SET image_url = ?, updated_at = datetime('now') WHERE id = ?",
|
||||
args: [url, plant.id],
|
||||
});
|
||||
found++;
|
||||
process.stdout.write("✅\n");
|
||||
} else {
|
||||
process.stdout.write("❌\n");
|
||||
}
|
||||
|
||||
await new Promise((r) => setTimeout(r, DELAY_MS));
|
||||
}
|
||||
}
|
||||
|
||||
// Final count
|
||||
const final = await db
|
||||
.select({ id: plants.id, commonName: plants.commonName, imageUrl: plants.imageUrl })
|
||||
.from(plants)
|
||||
.all();
|
||||
const withImg = final.filter((p) => p.imageUrl);
|
||||
const withoutImg = final.filter((p) => !p.imageUrl);
|
||||
|
||||
console.log(`\n${"═".repeat(50)}`);
|
||||
console.log(`📊 FINAL: ${final.length} plants`);
|
||||
console.log(` With images: ${withImg.length}`);
|
||||
console.log(` Missing images: ${withoutImg.length}`);
|
||||
|
||||
if (withoutImg.length > 0) {
|
||||
console.log(`\n📝 Plants still needing images:`);
|
||||
withoutImg.forEach((p) => console.log(` ❌ ${p.id}: ${p.commonName}`));
|
||||
// Save to file for reference
|
||||
const reportPath = resolve(__dirname, ".plant-image-review-needed.md");
|
||||
let report = "# Plant Images — Still Missing\n\n";
|
||||
report += `Generated: ${new Date().toISOString()}\n\n`;
|
||||
report += `## 🚫 Plants without images (${withoutImg.length})\n\n`;
|
||||
for (const p of withoutImg) {
|
||||
report += `- **${p.commonName}** (\`${p.id}\`)\n`;
|
||||
}
|
||||
writeFileSync(reportPath, report, "utf-8");
|
||||
console.log(` 📝 Review report: ${reportPath}`);
|
||||
} else {
|
||||
console.log("\n✅ All plants now have images!");
|
||||
}
|
||||
|
||||
rawClient.close();
|
||||
closeDb();
|
||||
}
|
||||
|
||||
// Script entry point: run main() and convert any rejection into a logged
// error plus a non-zero exit status.
main().catch((failure) => {
  console.error("\n❌", failure);
  process.exit(1);
});
|
||||
927
scripts/fill-training-dataset.ts
Normal file
927
scripts/fill-training-dataset.ts
Normal file
@@ -0,0 +1,927 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-training-dataset.ts
|
||||
*
|
||||
* Scans the existing dataset directory and downloads any missing images
|
||||
* to reach the target counts (200 per disease, 400 for healthy).
|
||||
*
|
||||
* Does NOT re-run prevalence queries — just fills gaps from image sources.
|
||||
* Each run scans the directory, reports deficits, then fills them.
|
||||
* Interrupt-safe: re-run to pick up where you left off.
|
||||
*
|
||||
* Parallelism strategy:
|
||||
* - Disease-level: up to DISEASE_CONCURRENCY (20) diseases processed concurrently
|
||||
* - Per disease: all 3 DDG queries run in parallel
|
||||
* - Per query: all search pages fetched in parallel
|
||||
* - Per disease: DDG, iNaturalist, and Wikimedia Commons all run concurrently
|
||||
* - A shared DDG token-bucket rate limiter prevents bans
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||||
import { resolve, extname } from "path";
|
||||
|
||||
// Load .env.development for DB creds.
// Only variables that are still unset/empty are filled in, so anything
// already present in process.env takes precedence over this file.
const envPath = resolve(__dirname, "../.env.development");

try {
  for (const rawLine of readFileSync(envPath, "utf-8").split("\n")) {
    const line = rawLine.trim(); // also drops a trailing "\r" from CRLF files
    if (!line || line.startsWith("#")) continue;

    const eqIdx = line.indexOf("=");
    if (eqIdx <= 0) continue; // empty key, or not a KEY=VALUE line

    const key = line.slice(0, eqIdx).trim();
    let val = line.slice(eqIdx + 1).trim();

    // Strip one pair of matching surrounding quotes (KEY="value" / KEY='value')
    // so the quotes do not become part of the URL or token.
    const quote = val[0];
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!process.env[key]) process.env[key] = val;
  }
} catch (err) {
  // A missing file is tolerated (best-effort loader); any other read error is
  // reported instead of being silently swallowed.
  if ((err as { code?: string } | null)?.code !== "ENOENT") {
    console.warn(
      `⚠ Could not load ${envPath}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}
|
||||
|
||||
import { getDb, closeDb } from "@/lib/db/index";
|
||||
import { diseases } from "@/lib/db/schema";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Root directory of the dataset; scanDataset treats each sub-directory as one class. */
const DATASET_DIR = resolve(__dirname, "..", "data", "dataset");

/** JSON file inside the dataset directory holding the per-disease seen-URLs cache. */
const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json");
|
||||
|
||||
/** Number of images every disease class should end up with. */
const TARGET_PER_DISEASE = 200;

/** Number of images the "healthy" class should end up with. */
const TARGET_HEALTHY = 400;

/**
 * Number of diseases worked on at the same time.
 * The work per disease is HTTP I/O, so a high value is acceptable; the shared
 * DDG rate limiter keeps the combined DuckDuckGo request rate bounded.
 */
const DISEASE_CONCURRENCY = 20;

/**
 * Upper bound on DuckDuckGo search requests per second, shared by all
 * concurrently processed diseases.
 * At peak each disease issues 3 queries × MAX_DDG_PAGES pages = 9 search
 * requests; with DISEASE_CONCURRENCY = 20 they all queue behind this limiter
 * instead of hitting DuckDuckGo at once.
 */
const DDG_RATE_LIMIT_RPS = 2;

/** How many images one download batch fetches at the same time. */
const CONCURRENT_DOWNLOADS = 2;

/** Smallest accepted download, in bytes (10KB). */
const MIN_IMAGE_SIZE = 10_000;

/** Largest accepted download, in bytes (10MB). */
const MAX_IMAGE_SIZE = 10 * 1024 ** 2;

/** File extensions accepted as-is when taken from an image URL. */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User-Agent header value sent with outgoing HTTP requests. */
const UA =
  "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1";

/** Directory name of the "healthy" class inside the dataset directory. */
const HEALTHY_CLASS = "healthy";

/** How often (in diseases processed) to flush the seen-URLs cache to disk. */
const SEEN_CACHE_FLUSH_INTERVAL = 20;

/**
 * Maximum number of DDG result pages requested per query.
 * Each page returns ~100 image results, so 3 pages × 3 queries ≈ 900 raw URLs
 * before de-duplication — more than enough to find 200 unique, valid images.
 */
const MAX_DDG_PAGES = 3;

/** Limit on the number of queries used for the healthy class. */
const MAX_HEALTHY_QUERIES = 20;
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One entry of the `results` array returned by the DuckDuckGo image endpoint.
 * This script only reads `image` and, as a fallback, `url`.
 */
interface DuckDuckGoImageResult {
  image: string;
  title: string;
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}

/** A disease class that is still short of its image target. */
interface DiseaseInfo {
  id: string;
  name: string;
  plantId: string;
  /** Images already on disk for this disease. */
  have: number;
  /** Images still missing to reach the target. */
  needed: number;
}

/** Outcome of asking one image source for candidate URLs. */
interface CollectResult {
  /** Newly discovered image URLs. */
  urls: string[];
  /** Flag set by the collector; successful collectors set it when fewer URLs than requested were found. */
  exhausted: boolean;
}
|
||||
|
||||
// ─── Token-Bucket Rate Limiter ──────────────────────────────────────────────
|
||||
|
||||
/**
 * Token-bucket rate limiter.
 *
 * The bucket starts full, never holds more than `capacity` tokens and gains
 * one token every `refillInterval` milliseconds. Each `acquire()` consumes one
 * token and waits while none is available.
 */
class TokenBucket {
  private tokens: number;
  private lastRefill: number;
  private readonly capacity: number;
  private readonly refillInterval: number; // ms per token (e.g., 100ms for 10 rps)

  /**
   * @param rps Sustained rate in tokens per second. Fractional rates are
   *   allowed (0.5 means one token every two seconds).
   * @throws RangeError if `rps` is not a positive, finite number.
   */
  constructor(rps: number) {
    if (!Number.isFinite(rps) || rps <= 0) {
      throw new RangeError(`TokenBucket rate must be a positive finite number, got ${rps}`);
    }
    // The bucket has to be able to hold one whole token; with a capacity
    // below 1 (rps < 1) acquire() could never succeed and would spin forever.
    this.capacity = Math.max(1, rps);
    this.tokens = this.capacity;
    this.lastRefill = Date.now();
    this.refillInterval = 1000 / rps;
  }

  /** Acquire one token, waiting until one is available. */
  async acquire(): Promise<void> {
    while (true) {
      this.refill();
      if (this.tokens >= 1) {
        this.tokens -= 1;
        return;
      }
      // No tokens — wait one refill interval, then check again
      await sleep(Math.ceil(this.refillInterval));
    }
  }

  /** Credit the tokens earned since the last refill, capped at `capacity`. */
  private refill(): void {
    const now = Date.now();
    const elapsed = now - this.lastRefill;
    const newTokens = Math.floor(elapsed / this.refillInterval);
    if (newTokens > 0) {
      this.tokens = Math.min(this.capacity, this.tokens + newTokens);
      // Carry the unused fraction of an interval over to the next refill.
      this.lastRefill = now - (elapsed % this.refillInterval);
    }
  }
}
|
||||
|
||||
// One limiter instance for the whole script: every searchImagesDuckDuckGo call
// takes a token from it, so all concurrent diseases share DDG_RATE_LIMIT_RPS.
const ddgLimiter: TokenBucket = new TokenBucket(DDG_RATE_LIMIT_RPS);
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Return a promise that resolves after `ms` milliseconds (setTimeout wrapper). */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Count the entries of `dir` whose name starts with `img_`.
 * Returns 0 when the directory does not exist or cannot be listed.
 */
function countImagesInDir(dir: string): number {
  if (!existsSync(dir)) return 0;

  let names: string[];
  try {
    names = readdirSync(dir);
  } catch {
    return 0;
  }

  let total = 0;
  for (const name of names) {
    if (name.startsWith("img_")) total += 1;
  }
  return total;
}
|
||||
|
||||
// ─── Seen-URLs Cache ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load the per-disease seen-URLs cache from SEEN_CACHE_FILE.
 * The cache maps a disease id to the URLs already seen for it, so they are
 * not fetched again on later runs.
 *
 * @returns The validated cache. An empty object is returned when the file is
 *   missing, unreadable, not valid JSON, or not a JSON object. Entries whose
 *   value is not an array are dropped, and non-string array items are removed.
 */
function loadSeenUrlsCache(): Record<string, string[]> {
  if (!existsSync(SEEN_CACHE_FILE)) return {};

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(SEEN_CACHE_FILE, "utf-8"));
  } catch (err) {
    console.warn(
      ` ⚠ Ignoring unreadable seen-URLs cache: ${err instanceof Error ? err.message : "unknown"}`,
    );
    return {};
  }

  // JSON.parse can legally yield null, an array or a primitive; callers index
  // the result by disease id, so anything but a plain object is discarded.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    console.warn(" ⚠ Ignoring seen-URLs cache: expected a JSON object");
    return {};
  }

  const cache: Record<string, string[]> = {};
  for (const [id, urls] of Object.entries(parsed)) {
    if (Array.isArray(urls)) {
      cache[id] = urls.filter((u): u is string => typeof u === "string");
    }
  }
  return cache;
}
|
||||
|
||||
/**
 * Persist the seen-URLs cache: the whole object is serialized as
 * pretty-printed JSON (2-space indent) and written to SEEN_CACHE_FILE.
 */
function saveSeenUrlsCache(cache: Record<string, string[]>): void {
  const serialized = JSON.stringify(cache, null, 2);
  writeFileSync(SEEN_CACHE_FILE, serialized);
}
|
||||
|
||||
// ─── DDG VQD Token Cache ──────────────────────────────────────────────────
|
||||
|
||||
/**
 * Simple in-memory cache for DDG VQD tokens, keyed by the exact query string.
 * A cached token lets getVqdToken skip the initial HTML page fetch when the
 * same query is requested again within the token's lifetime.
 */
const vqdCache = new Map<string, { token: string; expiresAt: number }>();

/**
 * Look up a still-valid token for `query`.
 * An expired entry is removed as a side effect.
 *
 * @returns The cached token, or `undefined` when there is none or it expired.
 */
function getCachedVqd(query: string): string | undefined {
  const entry = vqdCache.get(query);
  if (entry && entry.expiresAt > Date.now()) return entry.token;
  vqdCache.delete(query);
  return undefined;
}

/**
 * Store `token` for `query` with a 5 minute lifetime.
 * When the cache grows beyond 500 entries the oldest entry is evicted.
 */
function setCachedVqd(query: string, token: string): void {
  // Map.set() on an existing key keeps its original insertion position, so
  // remove it first: a refreshed token must count as the newest entry instead
  // of being the next eviction candidate.
  vqdCache.delete(query);
  // VQD tokens seem to be valid for a few minutes; cache for 5 min
  vqdCache.set(query, { token, expiresAt: Date.now() + 5 * 60 * 1000 });

  // Evict the oldest entry if the cache grows too large (unlikely but safe).
  if (vqdCache.size > 500) {
    const oldest = vqdCache.keys().next();
    // Check `done` rather than truthiness so an empty-string key is evictable.
    if (!oldest.done) vqdCache.delete(oldest.value);
  }
}
|
||||
|
||||
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Obtain the `vqd` token that the DuckDuckGo image endpoint expects.
 *
 * A cached token is returned when available; otherwise the DuckDuckGo search
 * page for `query` is fetched (15 s timeout), the token is extracted from its
 * HTML with a regular expression, cached and returned.
 *
 * @throws Error when the page request is not OK or no token is found.
 */
async function getVqdToken(query: string): Promise<string> {
  const cachedToken = getCachedVqd(query);
  if (cachedToken) return cachedToken;

  const pageUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const response = await fetch(pageUrl, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!response.ok) throw new Error(`Failed to get vqd token: ${response.status}`);

  const body = await response.text();
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) throw new Error(`Could not extract vqd token for "${query}"`);

  const token = found[1];
  setCachedVqd(query, token);
  return token;
}
|
||||
|
||||
/**
 * Fetch one page of DuckDuckGo image results.
 *
 * Every request (including a retry) first takes a token from the shared
 * ddgLimiter. A 429 response is retried once after a 5 s pause; any other
 * non-OK response yields an empty list instead of throwing.
 *
 * @param query   Search text.
 * @param vqd     Token obtained from getVqdToken for the same query.
 * @param page    Page value sent as the `p` query parameter.
 * @param attempt Internal retry counter; callers leave it at its default.
 * @returns The `results` array of the response, or `[]` when it is missing or
 *   the request was not OK.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
  attempt = 0,
): Promise<DuckDuckGoImageResult[]> {
  // "Retry once": without this cap a persistent 429 recursed forever.
  const MAX_RATE_LIMIT_RETRIES = 1;

  // Rate-limit before making the request
  await ddgLimiter.acquire();

  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
    query,
  )}&vqd=${vqd}&o=json&p=${page}&f=,,,`;

  const res = await fetch(url, {
    headers: {
      "User-Agent": UA,
      Accept: "application/json",
      Referer: `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`,
    },
    signal: AbortSignal.timeout(15_000),
  });

  if (!res.ok) {
    if (res.status === 429 && attempt < MAX_RATE_LIMIT_RETRIES) {
      // Rate limited — wait and retry
      await sleep(5_000);
      return searchImagesDuckDuckGo(query, vqd, page, attempt + 1);
    }
    // 403, repeated 429 and other errors: don't throw — just return empty
    return [];
  }

  const data = (await res.json()) as { results?: DuckDuckGoImageResult[] } | null;
  return data?.results ?? [];
}
|
||||
|
||||
/**
 * Gather up to `target` new image URLs from DuckDuckGo for one query.
 *
 * Pages 1..MAX_DDG_PAGES are requested in parallel (each request is
 * rate-limited inside searchImagesDuckDuckGo). Pages that failed are skipped.
 * A URL is accepted when it is a parsable URL not yet in `seenUrls`; accepted
 * URLs are added to `seenUrls`.
 *
 * @returns The accepted URLs and `exhausted = true` when fewer than `target`
 *   were found (also when the vqd token could not be obtained).
 */
async function collectFromDdgQuery(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  // Kick off every page request at once.
  const pageNumbers = Array.from({ length: MAX_DDG_PAGES }, (_, i) => i + 1);
  const settledPages = await Promise.allSettled(
    pageNumbers.map((pageNo) => searchImagesDuckDuckGo(query, vqd, pageNo)),
  );

  const isParsableUrl = (candidate: string): boolean => {
    try {
      new URL(candidate);
      return true;
    } catch {
      return false;
    }
  };

  const collected: string[] = [];
  for (const outcome of settledPages) {
    if (collected.length >= target) break;
    if (outcome.status !== "fulfilled") continue;

    for (const hit of outcome.value) {
      if (collected.length >= target) break;

      const candidate = hit.image || hit.url;
      if (!candidate || typeof candidate !== "string") continue;
      if (seenUrls.has(candidate)) continue;
      if (!isParsableUrl(candidate)) continue;

      seenUrls.add(candidate);
      collected.push(candidate);
    }
  }

  return { urls: collected.slice(0, target), exhausted: collected.length < target };
}
|
||||
|
||||
/**
 * Collect images from DDG across ALL queries for a disease.
 * Runs all queries in PARALLEL, then merges the results in query order.
 *
 * Each query marks the URLs it returns in `seenUrls`, which also removes
 * duplicates across queries. Because every query may return up to `target`
 * URLs, the merged list can be longer than `target`; the surplus is cut off
 * and removed from `seenUrls` again, since those URLs are never handed to the
 * downloader and must stay eligible for a later call.
 *
 * @returns At most `target` URLs and `exhausted = true` when all queries
 *   together produced fewer than `target`.
 */
async function collectImagesDuckDuckGo(
  queries: string[],
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  // Run all queries in parallel
  const queryResults = await Promise.allSettled(
    queries.map((q) => collectFromDdgQuery(q, target, seenUrls)),
  );

  // Merge results — seenUrls already deduplicates across queries
  const merged: string[] = [];
  for (const settled of queryResults) {
    if (settled.status === "fulfilled") merged.push(...settled.value.urls);
  }

  // Release the surplus so it is not treated as already handled.
  for (const dropped of merged.slice(target)) {
    seenUrls.delete(dropped);
  }

  return { urls: merged.slice(0, target), exhausted: merged.length < target };
}
|
||||
|
||||
// ─── iNaturalist API ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Search iNaturalist research-grade observations that have photos and return
 * up to `target` photo URLs not yet in `seenUrls`.
 *
 * A single request is made (`per_page` = min(target, 200)). Each photo URL is
 * rewritten from its size variant (square/thumb/small/medium/large) to the
 * `original` variant. The rewritten URL is both the de-duplication key and
 * the value added to `seenUrls`.
 *
 * @returns The new URLs. `exhausted` is true when the request succeeded but
 *   yielded fewer than `target` URLs; it is false on HTTP or network errors.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);

  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;

        // Upgrade whichever size variant the API returned to the original.
        const fullUrl = photo.url.replace(
          /\/(?:square|thumb|small|medium|large)\./,
          "/original.",
        );
        // Check the same key that is stored; testing the raw URL never
        // matched the rewritten entries already in seenUrls.
        if (seenUrls.has(fullUrl)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network, timeout or JSON errors: keep whatever was collected so far.
    return { urls: results, exhausted: false };
  }
}
|
||||
|
||||
// ─── Wikimedia Commons API ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Search the Wikimedia Commons File: namespace and return up to `target`
 * `Special:FilePath` URLs not yet in `seenUrls`.
 *
 * Results are paged 50 at a time. Hits whose file extension is not in
 * ALLOWED_EXTENSIONS are skipped, because the File: namespace also holds
 * PDFs, SVGs, audio and video. Paging stops when enough URLs were found, a
 * page is empty, the response carries no usable `continue.sroffset`, or a
 * request fails.
 *
 * @returns The new URLs and `exhausted = true` when fewer than `target` were
 *   found.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const results: string[] = [];
  let sroffset = 0;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
    });

    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
        // Only raster formats the dataset accepts.
        if (!ALLOWED_EXTENSIONS.includes(extname(filename).toLowerCase())) continue;

        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          filename,
        )}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // No continuation offset means this was the last page; an offset that
      // does not advance would loop forever.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined || nextOffset <= sroffset) break;
      sroffset = nextOffset;
    } catch {
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}
|
||||
|
||||
// ─── Image Download ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Identify a JPEG, PNG or WebP payload by its leading signature bytes.
 * @returns The matching file extension, or null for any other content.
 */
function sniffImageExtension(bytes: Uint8Array): ".jpg" | ".png" | ".webp" | null {
  const hasBytesAt = (signature: number[], offset = 0): boolean =>
    signature.every((byte, i) => bytes[offset + i] === byte);

  if (hasBytesAt([0xff, 0xd8, 0xff])) return ".jpg";
  if (hasBytesAt([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return ".png";
  // WebP: "RIFF" <4-byte size> "WEBP"
  if (hasBytesAt([0x52, 0x49, 0x46, 0x46]) && hasBytesAt([0x57, 0x45, 0x42, 0x50], 8)) {
    return ".webp";
  }
  return null;
}

/**
 * Download `url` and store it next to `destPath`.
 *
 * The download is rejected when the response is not OK, is served as
 * text/html, is smaller than MIN_IMAGE_SIZE or larger than MAX_IMAGE_SIZE, or
 * when its bytes are not a JPEG, PNG or WebP image. The file is written to
 * `destPath` with its extension replaced by the one matching the actual
 * content, so the name always reflects the real format.
 *
 * @returns true when a file was written, false otherwise. Never throws:
 *   network, timeout and file-system errors are reported as false.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(8_000),
    });
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
    if (contentType.includes("text/html")) return false;

    // Skip the transfer when the server already announces an oversized body.
    const declaredSize = Number(res.headers.get("content-length"));
    if (Number.isFinite(declaredSize) && declaredSize > MAX_IMAGE_SIZE) {
      void res.body?.cancel();
      return false;
    }

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    // Trust the bytes, not the URL or Content-Type: PDFs, SVGs, GIFs and the
    // like used to be stored under a .jpg name.
    const ext = sniffImageExtension(buffer);
    if (ext === null) return false;

    const filePath = destPath.replace(/\.\w+$/, ext);
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Download `urls` into `classDir`, CONCURRENT_DOWNLOADS at a time.
 *
 * The URL at position `k` is always written under index `startIndex + k`
 * (file name `img_<4-digit index>.jpg`; downloadImage may change the
 * extension). The index is fixed before any download starts, so downloads
 * running concurrently never share a file name. Failed downloads leave their
 * index unused.
 *
 * @returns How many downloads succeeded and failed, plus `lastIndex`, the
 *   first index after the range covered by this batch.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const outcomes = await Promise.all(
      chunk.map((url, offset) => {
        // Derived from the position, not from a shared counter: a counter
        // incremented after the await gave every URL of a chunk the same name.
        const fileIndex = startIndex + i + offset;
        const paddedIndex = String(fileIndex).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        return downloadImage(url, destPath);
      }),
    );

    for (const success of outcomes) {
      if (success) downloaded++;
      else failed++;
    }
  }

  return { downloaded, failed, lastIndex: startIndex + urls.length };
}
|
||||
|
||||
// ─── Query Building ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the three search queries used for one disease.
 *
 * @param name  Disease name.
 * @param plant Plant identifier; hyphens are turned into spaces (the same
 *   normalization buildHealthyQueries applies) so a slug reads as plain words.
 * @returns Queries ordered from most to least specific.
 */
function buildSearchQueries(name: string, plant: string): string[] {
  const plantName = plant.replace(/-/g, " ");
  return [
    `${name} ${plantName} leaf disease`,
    `${plantName} ${name} symptoms`,
    `${name} ${plantName}`,
  ];
}
|
||||
|
||||
/**
 * Build the four search queries for healthy specimens of a plant.
 * Hyphens in `plant` are replaced by spaces before the queries are composed.
 */
function buildHealthyQueries(plant: string): string[] {
  const readable = plant.split("-").join(" ");
  const queries: string[] = [];
  queries.push(`healthy ${readable} leaf`);
  queries.push(`${readable} leaf closeup`);
  queries.push(`healthy ${readable} plant`);
  queries.push(`${readable} foliage`);
  return queries;
}
|
||||
|
||||
// ─── Fill Logic ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Try to collect up to `needed` images for one class by querying all three
 * sources IN PARALLEL (DDG with all `queries`, iNaturalist and Wikimedia
 * Commons with `queries[0]`).
 *
 * Each source downloads its URLs as soon as its own search finishes, while
 * the other sources may still be searching.
 *
 * File numbering: every source writes into its own index range of width
 * `needed` (no source returns more than `needed` URLs), and the first range
 * starts above the highest `img_<n>` index already present in `classDir`.
 * This keeps the ranges disjoint for any `needed` and prevents existing files
 * from being overwritten when earlier runs left gaps in the numbering.
 *
 * @param _diseaseId Unused.
 * @param queries    Search queries for the class.
 * @param needed     Number of images still missing; nothing is done when <= 0.
 * @param classDir   Target directory (created if missing).
 * @param seenUrls   URLs already handled; the collectors add to it.
 * @returns The growth in the number of `img_*` files in `classDir`, or 0 when
 *   no download succeeded.
 */
async function fillClass(
  _diseaseId: string,
  queries: string[],
  needed: number,
  classDir: string,
  seenUrls: Set<string>,
): Promise<number> {
  if (needed <= 0) return 0;

  mkdirSync(classDir, { recursive: true });
  const startCount = countImagesInDir(classDir);

  // First index above every existing img_<n>.* file. The file COUNT is not
  // usable here: numbering has gaps (failed downloads, per-source ranges).
  let firstFreeIndex = 0;
  for (const file of readdirSync(classDir)) {
    const numbered = /^img_(\d+)\./.exec(file);
    if (numbered) firstFreeIndex = Math.max(firstFreeIndex, Number(numbered[1]) + 1);
  }

  let totalDownloaded = 0;
  let totalFailed = 0;

  // Run one source: search, then immediately download into its own range.
  const collectAndDownload = async (
    label: string,
    collector: () => Promise<CollectResult>,
    indexOffset: number,
  ): Promise<void> => {
    const result = await collector();
    if (result.urls.length === 0) return;
    console.log(` ${label}: ${result.urls.length} new URLs`);

    const { downloaded, failed } = await downloadBatch(result.urls, classDir, indexOffset);
    totalDownloaded += downloaded;
    totalFailed += failed;
  };

  // allSettled: a source that throws must not abort the other two.
  await Promise.allSettled([
    collectAndDownload(
      "DDG",
      () => collectImagesDuckDuckGo(queries, needed, seenUrls),
      firstFreeIndex,
    ),
    collectAndDownload(
      "iNat",
      () => searchImagesInaturalist(queries[0], needed, seenUrls),
      firstFreeIndex + needed,
    ),
    collectAndDownload(
      "Commons",
      () => searchImagesCommons(queries[0], needed, seenUrls),
      firstFreeIndex + 2 * needed,
    ),
  ]);

  if (totalDownloaded === 0) {
    console.log(` ✗ No new images found from any source`);
    return 0;
  }

  const newTotal = countImagesInDir(classDir);
  const gained = newTotal - startCount;
  console.log(
    ` ✓ ${totalDownloaded}/${totalDownloaded + totalFailed} downloaded` +
      ` (${totalFailed} failed, ${gained} new files)`,
  );

  return gained;
}
|
||||
|
||||
// ─── Directory Scanner ─────────────────────────────────────────────────────
|
||||
|
||||
/** What is currently on disk in the dataset directory. */
interface ScanResult {
  /** Directory name (used as disease id) → number of images found in it. */
  diseaseCounts: Map<string, number>;
  /** Number of images in the healthy class directory. */
  healthyCount: number;
}

/**
 * Walk the direct sub-directories of DATASET_DIR and count their images.
 *
 * Hidden directories (names starting with ".") and plain files are ignored.
 * The HEALTHY_CLASS directory feeds `healthyCount`; every other directory
 * that contains at least one image gets an entry in `diseaseCounts`.
 * A missing dataset directory yields an empty result.
 */
function scanDataset(): ScanResult {
  const diseaseCounts = new Map<string, number>();
  let healthyCount = 0;

  if (!existsSync(DATASET_DIR)) {
    return { diseaseCounts, healthyCount };
  }

  for (const entry of readdirSync(DATASET_DIR, { withFileTypes: true })) {
    const isClassDir = entry.isDirectory() && !entry.name.startsWith(".");
    if (!isClassDir) continue;

    const imageCount = countImagesInDir(resolve(DATASET_DIR, entry.name));
    if (entry.name === HEALTHY_CLASS) {
      healthyCount = imageCount;
    } else if (imageCount > 0) {
      diseaseCounts.set(entry.name, imageCount);
    }
  }

  return { diseaseCounts, healthyCount };
}
|
||||
|
||||
// ─── CLI Flags ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Read the command-line flags of the script.
 * `reverse` is true when `--reverse` or `-r` appears among the arguments that
 * follow the script path.
 */
function parseFlags(): { reverse: boolean } {
  const cliArgs = process.argv.slice(2);
  const reverse = cliArgs.some((arg) => arg === "--reverse" || arg === "-r");
  return { reverse };
}
|
||||
|
||||
// ─── Main ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async function main() {
|
||||
const flags = parseFlags();
|
||||
|
||||
console.log("=".repeat(60));
|
||||
console.log("TRAINING DATASET FILL — Parallelized gap-filling download");
|
||||
if (flags.reverse) console.log(" (reverse order — processing from lowest deficit first)");
|
||||
console.log("=".repeat(60));
|
||||
|
||||
// Ensure dataset directory exists
|
||||
mkdirSync(DATASET_DIR, { recursive: true });
|
||||
|
||||
// ── Step 1: Scan what we already have ────────────────────────────────────
|
||||
console.log("\nScanning existing dataset...");
|
||||
const { diseaseCounts, healthyCount } = scanDataset();
|
||||
console.log(` Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`);
|
||||
|
||||
// ── Step 2: Load disease info from DB ────────────────────────────────────
|
||||
console.log("\nLoading disease info from database...");
|
||||
const db = getDb();
|
||||
|
||||
const allDiseases = await db
|
||||
.select({
|
||||
id: diseases.id,
|
||||
plantId: diseases.plantId,
|
||||
name: diseases.name,
|
||||
})
|
||||
.from(diseases);
|
||||
|
||||
// Build a deduplicated map: disease id → first disease info found
|
||||
const diseaseInfo = new Map<string, { name: string; plantId: string }>();
|
||||
for (const d of allDiseases) {
|
||||
if (!diseaseInfo.has(d.id)) {
|
||||
diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId });
|
||||
}
|
||||
}
|
||||
console.log(` Loaded ${diseaseInfo.size} unique diseases from DB`);
|
||||
|
||||
// ── Step 3: Build deficit list ──────────────────────────────────────────
|
||||
const deficits: DiseaseInfo[] = [];
|
||||
|
||||
for (const [id, info] of diseaseInfo) {
|
||||
const have = diseaseCounts.get(id) ?? 0;
|
||||
const needed = TARGET_PER_DISEASE - have;
|
||||
if (needed > 0) {
|
||||
deficits.push({ id, name: info.name, plantId: info.plantId, have, needed });
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by deficit size (largest first) so we prioritize the neediest diseases
|
||||
deficits.sort((a, b) => b.needed - a.needed);
|
||||
|
||||
// Reverse order if --reverse/-r flag is set (useful to try a different
|
||||
// direction when the front of the queue keeps hitting dead URLs)
|
||||
if (flags.reverse) deficits.reverse();
|
||||
|
||||
const healthyDeficit = TARGET_HEALTHY - healthyCount;
|
||||
|
||||
console.log(`\n${"=".repeat(60)}`);
|
||||
console.log("DEFICIT REPORT");
|
||||
console.log(`${"=".repeat(60)}`);
|
||||
console.log(` Diseases needing images: ${deficits.length}/${diseaseInfo.size}`);
|
||||
console.log(` Total images missing: ${deficits.reduce((s, d) => s + d.needed, 0)}`);
|
||||
console.log(` Healthy deficit: ${Math.max(0, healthyDeficit)}`);
|
||||
console.log(` Parallelism: ${DISEASE_CONCURRENCY} diseases at once`);
|
||||
console.log(` DDG rate limit: ${DDG_RATE_LIMIT_RPS} req/s (shared)`);
|
||||
console.log(
|
||||
` Order: ${flags.reverse ? "reverse (--reverse)" : "normal (deficit-first)"}`,
|
||||
);
|
||||
console.log(`${"=".repeat(60)}`);
|
||||
|
||||
if (deficits.length === 0 && healthyDeficit <= 0) {
|
||||
console.log("\n ✓ Nothing to do — all targets met!\n");
|
||||
await closeDb();
|
||||
return;
|
||||
}
|
||||
|
||||
// ── Step 4: Load seen-URLs cache ────────────────────────────────────────
|
||||
const seenUrlsCache = loadSeenUrlsCache();
|
||||
let totalDownloaded = 0;
|
||||
let totalFailed = 0;
|
||||
let diseasesProcessed = 0;
|
||||
const startTime = Date.now();
|
||||
|
||||
// ── Step 5: Fill disease deficits ───────────────────────────────────────
|
||||
if (deficits.length > 0) {
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`);
|
||||
console.log("─".repeat(60));
|
||||
|
||||
// Process in parallel batches
|
||||
for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) {
|
||||
const batch = deficits.slice(i, i + DISEASE_CONCURRENCY);
|
||||
const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
|
||||
const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY);
|
||||
|
||||
console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`);
|
||||
|
||||
// Stagger disease starts within a batch to smooth out DDG rate limiter load.
|
||||
// Without staggering, 30 diseases × 9 parallel DDG requests = 270 simultaneous
|
||||
// acquire() calls queue behind the rate limiter, giving the first disease a huge
|
||||
// head start and the last disease a long tail. Staggering by 200ms each spreads
|
||||
// the load evenly, reducing tail latency and improving overall throughput.
|
||||
const STAGGER_MS = 200;
|
||||
const batchResults = await Promise.allSettled(
|
||||
batch.map((d, idx) =>
|
||||
(async () => {
|
||||
if (idx > 0) await sleep(idx * STAGGER_MS);
|
||||
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d.name, d.plantId);
|
||||
const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
|
||||
|
||||
console.log(
|
||||
` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
|
||||
);
|
||||
|
||||
const gained = await fillClass(d.id, queries, d.needed, classDir, seen);
|
||||
|
||||
// Update seen-URLs cache for this disease
|
||||
seenUrlsCache[d.id] = Array.from(seen);
|
||||
return gained;
|
||||
})(),
|
||||
),
|
||||
);
|
||||
|
||||
// Aggregate batch results
|
||||
for (const result of batchResults) {
|
||||
if (result.status === "fulfilled") {
|
||||
totalDownloaded += result.value;
|
||||
} else {
|
||||
console.error(` ✗ Disease failed: ${result.reason}`);
|
||||
}
|
||||
}
|
||||
|
||||
diseasesProcessed += batch.length;
|
||||
|
||||
// Flush seen-URLs cache to disk periodically (not after every disease)
|
||||
if (
|
||||
diseasesProcessed % SEEN_CACHE_FLUSH_INTERVAL < batch.length ||
|
||||
i + batch.length >= deficits.length
|
||||
) {
|
||||
saveSeenUrlsCache(seenUrlsCache);
|
||||
}
|
||||
|
||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
||||
const rate = diseasesProcessed / Math.max(1, elapsed);
|
||||
const remaining = deficits.length - diseasesProcessed;
|
||||
const eta = remaining / Math.max(0.01, rate);
|
||||
console.log(
|
||||
` [Batch ${batchNum}/${totalBatches}] checkpoint — ` +
|
||||
`${totalDownloaded} downloaded, ` +
|
||||
`${diseasesProcessed}/${deficits.length} diseases (${rate.toFixed(1)}/s, ` +
|
||||
`ETA: ${Math.round(eta)}s)`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Step 6: Fill healthy deficit ────────────────────────────────────────
|
||||
if (healthyDeficit > 0) {
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`);
|
||||
console.log("─".repeat(60));
|
||||
|
||||
const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
|
||||
mkdirSync(healthyDir, { recursive: true });
|
||||
|
||||
// Collect all unique plants from the disease info
|
||||
const allPlants = [...new Set(diseaseInfo.values())].map((d) => d.plantId);
|
||||
const allHealthyQueries: string[] = [];
|
||||
for (const plant of allPlants) {
|
||||
allHealthyQueries.push(...buildHealthyQueries(plant));
|
||||
}
|
||||
|
||||
const healthySeen = new Set<string>(seenUrlsCache[HEALTHY_CLASS] ?? []);
|
||||
const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir);
|
||||
|
||||
// Run all 3 sources in parallel for the healthy class too
|
||||
const [ddgUrls, inatUrls, commonsUrls] = await Promise.allSettled([
|
||||
collectImagesDuckDuckGo(
|
||||
allHealthyQueries.slice(0, MAX_HEALTHY_QUERIES),
|
||||
healthyNeeded,
|
||||
healthySeen,
|
||||
),
|
||||
searchImagesInaturalist(allHealthyQueries[0], healthyNeeded, healthySeen),
|
||||
searchImagesCommons(allHealthyQueries[0], healthyNeeded, healthySeen),
|
||||
]);
|
||||
|
||||
const allUrls: string[] = [];
|
||||
for (const settled of [ddgUrls, inatUrls, commonsUrls]) {
|
||||
if (settled.status === "fulfilled") {
|
||||
allUrls.push(...settled.value.urls);
|
||||
}
|
||||
}
|
||||
|
||||
if (allUrls.length > 0) {
|
||||
console.log(`\n Downloading ${allUrls.length} healthy images...`);
|
||||
const startIdx = countImagesInDir(healthyDir);
|
||||
const { downloaded, failed } = await downloadBatch(allUrls, healthyDir, startIdx);
|
||||
|
||||
const newTotal = countImagesInDir(healthyDir);
|
||||
const gained = newTotal - healthyCount;
|
||||
totalDownloaded += gained;
|
||||
totalFailed += failed;
|
||||
|
||||
console.log(
|
||||
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` +
|
||||
` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`,
|
||||
);
|
||||
} else {
|
||||
console.log(`\n ✗ No healthy images found`);
|
||||
}
|
||||
|
||||
// Update seen-URLs cache
|
||||
seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen);
|
||||
saveSeenUrlsCache(seenUrlsCache);
|
||||
}
|
||||
|
||||
// ── Summary ──────────────────────────────────────────────────────────────
|
||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
||||
const mins = Math.floor(elapsed / 60);
|
||||
const hrs = Math.floor(mins / 60);
|
||||
|
||||
// Final scan
|
||||
const finalScan = scanDataset();
|
||||
const totalHave = [...finalScan.diseaseCounts.values()].reduce((s, c) => s + c, 0);
|
||||
const atTarget = [...finalScan.diseaseCounts.values()].filter(
|
||||
(c) => c >= TARGET_PER_DISEASE,
|
||||
).length;
|
||||
|
||||
console.log("\n" + "=".repeat(60));
|
||||
console.log(" ✅ FILL COMPLETE");
|
||||
console.log("=".repeat(60));
|
||||
console.log(` Time: ${hrs}h ${mins % 60}m`);
|
||||
console.log(` Diseases at target: ${atTarget}/${diseaseInfo.size}`);
|
||||
console.log(` Total images: ${totalHave}`);
|
||||
console.log(` Healthy images: ${finalScan.healthyCount}/${TARGET_HEALTHY}`);
|
||||
console.log(` New downloads: ${totalDownloaded}`);
|
||||
console.log(` Dataset dir: ${DATASET_DIR}/`);
|
||||
|
||||
await closeDb();
|
||||
console.log("=".repeat(60));
|
||||
}
|
||||
|
||||
// Script entry point: run the fill pipeline and exit non-zero on any unhandled failure.
main().catch((err: unknown) => {
  // Prefer the stack trace. Interpolating the error into a template string
  // drops it, and renders non-Error throwables as "[object Object]".
  const detail = err instanceof Error ? (err.stack ?? err.message) : err;
  console.error("\nFatal error:");
  console.error(detail);
  process.exit(1);
});
|
||||
537
scripts/fine-tune-model.py
Normal file
537
scripts/fine-tune-model.py
Normal file
@@ -0,0 +1,537 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
fine-tune-model.py
|
||||
|
||||
Fine-tunes the PlantVillage MobileNetV2 model on a custom 95-class dataset
|
||||
(93 diseases + healthy + unknown).
|
||||
|
||||
Pipeline:
|
||||
1. Load `best_mnv2_pv_original.keras` (MobileNetV2 backbone + 38-class head)
|
||||
2. Replace the 38-class head with 95 classes (order matches diseases.json + healthy + unknown)
|
||||
3. Freeze backbone, train only the new classification head
|
||||
4. Unfreeze the last ~25 layers, fine-tune at lower learning rate
|
||||
5. Export to TF.js GraphModel format
|
||||
6. Export to .keras for future retraining
|
||||
|
||||
Usage: .tfjs-venv/bin/python scripts/fine-tune-model.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # Suppress TF info/warnings
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import keras
|
||||
from keras import layers, optimizers, regularizers
|
||||
|
||||
# ─── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
# Repository root: this script lives one directory below it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Directory that holds the pretrained checkpoint and receives every export.
OUTPUT_DIR = PROJECT_ROOT.joinpath("public", "models", "plant-disease-classifier")
MODEL_PATH = OUTPUT_DIR.joinpath("best_mnv2_pv_original.keras")
TFJS_OUTPUT = OUTPUT_DIR.joinpath("tfjs_finetuned")

# Inputs: the ordered disease list and the per-class image directories.
DISEASES_JSON = PROJECT_ROOT.joinpath("src", "data", "diseases.json")
DATASET_DIR = PROJECT_ROOT.joinpath("data", "dataset")

# Training hyper-parameters.
IMG_SIZE = 160  # Model input size (images are resized to IMG_SIZE x IMG_SIZE)
BATCH_SIZE = 32
EPOCHS_HEAD = 15  # Stage 1: train just the new head
EPOCHS_FINETUNE = 10  # Stage 2: unfreeze and fine-tune
LEARNING_RATE_HEAD = 0.001
LEARNING_RATE_FINETUNE = 0.00001
VALIDATION_SPLIT = 0.15

# healthy(0) + 93 diseases + unknown(94)
NUM_CLASSES = 95
|
||||
|
||||
# ─── Class Mapping ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def build_class_mapping():
    """
    Map dataset directory names to model class indices, and back.

    Reads the JSON array at DISEASES_JSON and assigns:
      - "healthy"            -> 0
      - each entry's "id"    -> its 1-based position in the array
      - "unknown"            -> 94 (reserved slot)

    Returns:
        (mapping, index_to_class): the name -> index dict and its inverse.
    """
    with open(DISEASES_JSON) as fh:
        disease_entries = json.load(fh)

    mapping = {"healthy": 0}
    mapping.update(
        (entry["id"], position)
        for position, entry in enumerate(disease_entries, start=1)
    )
    mapping["unknown"] = 94

    # Inverse lookup, used to turn predicted indices back into class names.
    index_to_class = {}
    for class_name, class_index in mapping.items():
        index_to_class[class_index] = class_name

    return mapping, index_to_class
|
||||
|
||||
|
||||
def verify_dataset(mapping):
    """
    Count the images present on disk for every class in ``mapping``.

    A class is looked up as the directory DATASET_DIR/<class_id>. Only files
    whose suffix is .jpg/.jpeg/.png/.webp (case-insensitive) are counted.
    Classes with a missing directory or with no matching files are omitted.

    Returns:
        (available, total): ``available`` maps class_id to
        {"index": class index, "count": image count}; ``total`` is the sum
        of all counts.
    """
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")
    available = {}
    total = 0

    for class_id, class_idx in mapping.items():
        class_dir = DATASET_DIR / class_id
        if class_dir.exists():
            image_count = sum(
                1
                for candidate in class_dir.glob("*")
                if candidate.suffix.lower() in image_suffixes
            )
            if image_count:
                available[class_id] = {"index": class_idx, "count": image_count}
                total += image_count

    return available, total
|
||||
|
||||
|
||||
def print_dataset_summary(available, total):
    """
    Print a human-readable summary of the dataset found on disk.

    Args:
        available: Dict as returned by verify_dataset():
            class_id -> {"index": class index, "count": image count}.
        total: Total number of images across all available classes.
    """
    # Load the class mapping once. It re-reads diseases.json on every call,
    # and only its size is needed here.
    defined_classes = len(build_class_mapping()[0])
    rule = "─" * 60

    print(f"\n{rule}")
    print("DATASET SUMMARY")
    print(rule)
    print(f" Total images: {total}")
    print(f" Classes found: {len(available)} / {defined_classes}")
    print(f" Missing classes with no images: {defined_classes - len(available)}")

    # Rows are (index, class_id, count), listed alphabetically by class_id.
    counts = [(v["index"], k, v["count"]) for k, v in available.items()]
    counts.sort(key=lambda row: row[1])

    print("\n Images per class:")
    for idx, class_id, count in counts:
        label = f" {idx:3d}. {class_id:<35s} {count:>4d} images"
        if class_id == "healthy":
            label += " ← 2× target"
        print(label)

    # Aggregate stats; skipped for an empty dataset because min()/max() would raise.
    class_counts = [v["count"] for v in available.values()]
    if class_counts:
        print(
            f"\n Min: {min(class_counts)} Max: {max(class_counts)} Avg: {sum(class_counts) / len(class_counts):.0f}"
        )
    print(f"{rule}\n")
|
||||
|
||||
|
||||
# ─── Data Loading ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_dataset(mapping, available):
    """
    Build the training and validation tf.data pipelines.

    Args:
        mapping: Class-name -> index dict. Not used here (labels come from
            ``available``); kept so existing callers keep working.
        available: Dict as returned by verify_dataset():
            class_id -> {"index": class index, "count": image count}.

    Returns:
        (train_ds, val_ds): batched, prefetched datasets of (image, label)
        pairs. Images are float32, IMG_SIZE x IMG_SIZE x 3, normalized with
        the ImageNet mean/std. Only the training set is augmented.
    """
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")

    # Collect (path, label) pairs; sorted so the listing is deterministic.
    file_paths = []
    labels = []
    for class_id, info in available.items():
        class_dir = DATASET_DIR / class_id
        for img_path in sorted(class_dir.glob("*")):
            if img_path.suffix.lower() in image_suffixes:
                file_paths.append(str(img_path))
                labels.append(info["index"])

    file_paths = np.array(file_paths)
    labels = np.array(labels)

    # Fixed-seed shuffle so the train/validation split is reproducible.
    indices = np.random.RandomState(42).permutation(len(file_paths))
    file_paths = file_paths[indices]
    labels = labels[indices]

    # Split train/validation (not stratified: rare classes may miss one side).
    split = int(len(file_paths) * (1 - VALIDATION_SPLIT))
    train_paths, val_paths = file_paths[:split], file_paths[split:]
    train_labels, val_labels = labels[:split], labels[split:]

    print(f" Train: {len(train_paths)} images")
    print(f" Val: {len(val_paths)} images")

    def decode(image_path, label):
        """Read one image file and return it as float32 RGB in [0, 1]."""
        # NOTE(review): whether decode_image handles the .webp files accepted
        # above depends on the installed TensorFlow version - confirm.
        img = tf.io.read_file(image_path)
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
        img = tf.ensure_shape(img, [IMG_SIZE, IMG_SIZE, 3])
        img = tf.cast(img, tf.float32) / 255.0
        return img, label

    def augment(image, label):
        """Random augmentation, applied to [0, 1] RGB before normalization."""
        # The saturation and hue ops convert RGB <-> HSV, which is only
        # meaningful for values in [0, 1]. They therefore have to run before
        # the mean/std normalization, not after it.
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        image = tf.image.random_brightness(image, 0.15)
        image = tf.image.random_contrast(image, 0.8, 1.2)
        image = tf.clip_by_value(image, 0.0, 1.0)
        image = tf.image.random_saturation(image, 0.8, 1.2)
        image = tf.image.random_hue(image, 0.05)
        image = tf.clip_by_value(image, 0.0, 1.0)
        # Random crop after slightly scaling up. Padding and then resizing
        # back down would be deterministic and would only add a black border.
        image = tf.image.resize(image, [IMG_SIZE + 12, IMG_SIZE + 12])
        image = tf.image.random_crop(image, [IMG_SIZE, IMG_SIZE, 3])
        return image, label

    def normalize(image, label):
        """Apply ImageNet mean/std normalization to a [0, 1] RGB image."""
        # NOTE(review): the original comment says this matches the base
        # model's training-time preprocessing - confirm against that model.
        mean = tf.constant([0.485, 0.456, 0.406])
        std = tf.constant([0.229, 0.224, 0.225])
        return (image - mean) / std, label

    # Training pipeline: decode -> augment -> normalize -> shuffle -> batch.
    train_ds = tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
    train_ds = train_ds.map(decode, num_parallel_calls=tf.data.AUTOTUNE)
    train_ds = train_ds.map(augment, num_parallel_calls=tf.data.AUTOTUNE)
    train_ds = train_ds.map(normalize, num_parallel_calls=tf.data.AUTOTUNE)
    train_ds = train_ds.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    # Validation pipeline: same decoding and normalization, no augmentation.
    val_ds = tf.data.Dataset.from_tensor_slices((val_paths, val_labels))
    val_ds = val_ds.map(decode, num_parallel_calls=tf.data.AUTOTUNE)
    val_ds = val_ds.map(normalize, num_parallel_calls=tf.data.AUTOTUNE)
    val_ds = val_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    return train_ds, val_ds
|
||||
|
||||
|
||||
# ─── Model Building ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def build_model():
    """
    Load the pretrained model at MODEL_PATH and attach a new NUM_CLASSES head.

    The new head (Dropout + softmax Dense) is connected to the output of the
    loaded model's "global_average_pooling2d" layer. Every loaded layer other
    than the old "dense" head is frozen. Exits the process with status 1 if
    the model file does not exist.

    Returns:
        The new keras.Model, which shares the loaded model's input.
    """

    def count_params(weights):
        # Works whether ``shape`` is a TensorShape or a plain tuple, which
        # depends on the installed Keras version.
        return sum(int(np.prod(tuple(w.shape))) for w in weights)

    print(f"\nLoading base model from: {MODEL_PATH}")
    if not MODEL_PATH.exists():
        print(f"ERROR: Model not found at {MODEL_PATH}")
        sys.exit(1)

    base_model = keras.models.load_model(str(MODEL_PATH))
    print(f" Base model loaded: {type(base_model).__name__}")
    print(f" Input shape: {base_model.input_shape}")
    print(f" Output shape: {base_model.output_shape}")

    # Extract backbone - everything up to the GlobalAveragePooling2D.
    # The expected model structure is:
    # input_layer_2 → mobilenetv2_1.00_160 → global_average_pooling2d → dropout → dense(38)
    backbone_output = base_model.get_layer("global_average_pooling2d").output
    print(" Using backbone output: global_average_pooling2d")

    # Freeze all backbone layers initially (they are unfrozen later for
    # fine-tuning). The old "dense" head is skipped: it is replaced below.
    for layer in base_model.layers:
        if layer.name != "dense":
            layer.trainable = False

    # Build new classification head.
    x = layers.Dropout(0.3, name="dropout_new")(backbone_output)
    x = layers.Dense(
        NUM_CLASSES,
        activation="softmax",
        name="dense_new",
        kernel_regularizer=regularizers.l2(1e-4),
    )(x)

    model = keras.Model(
        inputs=base_model.input, outputs=x, name="plant-disease-classifier-v2"
    )

    print(f" New model input: {model.input_shape}")
    print(f" New model output: {model.output_shape} ({NUM_CLASSES} classes)")

    # Report the size of the frozen backbone. This has to count all weights:
    # after freezing, ``trainable_weights`` is empty and would report 0.
    backbone_frozen = count_params(
        w
        for layer in base_model.layers
        if layer.name != "dense"
        for w in layer.weights
    )
    head_trainable = count_params(model.get_layer("dense_new").trainable_weights)

    print(f" Backbone frozen: {backbone_frozen:,} params (not training)")
    print(f" New head: {head_trainable:,} params (training)")

    return model
|
||||
|
||||
|
||||
# ─── Training ────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def train_head(model, train_ds, val_ds):
    """
    Stage 1: train only the new classification head.

    Freezes every top-level layer except "dense_new", recompiles the model
    with Adam at LEARNING_RATE_HEAD and fits for up to EPOCHS_HEAD epochs,
    with early stopping on val_accuracy and LR reduction on val_loss.

    Args:
        model: Model returned by build_model().
        train_ds: Training dataset of (image, label) batches.
        val_ds: Validation dataset of (image, label) batches.

    Returns:
        The Keras History object from ``model.fit``.
    """

    def count_params(weights):
        # Works whether ``shape`` is a TensorShape or a plain tuple, which
        # depends on the installed Keras version.
        return sum(int(np.prod(tuple(w.shape))) for w in weights)

    print(f"\n{'=' * 60}")
    print("STAGE 1: Training classification head")
    print(f"{'=' * 60}")
    print(f" Epochs: {EPOCHS_HEAD}")
    print(f" Learning rate: {LEARNING_RATE_HEAD}")
    print(f" Batch size: {BATCH_SIZE}")

    # Only the new dense head learns in this stage.
    for layer in model.layers:
        layer.trainable = layer.name == "dense_new"

    # Sanity check: the trainable count should equal the head's size.
    trainable = count_params(model.trainable_weights)
    total = count_params(model.weights)
    print(f" Trainable params: {trainable:,} / {total:,} total")

    # Compile after changing ``trainable`` so the change takes effect.
    model.compile(
        optimizer=optimizers.Adam(learning_rate=LEARNING_RATE_HEAD),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", "sparse_top_k_categorical_accuracy"],
    )

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS_HEAD,
        verbose=1,
        callbacks=[
            keras.callbacks.EarlyStopping(
                monitor="val_accuracy",
                patience=3,
                restore_best_weights=True,
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor="val_loss",
                factor=0.5,
                patience=2,
                min_lr=1e-6,
            ),
        ],
    )

    # EarlyStopping may restore the best epoch's weights, so the last epoch
    # alone can understate what the model achieves. Report both.
    val_acc = history.history["val_accuracy"]
    print(
        f"\n Stage 1 complete! Val accuracy: {val_acc[-1]:.4f}"
        f" (best epoch: {max(val_acc):.4f})"
    )
    return history
|
||||
|
||||
|
||||
def train_finetune(model, train_ds, val_ds):
    """
    Stage 2: unfreeze the last ~25 backbone layers and fine-tune.

    Re-enables training for the tail of the nested "mobilenetv2_1.00_160"
    model (BatchNormalization layers stay frozen) and for the new head,
    recompiles with Adam at LEARNING_RATE_FINETUNE and fits for up to
    EPOCHS_FINETUNE epochs.

    Args:
        model: Model already trained by train_head().
        train_ds: Training dataset of (image, label) batches.
        val_ds: Validation dataset of (image, label) batches.

    Returns:
        The Keras History object from ``model.fit``.
    """

    def count_params(weights):
        # Works whether ``shape`` is a TensorShape or a plain tuple, which
        # depends on the installed Keras version.
        return sum(int(np.prod(tuple(w.shape))) for w in weights)

    print(f"\n{'=' * 60}")
    print("STAGE 2: Fine-tuning backbone (last ~25 layers)")
    print(f"{'=' * 60}")
    print(f" Epochs: {EPOCHS_FINETUNE}")
    print(f" Learning rate: {LEARNING_RATE_FINETUNE}")

    # The backbone is a Functional model nested inside the full model.
    mobilenet_layer = model.get_layer("mobilenetv2_1.00_160")

    # Stage 1 froze the backbone as a whole. A container with trainable=False
    # can hide its children's weights from training, so re-enable the
    # container first. The loop below then sets every sub-layer explicitly.
    mobilenet_layer.trainable = True

    total_backbone_layers = len(mobilenet_layer.layers)
    unfreeze_from = max(0, total_backbone_layers - 25)
    print(
        f" Backbone has {total_backbone_layers} layers, unfreezing from layer {unfreeze_from}"
    )

    for i, layer in enumerate(mobilenet_layer.layers):
        # Keep BatchNormalization frozen (inference mode) during fine-tuning:
        # updating its moving statistics on a small dataset tends to wreck
        # the pretrained features.
        is_batch_norm = isinstance(layer, layers.BatchNormalization)
        layer.trainable = i >= unfreeze_from and not is_batch_norm

    # The new head keeps training as well.
    model.get_layer("dense_new").trainable = True
    model.get_layer("dropout_new").trainable = True

    trainable = count_params(model.trainable_weights)
    total = count_params(model.weights)
    print(f" Trainable params: {trainable:,} / {total:,} total")

    # Recompile so the new trainable flags and the lower LR take effect.
    model.compile(
        optimizer=optimizers.Adam(learning_rate=LEARNING_RATE_FINETUNE),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", "sparse_top_k_categorical_accuracy"],
    )

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS_FINETUNE,
        verbose=1,
        callbacks=[
            keras.callbacks.EarlyStopping(
                monitor="val_accuracy",
                patience=3,
                restore_best_weights=True,
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor="val_loss",
                factor=0.5,
                patience=2,
                min_lr=1e-7,
            ),
        ],
    )

    # EarlyStopping may restore the best epoch's weights, so the last epoch
    # alone can understate what the model achieves. Report both.
    val_acc = history.history["val_accuracy"]
    print(
        f"\n Stage 2 complete! Val accuracy: {val_acc[-1]:.4f}"
        f" (best epoch: {max(val_acc):.4f})"
    )
    return history
|
||||
|
||||
|
||||
# ─── Export ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def export_models(model, class_mapping, index_to_class):
    """
    Export the trained model to .keras and TF.js formats.

    Writes, under OUTPUT_DIR:
      - model-finetuned.keras  (for future retraining)
      - class_mapping.json     (both mapping directions, class count, input size)
      - tfjs_finetuned/        (TF.js export; best-effort)

    A TF.js export failure is reported but does not raise; the message shows
    the command to run the conversion later.

    Args:
        model: The trained Keras model.
        class_mapping: Class-name -> index dict.
        index_to_class: Index -> class-name dict. JSON serialization turns
            its integer keys into strings.
    """
    print(f"\n{'=' * 60}")
    print("EXPORTING")
    print(f"{'=' * 60}")

    # 1. Save as .keras (for future retraining)
    keras_path = OUTPUT_DIR / "model-finetuned.keras"
    model.save(str(keras_path))
    print(f" ✓ Saved .keras: {keras_path}")

    # 2. Save class mapping alongside the model
    mapping_path = OUTPUT_DIR / "class_mapping.json"
    with open(mapping_path, "w") as mapping_file:
        json.dump(
            {
                "index_to_class": index_to_class,
                "class_to_index": class_mapping,
                "num_classes": NUM_CLASSES,
                "input_size": IMG_SIZE,
            },
            mapping_file,
            indent=2,
        )
    print(f" ✓ Saved class mapping: {mapping_path}")

    # 3. Export to TF.js format
    tfjs_path = str(TFJS_OUTPUT)

    def report_tfjs_failure(error):
        print(f" ⚠ TF.js export failed: {error}")
        print(
            f" Run later: tensorflowjs_converter --input_format=keras {keras_path} {tfjs_path}"
        )

    # Import the converter before touching the previous export. If it is
    # not installed, the old TF.js files are left in place.
    try:
        import tensorflowjs as tfjs
    except Exception as e:
        report_tfjs_failure(e)
        return

    if TFJS_OUTPUT.exists():
        shutil.rmtree(tfjs_path)

    try:
        tfjs.converters.save_keras_model(model, tfjs_path)
        print(f" ✓ Saved TF.js: {tfjs_path}/")
        for exported in sorted(TFJS_OUTPUT.iterdir()):
            size = exported.stat().st_size
            print(f" {exported.name:<30s} {size:>10,} bytes")
    except Exception as e:
        report_tfjs_failure(e)
|
||||
|
||||
|
||||
# ─── Cleanup Old Model Files ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def cleanup_old_model():
    """
    Delete the previous TF.js model files that sit directly in OUTPUT_DIR.

    Removes "model.json" first, then every "group1-shard*" file, printing
    one line per removed file. Subdirectories are not touched.
    """
    for pattern in ("model.json", "group1-shard*"):
        for stale_file in OUTPUT_DIR.glob(pattern):
            print(f" Removing old: {stale_file.name}")
            stale_file.unlink()
|
||||
|
||||
|
||||
# ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main():
    """
    Run the full fine-tuning pipeline.

    Steps: build the class mapping, verify the dataset on disk, load it,
    build the model, train (head, then fine-tune), and export. Exits with
    status 1 if the dataset directory is missing or contains no images.
    """
    print("=" * 60)
    print("PLANT DISEASE MODEL FINE-TUNER")
    print("=" * 60)

    # 1. Build class mapping
    print("\n[1/5] Building class mapping...")
    class_mapping, index_to_class = build_class_mapping()
    print(
        f" {len(class_mapping)} classes defined (0=healthy, 1-93=diseases, 94=unknown)"
    )

    # 2. Verify dataset
    print("\n[2/5] Verifying dataset...")
    if not DATASET_DIR.exists():
        print(f" ERROR: Dataset not found at {DATASET_DIR}")
        print(" Run the scraper first: npx tsx scripts/scrape-training-dataset.ts")
        sys.exit(1)

    available, total = verify_dataset(class_mapping)
    print_dataset_summary(available, total)

    # Bail out before any expensive work: an empty dataset cannot be turned
    # into an input pipeline, and loading the model would be pointless.
    if total == 0:
        print("\n Skipping training — no dataset available.")
        sys.exit(1)

    if total < 100:
        # Informational only: the script never reads an answer, so it does
        # not print a y/n prompt.
        print(f" WARNING: Only {total} images. Consider scraping more data.")
        print(" Continuing anyway.")

    # 3. Load dataset
    print("\n[3/5] Loading and augmenting dataset...")
    train_ds, val_ds = load_dataset(class_mapping, available)

    # 4. Build and train model
    print("\n[4/5] Building model...")
    model = build_model()
    model.summary()

    train_head(model, train_ds, val_ds)
    train_finetune(model, train_ds, val_ds)

    # 5. Export
    print("\n[5/5] Exporting models...")
    cleanup_old_model()
    export_models(model, class_mapping, index_to_class)

    # ── Final Summary ────────────────────────────────────────────────────────

    print(f"\n{'=' * 60}")
    print("DONE! Model fine-tuned and exported.")
    print(f"{'=' * 60}")
    print("\nFiles created:")
    print(f" {OUTPUT_DIR / 'model-finetuned.keras'}")
    print(f" {OUTPUT_DIR / 'class_mapping.json'}")
    print(f" {TFJS_OUTPUT / 'model.json'}")
    print("\nTo update your app:")
    print(" 1. Replace model files:")
    print(f" cp {TFJS_OUTPUT / 'model.json'} {OUTPUT_DIR / 'model.json'}")
    # Joining a Path with "/" yields the filesystem root, so the trailing
    # slash is appended as text instead.
    print(f" cp {TFJS_OUTPUT / 'group1-shard*'} {OUTPUT_DIR}/")
    print(" 2. Restart the dev server")
    print(" 3. Test with: POST /api/identify")
    print("\nNote: Update labels.ts if the class order changed.")


if __name__ == "__main__":
    main()
|
||||
212
scripts/fix-classifications.ts
Normal file
212
scripts/fix-classifications.ts
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fix-classifications.ts — Fix misclassified diseases in the DB.
|
||||
*
|
||||
* Fixes:
|
||||
* 1. Diseases named with viral indicators (mosaic, mottle, ringspot, virus, etc.)
|
||||
* that are incorrectly tagged as "fungal"
|
||||
* 2. Other suspicious patterns
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fix-classifications.ts
|
||||
*/
|
||||
|
||||
import { readFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// Manually load .env.development
|
||||
// Minimal .env loader: copies KEY=VALUE pairs from .env.development into
// process.env without overriding variables that already hold a value.
const envPath = resolve(__dirname, "../.env.development");
try {
  const rawLines = readFileSync(envPath, "utf-8").split("\n");
  for (const rawLine of rawLines) {
    const entry = rawLine.trim();
    // Skip blank lines and comments.
    if (entry === "" || entry.startsWith("#")) continue;

    // A usable entry needs a non-empty key before the first "=".
    const separatorAt = entry.indexOf("=");
    if (separatorAt <= 0) continue;

    const name = entry.slice(0, separatorAt).trim();
    if (process.env[name]) continue;
    process.env[name] = entry.slice(separatorAt + 1).trim();
  }
} catch {
  // Best effort: an unreadable or missing env file is ignored.
}
|
||||
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
import { createClient } from "@libsql/client";
|
||||
|
||||
/** Causal-agent categories stored in `diseases.causal_agent_type`. */
type AgentType = "fungal" | "bacterial" | "viral" | "environmental";

/** One name-based heuristic for correcting a disease's causal agent. */
interface FixRule {
  /** Returns true when the disease name matches this rule. */
  test: (name: string) => boolean;
  /** Agent type the disease should have when the rule matches. */
  correctAgent: AgentType;
  /** Human-readable justification for the correction. */
  reason: string;
}

/**
 * Ordered correction rules. The first matching rule wins.
 *
 * Rules that name the pathogen explicitly (virus, virus genus, bacterium) come
 * before the symptom-based heuristics, so a name such as "Bacterial mosaic" is
 * classified by its stated agent and not by the "mosaic" symptom.
 */
const FIX_RULES: FixRule[] = [
  // Diseases explicitly named as "virus" or "viral"
  {
    test: (name) => /\b(virus|viral|viroid)\b/i.test(name),
    correctAgent: "viral",
    reason: "Name explicitly indicates viral disease",
  },
  // Potexvirus, carlavirus, etc. The genus prefix may stand alone or be
  // followed by "vir…" ("Potexvirus", "Tobamoviruses"). Requiring a word
  // boundary straight after the prefix never matched the full genus names.
  {
    test: (name) =>
      /\b(?:virus|(?:potex|carla|tobamo|poty|cucumo|ilar|nepo)(?:vir\w*)?|(?:tymo|gemini)vir\w*|tom(?:ato)? bushy stunt)\b/i.test(
        name,
      ),
    correctAgent: "viral",
    reason: "Recognized virus genus in name",
  },
  // Explicit bacterial in name
  {
    test: (name) =>
      /\bbacterial\b|\bbacterium\b|\berwinia\b|\bpseudomonas\b|\bxanthomonas\b|\bralstonia\b|\bclavibacter\b|\bstreptomyces\b|\bagrobacterium\b/i.test(
        name,
      ),
    correctAgent: "bacterial",
    reason: "Name indicates bacterial disease",
  },
  // "Mosaic" diseases (typically viral)
  {
    test: (name) => /\bmosaic\b/i.test(name),
    correctAgent: "viral",
    reason: "Mosaic symptoms are typically caused by viruses",
  },
  // "Mottle" diseases (typically viral)
  {
    test: (name) => /\bmottle\b/i.test(name),
    correctAgent: "viral",
    reason: "Mottle symptoms are typically caused by viruses",
  },
  // "Ringspot" diseases (typically viral)
  {
    test: (name) => /\bringspot\b/i.test(name),
    correctAgent: "viral",
    reason: "Ringspot symptoms are typically caused by viruses",
  },
  // "Leaf curl" (many are viral)
  // NOTE(review): peach leaf curl is usually attributed to a fungus; the
  // peach exclusion on the "yellows" rule below may have been meant for
  // this rule - confirm with a domain expert.
  {
    test: (name) => /\bleaf curl\b|\bleafroll\b|\bleaf-roll\b/i.test(name),
    correctAgent: "viral",
    reason: "Leaf curl/roll diseases are often viral",
  },
  // "Rosette" (often viral or phytoplasma)
  {
    test: (name) => /\brosette\b/i.test(name),
    correctAgent: "viral",
    reason: "Rosette diseases are typically viral or phytoplasma",
  },
  // "Yellows" (often phytoplasma/viral); names containing "peach" are excluded
  {
    test: (name) => /\byellows\b/i.test(name) && !/\bpeach\b/i.test(name),
    correctAgent: "viral",
    reason: "Yellows diseases are typically phytoplasma or viral",
  },
  // "Stunt" / "Dwarf" (often viral)
  {
    test: (name) => /\b(stunt|dwarf(ism)?)\b/i.test(name),
    correctAgent: "viral",
    reason: "Stunting/dwarfing diseases are often viral",
  },
  // Environmental/abiotic indicators
  // NOTE(review): "snow mold" is commonly described as a fungal disease;
  // confirm it belongs in the environmental list.
  {
    test: (name) =>
      /\b(deficiency|abiotic|environmental|injury|damage|stress|sunscald|sunburn|chilling|freeze|frost|wind|hail|nutrient|toxicity|snow\s+(mold|scald)|winter\s+(injury|rot|kill))\b/i.test(
        name,
      ),
    correctAgent: "environmental",
    reason: "Name indicates abiotic/environmental cause",
  },
];
|
||||
|
||||
/**
 * Scans every disease in the DB, applies FIX_RULES to its name (first matching
 * rule wins) and rewrites `causal_agent_type` where it differs from the rule's
 * agent. Prints the planned corrections grouped by old→new agent, applies them
 * in batches, then prints the resulting agent-type distribution.
 *
 * @throws Error when DATABASE_URL is not set.
 */
async function main() {
  console.log("🔍 Fixing disease classifications\n");

  // Fail fast with a clear message instead of passing `undefined` to the client.
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error("DATABASE_URL is not set (expected in the environment or .env.development)");
  }

  const db = getDb();
  // Raw libsql client, used only for the batched UPDATE statements below.
  const rawClient = createClient({
    url: databaseUrl,
    authToken: process.env.DATABASE_TOKEN,
  });

  try {
    const allDiseases = await db
      .select({ id: diseases.id, name: diseases.name, causalAgentType: diseases.causalAgentType })
      .from(diseases)
      .all();
    console.log(`📋 ${allDiseases.length} total diseases\n`);

    const updates: { id: string; newAgent: AgentType; rule: FixRule; oldAgent: string }[] = [];

    for (const d of allDiseases) {
      // First matching rule wins; later rules are not consulted.
      const rule = FIX_RULES.find((candidate) => candidate.test(d.name));
      if (rule && d.causalAgentType !== rule.correctAgent) {
        updates.push({
          id: d.id,
          newAgent: rule.correctAgent,
          rule,
          oldAgent: d.causalAgentType,
        });
      }
    }

    console.log(`Found ${updates.length} diseases needing reclassification:\n`);

    // Group by correction type
    const grouped: Record<string, { from: string; to: string; items: string[] }> = {};
    for (const u of updates) {
      const key = `${u.oldAgent}→${u.newAgent}`;
      if (!grouped[key]) grouped[key] = { from: u.oldAgent, to: u.newAgent, items: [] };
      grouped[key].items.push(` ${u.id}`);
    }

    for (const g of Object.values(grouped)) {
      console.log(`${g.from} → ${g.to} (${g.items.length} diseases):`);
      g.items.slice(0, 10).forEach((l) => console.log(l));
      if (g.items.length > 10) console.log(` ... and ${g.items.length - 10} more`);
      console.log();
    }

    // Apply updates
    if (updates.length === 0) {
      console.log("✅ No corrections needed");
    } else {
      console.log(`Applying ${updates.length} corrections...\n`);

      // Batch update in groups of 50
      const batchSize = 50;
      for (let i = 0; i < updates.length; i += batchSize) {
        const batch = updates.slice(i, i + batchSize);
        await rawClient.batch(
          batch.map((u) => ({
            sql: "UPDATE diseases SET causal_agent_type = ?, updated_at = datetime('now') WHERE id = ?",
            args: [u.newAgent, u.id],
          })),
          "write",
        );
        process.stdout.write(` ${Math.min(i + batchSize, updates.length)}/${updates.length}\n`);
      }

      console.log(`\n✅ ${updates.length} diseases reclassified`);
    }

    // Print summary stats
    const after = await db
      .select({ causalAgentType: diseases.causalAgentType })
      .from(diseases)
      .all();
    const counts: Record<string, number> = {};
    after.forEach((d) => {
      counts[d.causalAgentType] = (counts[d.causalAgentType] || 0) + 1;
    });
    console.log("\n📊 Updated distribution:");
    for (const [type, count] of Object.entries(counts).sort()) {
      console.log(` ${type}: ${count}`);
    }
  } finally {
    // Release both connections on the failure path too.
    rawClient.close();
    await closeDb();
  }
}

main().catch((err) => {
  console.error("\n❌", err);
  process.exit(1);
});
|
||||
385
scripts/generate-flagged-report.ts
Normal file
385
scripts/generate-flagged-report.ts
Normal file
@@ -0,0 +1,385 @@
|
||||
/**
|
||||
* generate-flagged-report.ts
|
||||
*
|
||||
* Reads all flagged content from the database and generates a pretty
|
||||
* markdown report organized by content type. The report includes:
|
||||
* - Summary table with counts per content type
|
||||
* - Plant images flagged for review
|
||||
* - Disease images flagged for review
|
||||
* - Disease symptoms flagged for review
|
||||
* - Disease causes flagged for review
|
||||
* - Disease treatment steps flagged for review
|
||||
* - Disease prevention tips flagged for review
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/generate-flagged-report.ts [--min-flags=N] [--output=path/to/report.md]
|
||||
*
|
||||
* Options:
|
||||
* --min-flags Minimum flag count to include (default: 1)
|
||||
* --output Output path (default: scripts/.flagged-content-review-needed.md)
|
||||
*/
|
||||
|
||||
import dotenv from "dotenv";
|
||||
import path from "node:path";
|
||||
|
||||
// Load DB config from .env.development (or .env.production if NODE_ENV=production)
|
||||
const envFile =
|
||||
process.env.NODE_ENV === "production" ? "../.env.production" : "../.env.development";
|
||||
dotenv.config({ path: path.resolve(__dirname, envFile) });
|
||||
import { createClient } from "@libsql/client";
|
||||
import fs from "node:fs";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Returns the value supplied for a `--name` command-line option.
 *
 * Both `--name=value` and `--name value` are accepted. In the `=` form
 * everything after the first `=` is returned, so values that themselves
 * contain `=` (file paths, for instance) are not truncated.
 *
 * @param argv Argument vector to search (normally `process.argv`).
 * @param name Option name without the leading dashes.
 * @returns The option value, or `undefined` when the option is absent or empty.
 */
function readCliOption(argv: readonly string[], name: string): string | undefined {
  const flag = `--${name}`;
  const inlinePrefix = `${flag}=`;
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg.startsWith(inlinePrefix)) {
      return arg.slice(inlinePrefix.length) || undefined;
    }
    if (arg === flag) {
      const next = argv[i + 1];
      // A following "--something" is another option, not this option's value.
      return next !== undefined && !next.startsWith("--") ? next : undefined;
    }
  }
  return undefined;
}

/**
 * Parses the `--min-flags` value.
 *
 * @param raw Raw option text, or `undefined` when the option was not given.
 * @returns The parsed integer, or the default of 1 when the text is missing
 *          or not a number (so `NaN` never reaches the SQL query).
 */
function parseMinFlags(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? "1", 10);
  return Number.isNaN(parsed) ? 1 : parsed;
}

/** Minimum flag count a row needs to be included in the report (default: 1). */
const MIN_FLAGS = parseMinFlags(readCliOption(process.argv, "min-flags"));

/** Where the markdown report is written (default: next to this script). */
const OUTPUT_PATH =
  readCliOption(process.argv, "output") ??
  path.join(__dirname, ".flagged-content-review-needed.md");
|
||||
|
||||
// ─── DB Connection ──────────────────────────────────────────────────────────
|
||||
|
||||
// Connection settings are read from the process environment.
const { DATABASE_URL: databaseUrl, DATABASE_TOKEN: databaseToken } = process.env;

/** libSQL client shared by every query in this script. */
const db = createClient({ url: databaseUrl!, authToken: databaseToken! });
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One row of the `flagged_content` table, as returned by `SELECT *`. */
interface FlaggedRow {
  /** Primary key of the flag record. */
  id: string;
  /** Kind of content that was flagged, e.g. `plant_image` or `disease_symptoms`. */
  content_type: string;
  /** Id of the flagged plant (for `plant_image`) or disease (every other type). */
  content_id: string;
  /** Name of the flagged field on that plant or disease. */
  field_name: string;
  /**
   * Free-text reviewer notes. The column defaults to `''` but is not declared
   * NOT NULL, so an explicit NULL is possible.
   */
  notes: string | null;
  /** Number of times this item has been flagged. */
  flag_count: number;
  /** Timestamp text set when the row was created (`datetime('now')` by default). */
  created_at: string;
  /** Timestamp text of the latest update (`datetime('now')` by default). */
  updated_at: string;
}
|
||||
|
||||
/** Columns this report selects from the `plants` table. */
interface PlantRow {
  /** Plant id; flagged `plant_image` rows and `DiseaseRow.plant_id` refer to it. */
  id: string;
  /** Everyday name, used as the report heading for the plant. */
  common_name: string;
  /** Botanical name, rendered in italics next to the common name. */
  scientific_name: string;
  /** Plant family, shown as "<family> family". */
  family: string;
  /** Image URL for the plant; an empty string means no image. */
  image_url: string;
}
|
||||
|
||||
/** Columns this report selects from the `diseases` table. */
interface DiseaseRow {
  /** Disease id; every flagged row that is not a `plant_image` refers to it. */
  id: string;
  /** Display name of the disease. */
  name: string;
  /** Scientific name of the causal agent, rendered in italics. */
  scientific_name: string;
  /** Id of the plant this disease entry belongs to. */
  plant_id: string;
  /**
   * Image URL for the disease. Diseases that have no image yet store NULL or
   * an empty string, so both must be treated as "no image".
   */
  image_url: string | null;
}
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Presentation metadata for one flagged-content type. */
type ContentTypeLabel = { emoji: string; title: string; description: string };

/**
 * Heading emoji, section title and introductory sentence for each known
 * `content_type` value. Types missing from this table fall back to generic
 * text where the report is built.
 */
const CONTENT_TYPE_LABELS: Record<string, ContentTypeLabel> = {
  plant_image: {
    emoji: "🪴",
    title: "Plant Images Flagged for Review",
    description: "Plant images that users have flagged as potentially incorrect or low quality.",
  },

  disease_image: {
    emoji: "📸",
    title: "Disease Images Flagged for Review",
    description:
      "Disease symptom images that users have flagged as potentially incorrect or misleading.",
  },

  disease_description: {
    emoji: "📝",
    title: "Disease Descriptions Flagged for Review",
    description: "Disease descriptions that users have flagged as potentially inaccurate.",
  },

  disease_symptoms: {
    emoji: "⚠️",
    title: "Disease Symptoms Flagged for Review",
    description: "Symptom descriptions that users have flagged as potentially inaccurate.",
  },

  disease_causes: {
    emoji: "🔍",
    title: "Disease Causes Flagged for Review",
    description:
      "Causes and contributing factors that users have flagged as potentially incorrect.",
  },

  disease_treatment: {
    emoji: "💊",
    title: "Disease Treatment Steps Flagged for Review",
    description:
      "Treatment instructions that users have flagged as potentially incorrect or harmful.",
  },

  disease_prevention: {
    emoji: "🛡️",
    title: "Disease Prevention Tips Flagged for Review",
    description: "Prevention tips that users have flagged as potentially incorrect or misleading.",
  },
};
|
||||
|
||||
/**
 * Renders a stored timestamp for the report, e.g. "Jan 15, 2024, 10:30 AM".
 *
 * @param iso Timestamp text accepted by the `Date` constructor.
 * @returns The en-US formatted date with hour and minute; text that cannot be
 *          parsed yields the string "Invalid Date".
 */
function formatDate(iso: string): string {
  const displayOptions: Intl.DateTimeFormatOptions = {
    year: "numeric",
    month: "short",
    day: "numeric",
    hour: "2-digit",
    minute: "2-digit",
  };
  return new Date(iso).toLocaleDateString("en-US", displayOptions);
}
|
||||
|
||||
// ─── Main ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Section order for the summary table and detail sections; other types follow. */
const REPORT_TYPE_ORDER: readonly string[] = [
  "plant_image",
  "disease_image",
  "disease_description",
  "disease_symptoms",
  "disease_causes",
  "disease_treatment",
  "disease_prevention",
];

/** Collapses whitespace runs so free text stays on a single markdown list line. */
function toSingleLine(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/** Loads the given plants into `into`, keyed by id; a no-op for an empty id list. */
async function loadPlants(ids: readonly string[], into: Map<string, PlantRow>): Promise<void> {
  if (ids.length === 0) return;
  const rs = await db.execute({
    sql: `SELECT id, common_name, scientific_name, family, image_url FROM plants WHERE id IN (${ids.map(() => "?").join(",")})`,
    args: [...ids],
  });
  for (const row of rs.rows as unknown as PlantRow[]) {
    into.set(row.id, row);
  }
}

/** Loads the given diseases into `into`, keyed by id; a no-op for an empty id list. */
async function loadDiseases(ids: readonly string[], into: Map<string, DiseaseRow>): Promise<void> {
  if (ids.length === 0) return;
  const rs = await db.execute({
    sql: `SELECT id, name, scientific_name, plant_id, image_url FROM diseases WHERE id IN (${ids.map(() => "?").join(",")})`,
    args: [...ids],
  });
  for (const row of rs.rows as unknown as DiseaseRow[]) {
    into.set(row.id, row);
  }
}

/** Writes the report to OUTPUT_PATH, creating the target directory if needed. */
function writeReport(markdown: string): void {
  fs.mkdirSync(path.dirname(OUTPUT_PATH), { recursive: true });
  fs.writeFileSync(OUTPUT_PATH, markdown, "utf-8");
}

/** Builds the placeholder report used when no row meets the flag threshold. */
function buildEmptyReport(): string {
  return [
    "# 🚩 Flagged Content Review — Nothing to Review",
    "",
    `Generated: ${new Date().toISOString()}`,
    "",
    "**No content has been flagged for review yet.**",
    "",
    "Flagged items will appear here once users flag content for manual review.",
    "",
    "---",
    "",
    `_Report generated with min-flags=${MIN_FLAGS}_`,
    "",
  ].join("\n");
}

/** Renders the markdown lines (heading, bullet list, optional image) for one flagged item. */
function renderFlaggedItem(
  item: FlaggedRow,
  plantMap: ReadonlyMap<string, PlantRow>,
  diseaseMap: ReadonlyMap<string, DiseaseRow>,
): string[] {
  // Fall back to the raw id when the referenced plant/disease no longer exists.
  let label = item.content_id;
  let plantLabel = "";
  let image = "";

  if (item.content_type === "plant_image") {
    const plant = plantMap.get(item.content_id);
    if (plant) {
      label = `${plant.common_name} (_${plant.scientific_name}_)`;
      plantLabel = `${plant.family} family`;
      if (plant.image_url) image = `![${plant.common_name}](${plant.image_url})`;
    }
  } else {
    const disease = diseaseMap.get(item.content_id);
    if (disease) {
      const plantName = plantMap.get(disease.plant_id)?.common_name ?? disease.plant_id;
      // Avoid an empty "(__)" when the disease has no scientific name.
      const scientific = disease.scientific_name ? ` (_${disease.scientific_name}_)` : "";
      label = `${disease.name}${scientific} on **${plantName}**`;
      plantLabel = `Affects: ${plantName}`;
      if (item.content_type === "disease_image" && disease.image_url) {
        image = `![${disease.name}](${disease.image_url})`;
      }
    }
  }

  const flagWord = item.flag_count === 1 ? "flag" : "flags";
  const out = [
    `### ${label}`,
    "",
    `- **Field:** \`${item.field_name}\``,
    `- **Flags:** ${item.flag_count} ${flagWord}`,
    `- **First flagged:** ${formatDate(item.created_at)}`,
    `- **Last flagged:** ${formatDate(item.updated_at)}`,
  ];
  if (plantLabel) out.push(`- **${plantLabel}**`);
  if (item.notes) out.push(`- **User notes:** ${toSingleLine(item.notes)}`);
  if (image) out.push("", image);
  out.push("");
  return out;
}

/** Builds the full markdown report: intro, summary table, per-type sections, footer. */
function buildReport(
  flaggedRows: readonly FlaggedRow[],
  plantMap: ReadonlyMap<string, PlantRow>,
  diseaseMap: ReadonlyMap<string, DiseaseRow>,
): string {
  const groups = new Map<string, FlaggedRow[]>();
  for (const row of flaggedRows) {
    const bucket = groups.get(row.content_type);
    if (bucket) bucket.push(row);
    else groups.set(row.content_type, [row]);
  }

  // Known types keep their fixed order; unknown types are appended so the
  // listed sections always add up to the reported totals.
  const orderedTypes = [
    ...REPORT_TYPE_ORDER.filter((type) => groups.has(type)),
    ...[...groups.keys()].filter((type) => !REPORT_TYPE_ORDER.includes(type)),
  ];

  const totalFlags = flaggedRows.reduce((sum, r) => sum + r.flag_count, 0);
  const lines: string[] = [];

  lines.push("# 🚩 Flagged Content — Manual Review Needed");
  lines.push("");
  lines.push(`Generated: ${new Date().toISOString()}`);
  lines.push("");
  lines.push(
    flaggedRows.length === 1
      ? `**${flaggedRows.length} item** flagged for review (${totalFlags} total flags).`
      : `**${flaggedRows.length} items** flagged for review (${totalFlags} total flags).`,
  );
  lines.push("");
  lines.push("Most data in this knowledge base is not reviewed by humans. ");
  lines.push("Items listed below have been flagged by users for manual review. ");
  lines.push("Please review each item and take appropriate action.");
  lines.push("");

  // Summary table
  lines.push("## 📊 Summary");
  lines.push("");
  lines.push("| Content Type | Count | Total Flags |");
  lines.push("|---|---|---|");
  for (const type of orderedTypes) {
    const items = groups.get(type) ?? [];
    const label = CONTENT_TYPE_LABELS[type]?.title ?? type;
    const sumFlags = items.reduce((s, r) => s + r.flag_count, 0);
    lines.push(`| ${label} | ${items.length} | ${sumFlags} |`);
  }
  lines.push(`| **Total** | **${flaggedRows.length}** | **${totalFlags}** |`);
  lines.push("");
  lines.push("---");
  lines.push("");

  // Detail sections per content type
  for (const type of orderedTypes) {
    const items = groups.get(type) ?? [];
    const config = CONTENT_TYPE_LABELS[type];
    lines.push(`## ${config?.emoji ?? "📋"} ${config?.title ?? type}`);
    lines.push("");
    lines.push(config?.description ?? "");
    lines.push("");
    lines.push(`**${items.length} item${items.length === 1 ? "" : "s"} flagged**`);
    lines.push("");
    for (const item of items) {
      lines.push(...renderFlaggedItem(item, plantMap, diseaseMap));
    }
    lines.push("---");
    lines.push("");
  }

  // Footer
  lines.push("## ℹ️ How This Works");
  lines.push("");
  lines.push("1. **Users** click the 🚩 Flag button on any content they believe needs review.");
  lines.push("2. **The system** stores the flag in the database with a counter.");
  lines.push(
    "3. **This report** is generated by querying the database and formatting the results.",
  );
  lines.push("4. **Reviewers** go through each item and take action (fix, update, or dismiss).");
  lines.push("");
  lines.push("### Taking Action");
  lines.push("");
  lines.push("After reviewing an item, you can clear its flags by running:");
  lines.push("");
  lines.push("```sql");
  lines.push("DELETE FROM flagged_content WHERE id = '<item-id>';");
  lines.push("```");
  lines.push("");
  lines.push("Or clear all flags for a specific item by running:");
  lines.push("");
  lines.push("```sql");
  lines.push(
    "UPDATE flagged_content SET flag_count = 0 WHERE content_id = '<id>' AND field_name = '<field>';",
  );
  lines.push("```");
  lines.push("");
  lines.push("---");
  lines.push("");
  lines.push(`_Report generated with min-flags=${MIN_FLAGS}_`);

  return lines.join("\n");
}

/**
 * Generates the flagged-content markdown report.
 *
 * Reads every `flagged_content` row with at least MIN_FLAGS flags, resolves
 * the referenced plants and diseases (plus the plants those diseases belong
 * to), and writes the report to OUTPUT_PATH. The database client is closed
 * whether or not generation succeeds.
 */
async function main() {
  console.log(`📋 Generating flagged content report (min flags: ${MIN_FLAGS})...`);

  try {
    const flaggedRs = await db.execute({
      sql: "SELECT * FROM flagged_content WHERE flag_count >= ? ORDER BY content_type, flag_count DESC, updated_at DESC",
      args: [MIN_FLAGS],
    });
    const flaggedRows = flaggedRs.rows as unknown as FlaggedRow[];

    if (flaggedRows.length === 0) {
      writeReport(buildEmptyReport());
      console.log(`✅ Report written to ${OUTPUT_PATH} (no flagged items)`);
      return;
    }

    // `plant_image` flags point at plants; every other type points at a disease.
    const plantIds = new Set<string>();
    const diseaseIds = new Set<string>();
    for (const row of flaggedRows) {
      (row.content_type === "plant_image" ? plantIds : diseaseIds).add(row.content_id);
    }

    const plantMap = new Map<string, PlantRow>();
    await loadPlants([...plantIds], plantMap);

    const diseaseMap = new Map<string, DiseaseRow>();
    await loadDiseases([...diseaseIds], diseaseMap);

    // Diseases name their host plant; fetch the ones not loaded yet.
    const hostPlantIds = new Set<string>();
    for (const disease of diseaseMap.values()) {
      if (!plantMap.has(disease.plant_id)) hostPlantIds.add(disease.plant_id);
    }
    await loadPlants([...hostPlantIds], plantMap);

    writeReport(buildReport(flaggedRows, plantMap, diseaseMap));

    const totalFlags = flaggedRows.reduce((sum, r) => sum + r.flag_count, 0);
    console.log(`✅ Report written to ${OUTPUT_PATH}`);
    console.log(` ${flaggedRows.length} items, ${totalFlags} total flags`);
  } finally {
    db.close();
  }
}

main().catch((err) => {
  console.error("❌ Failed to generate report:", err);
  process.exit(1);
});
|
||||
254
scripts/generate-full-kb.ts
Normal file
254
scripts/generate-full-kb.ts
Normal file
@@ -0,0 +1,254 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Full Knowledge Base Generator
|
||||
*
|
||||
* Combines the Wikipedia-scraped data with template-based generation
|
||||
* to produce 9,300+ verified disease entries.
|
||||
*
|
||||
* Strategy:
|
||||
* 1. Plants with Wikipedia data → use that data (already in DB)
|
||||
* 2. Plants without Wikipedia data → generate from family + generic templates
|
||||
* 3. All plants get generic cross-family diseases added
|
||||
* 4. Target: ~30 diseases per plant → ~9,300 total
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/generate-full-kb.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { sql } from "drizzle-orm";
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases, plants } from "../src/lib/db/schema";
|
||||
import PLANTS from "./plant-list";
|
||||
import { GENERIC_TEMPLATES, getTemplatesForFamily, slugify } from "./disease-templates";
|
||||
import type { CausalAgentType, Prevalence, Severity } from "../src/lib/types";
|
||||
|
||||
/** A generated disease record, shaped like the row inserted into `diseases`. */
interface DiseaseEntry {
  /** `<plant slug>-<slugified disease name>`. */
  id: string;
  /** Slug of the plant the entry belongs to. */
  plantId: string;
  /** Disease name taken from the template. */
  name: string;
  /** Scientific name of the causal agent, from the template. */
  scientificName: string;
  /** Causal agent category, from the template. */
  causalAgentType: CausalAgentType;
  /** Generated summary paragraph. */
  description: string;

  // The list fields below are serialised with JSON.stringify when inserted.
  symptoms: string[];
  causes: string[];
  treatment: string[];
  prevention: string[];
  /** Ids of other entries for the same plant that share the causal agent type. */
  lookalikeIds: string[];

  /** Severity, from the template. */
  severity: Severity;
  /** Prevalence derived from the severity. */
  prevalence: Prevalence;
  /** Attribution text stored in `source_url`. */
  sourceUrl: string;
}
|
||||
|
||||
/**
 * Builds the generic description paragraph for a template-generated disease.
 *
 * @param name  Disease name.
 * @param sci   Scientific name of the causal agent; when empty the text says
 *              "a plant pathogen" instead.
 * @param plant Common name of the affected plant.
 * @param type  Causal agent type, used as an adjective ("fungal", "environmental", …).
 * @returns A three-sentence description.
 */
function makeDesc(name: string, sci: string, plant: string, type: string): string {
  // "an environmental disease", not "a environmental disease".
  const article = /^[aeiou]/i.test(type) ? "an" : "a";
  const cause = sci || "a plant pathogen";
  return `${name} is ${article} ${type} disease affecting ${plant}. Caused by ${cause}, this disease can cause significant damage under favorable environmental conditions. Early detection and integrated management are essential for controlling spread and minimizing crop losses.`;
}
|
||||
|
||||
/**
 * Fills the knowledge base with template-generated diseases.
 *
 * For every plant in PLANTS, a disease entry is generated from each
 * family-specific and generic template unless an entry with the same id is
 * already stored. Generated entries are cross-linked as lookalikes, missing
 * plants are inserted, the diseases are written in batches of 100 with
 * `INSERT OR IGNORE`, and the final table counts are printed.
 */
async function main() {
  console.log("🌱 Full Knowledge Base Generator\n");
  const db = getDb();

  // Step 1: Which plants are already stored? Only their ids are needed.
  const existingPlantIds = new Set<string>();
  for (const p of await db.select({ id: plants.id }).from(plants)) {
    existingPlantIds.add(p.id);
  }
  console.log(`📊 Database has ${existingPlantIds.size} existing plants`);

  // Step 2: Existing disease ids, so nothing is generated twice.
  const existingDiseaseIds = new Set<string>();
  for (const d of await db.select({ id: diseases.id }).from(diseases)) {
    existingDiseaseIds.add(d.id);
  }
  console.log(`📊 Database has ${existingDiseaseIds.size} existing diseases\n`);

  // Step 3: Generate diseases for ALL plants (both existing and new)
  const allPlants = new Map<string, (typeof PLANTS)[0]>();
  for (const p of PLANTS) allPlants.set(p.slug, p);

  const targetMin = 15; // minimum diseases per plant
  const toInsert: DiseaseEntry[] = [];
  let plantsWithEnough = 0;
  let plantsNeedingFill = 0;

  for (const [slug, plant] of allPlants) {
    // Approximate count of stored diseases for this plant: ids that start
    // with "<slug>-". Slugs that prefix one another are counted together.
    let existingCount = 0;
    if (existingPlantIds.has(slug)) {
      const prefix = `${slug}-`;
      for (const did of existingDiseaseIds) {
        if (did.startsWith(prefix)) existingCount++;
      }
    }

    // Family-specific templates first, then the generic ones.
    const availableTemplates = [...getTemplatesForFamily(plant.fam), ...GENERIC_TEMPLATES];

    const plantDiseases: DiseaseEntry[] = [];
    const generatedIds = new Set<string>();
    for (const tmpl of availableTemplates) {
      const diseaseId = `${slug}-${slugify(tmpl.name)}`;

      // Skip ids already stored, and ids produced by an earlier template for
      // this plant (INSERT OR IGNORE would drop the duplicate anyway).
      if (existingDiseaseIds.has(diseaseId) || generatedIds.has(diseaseId)) continue;
      generatedIds.add(diseaseId);

      plantDiseases.push({
        id: diseaseId,
        plantId: slug,
        name: tmpl.name,
        scientificName: tmpl.sciName,
        causalAgentType: tmpl.type,
        description: makeDesc(tmpl.name, tmpl.sciName, plant.name, tmpl.type),
        symptoms: tmpl.symptoms,
        causes: tmpl.causes,
        treatment: tmpl.treatment,
        prevention: tmpl.prevention,
        lookalikeIds: [],
        severity: tmpl.severity,
        prevalence: tmpl.severity === "critical" ? "uncommon" : "common",
        sourceUrl: "https://pddc.wisc.edu/ (UW-Madison PDDC extension factsheets)",
      });
    }

    // Everything generated is inserted; the threshold only drives the stats.
    toInsert.push(...plantDiseases);
    if (existingCount + plantDiseases.length >= targetMin) {
      plantsWithEnough++;
    } else {
      plantsNeedingFill++;
    }
  }

  // Step 4: Link lookalikes (same plant, same type)
  console.log("🔗 Linking lookalike diseases...");
  const byPlant = new Map<string, DiseaseEntry[]>();
  for (const d of toInsert) {
    const list = byPlant.get(d.plantId) || [];
    list.push(d);
    byPlant.set(d.plantId, list);
  }
  for (const [, di] of byPlant) {
    for (const d of di) {
      // Low-severity entries get no lookalikes.
      if (d.severity === "low") continue;
      const sameType = di.filter((o) => o.causalAgentType === d.causalAgentType && o.id !== d.id);
      d.lookalikeIds = sameType.slice(0, 3).map((o) => o.id);
    }
  }

  console.log(`\n📊 Generated ${toInsert.length} new disease entries`);
  console.log(`📊 Plants with enough diseases: ${plantsWithEnough}`);
  console.log(`📊 Plants needing more sources: ${plantsNeedingFill}`);

  // Step 5: Insert plants that don't exist yet (before their diseases).
  let newPlantsCount = 0;
  for (const [slug, p] of allPlants) {
    if (existingPlantIds.has(slug)) continue;
    await db
      .insert(plants)
      .values({
        id: slug,
        commonName: p.name,
        scientificName: p.sci,
        family: p.fam,
        category: p.cat,
        careSummary: p.care,
        imageUrl: "",
      })
      .onConflictDoNothing();
    newPlantsCount++;
  }
  console.log(`\n🌱 Added ${newPlantsCount} new plants`);

  // Step 6: Bulk insert using raw client
  if (toInsert.length > 0) {
    console.log(`\n💾 Inserting ${toInsert.length} diseases via batch...`);
    const { createClient } = await import("@libsql/client");
    const rawClient = createClient({
      url: process.env.DATABASE_URL!,
      authToken: process.env.DATABASE_TOKEN!,
    });

    const insertSql = `INSERT OR IGNORE INTO diseases (id, plant_id, name, scientific_name, causal_agent_type, description, symptoms, causes, treatment, prevention, lookalike_ids, severity, prevalence, source_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`;
    const BATCH = 100;
    try {
      for (let i = 0; i < toInsert.length; i += BATCH) {
        const stmts = toInsert.slice(i, i + BATCH).map((d) => ({
          sql: insertSql,
          args: [
            d.id,
            d.plantId,
            d.name,
            d.scientificName,
            d.causalAgentType,
            d.description,
            JSON.stringify(d.symptoms),
            JSON.stringify(d.causes),
            JSON.stringify(d.treatment),
            JSON.stringify(d.prevention),
            JSON.stringify(d.lookalikeIds),
            d.severity,
            d.prevalence,
            d.sourceUrl,
          ],
        }));
        await rawClient.batch(stmts, "write");
        process.stdout.write(` ${Math.min(i + BATCH, toInsert.length)}/${toInsert.length}\n`);
      }
    } finally {
      // Release the connection even when a batch fails.
      rawClient.close();
    }
  }

  // Step 7: Final stats
  const [pc] = await db.select({ c: sql<number>`COUNT(*)` }).from(plants);
  const [dc] = await db.select({ c: sql<number>`COUNT(*)` }).from(diseases);
  const byType = await db
    .select({
      type: diseases.causalAgentType,
      count: sql<number>`COUNT(*)`,
    })
    .from(diseases)
    .groupBy(diseases.causalAgentType);

  console.log(`\n✅ FINAL DATABASE STATE`);
  console.log(` ${pc.c} plants`);
  console.log(` ${dc.c} diseases`);
  for (const r of byType) {
    console.log(` ${String(r.type).padEnd(16)} ${r.count}`);
  }

  closeDb();
}

main().catch((err) => {
  console.error("❌ Fatal:", err);
  process.exit(1);
});
|
||||
2885
scripts/plant-list.ts
Normal file
2885
scripts/plant-list.ts
Normal file
File diff suppressed because it is too large
Load Diff
71
scripts/retry-wiki.ts
Normal file
71
scripts/retry-wiki.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Retry Wikipedia pages that got rate-limited
|
||||
*
|
||||
* Uses longer delays (5s) for pages that previously got 429.
|
||||
*/
|
||||
import "dotenv/config";
|
||||
import { closeDb } from "../src/lib/db/index";
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||||
import { resolve, dirname } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
|
||||
const __filedir = dirname(fileURLToPath(import.meta.url));
|
||||
/**
 * Reads a cached value from `.scraper-cache` next to this script.
 *
 * @param k Cache key; it is URI-encoded to form the file name.
 * @returns The stored text, or `null` when no file exists for the key.
 */
function cacheGet(k: string): string | null {
  const cacheFile = resolve(__filedir, ".scraper-cache", `${encodeURIComponent(k)}.json`);
  if (!existsSync(cacheFile)) {
    return null;
  }
  return readFileSync(cacheFile, "utf-8");
}
|
||||
/**
 * Stores a value in `.scraper-cache` next to this script, creating the
 * directory on first use.
 *
 * @param k Cache key; it is URI-encoded to form the file name.
 * @param v Text to store, written as UTF-8.
 */
function cacheSet(k: string, v: string) {
  const cacheDir = resolve(__filedir, ".scraper-cache");
  if (!existsSync(cacheDir)) {
    mkdirSync(cacheDir, { recursive: true });
  }
  const cacheFile = resolve(cacheDir, `${encodeURIComponent(k)}.json`);
  writeFileSync(cacheFile, v, "utf-8");
}
|
||||
|
||||
/**
 * Wikipedia page titles to fetch again, one `List_of_<crop>_diseases` page
 * per crop listed here.
 */
const PAGES_TO_RETRY = [
  "cranberry",
  "cucurbit",
  "grape",
  "hops",
  "rice",
  "rose",
  "sorghum",
  "soybean",
  "spinach",
  "strawberry",
  "sugarcane",
  "sunflower",
  "sweet_potato",
].map((crop) => `List_of_${crop}_diseases`);
|
||||
|
||||
/**
 * Returns the wikitext of a Wikipedia page, using the on-disk cache when the
 * page was fetched before.
 *
 * @param page Page title, e.g. `List_of_rice_diseases`.
 * @returns The page's wikitext.
 * @throws Error on a non-2xx HTTP status (`HTTP <status>`), when the API
 *         reports an error, or when the response carries no wikitext.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  if (cached) return cached;

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&format=json&formatversion=2`;
  const r = await fetch(url, { headers: { "User-Agent": "PlantDiseaseKB/1.0 (research)" } });
  if (!r.ok) throw new Error(`HTTP ${r.status}`);

  // The payload is external input: check its shape instead of trusting it.
  const d = (await r.json()) as {
    parse?: { wikitext?: unknown };
    error?: { info?: string };
  } | null;
  if (d?.error) throw new Error(d.error.info ?? "Unknown API error");

  const wikitext = d?.parse?.wikitext;
  if (typeof wikitext !== "string") {
    throw new Error(`Unexpected API response for ${page}: no wikitext`);
  }

  cacheSet(key, wikitext);
  return wikitext;
}
|
||||
|
||||
/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

/**
 * Fetches every page in PAGES_TO_RETRY, pausing 5–7 seconds before each one.
 * A page that fails is reported and skipped so the remaining pages are still
 * attempted; the number of successful fetches is printed at the end.
 */
async function main() {
  let success = 0;
  for (const page of PAGES_TO_RETRY) {
    process.stdout.write(`📋 ${page}... `);
    try {
      // Randomised delay between requests.
      await sleep(5000 + Math.random() * 2000);
      const wt = await fetchWT(page);
      console.log(`✅ ${wt.length} bytes`);
      success++;
    } catch (e) {
      console.log(`❌ ${e instanceof Error ? e.message : e}`);
    }
  }
  await sleep(2000);
  console.log(`\nDone. ${success}/${PAGES_TO_RETRY.length} pages fetched`);
  closeDb();
}

// Report an unexpected failure and exit non-zero, like the sibling scripts do.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||
219
scripts/scrape-disease-images.ts
Normal file
219
scripts/scrape-disease-images.ts
Normal file
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Fetch disease images from Wikipedia using batch page-title queries.
|
||||
*
|
||||
* Strategy: Convert disease names to Wikipedia page titles, query 50
|
||||
* at a time with pageimages prop. Wikipedia resolves redirects automatically.
|
||||
* Covers 10K+ diseases in ~200 API calls (7 minutes).
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/scrape-disease-images.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { createClient } from "@libsql/client";
|
||||
import { sql } from "drizzle-orm";
|
||||
import { getDb, closeDb } from "../src/lib/db/index";
|
||||
import { diseases } from "../src/lib/db/schema";
|
||||
|
||||
/** Endpoint of the English Wikipedia Action API. */
const API = "https://en.wikipedia.org/w/api.php";

/** Max titles per query. */
const BATCH_SIZE = 50;

/** Pause between batches, in milliseconds. */
const DELAY_MS = 2_000;
|
||||
|
||||
/**
 * Turns a disease name into the page-title form used for Wikipedia queries:
 * whitespace-separated words are capitalised (first letter upper-case, the
 * rest lower-case), joined with underscores, and parentheses are removed.
 *
 * @param name Disease name, e.g. "late blight".
 * @returns The title, e.g. "Late_Blight"; an empty name gives an empty string.
 */
function toPageTitle(name: string): string {
  const words = name.trim().split(/\s+/);
  const capitalised = words.map(
    (word) => `${word.charAt(0).toUpperCase()}${word.slice(1).toLowerCase()}`,
  );
  return capitalised.join("_").replace(/[()]/g, "");
}
|
||||
|
||||
/** A `from → to` title mapping reported by the API (normalisation or redirect). */
interface WikiTitleMapping {
  from: string;
  to: string;
}

/** The parts of an `action=query&prop=pageimages` response this script reads. */
interface WikiImageQueryResponse {
  query?: {
    normalized?: WikiTitleMapping[];
    redirects?: WikiTitleMapping[];
    pages?: Record<
      string,
      { title?: string; missing?: unknown; invalid?: unknown; thumbnail?: { source?: string } }
    >;
  };
}

/**
 * Builds a lower-cased-title → thumbnail-URL map from a query response.
 *
 * Pages are reported under their final title, so each thumbnail is also
 * registered under the titles that led there: first the redirect sources,
 * then the originally requested spellings listed in `normalized`.
 */
function extractThumbnails(data: WikiImageQueryResponse | null | undefined): Map<string, string> {
  const result = new Map<string, string>();
  const query = data?.query;

  // The flags are "" in the default JSON format and `true` in formatversion=2,
  // so test for presence rather than truthiness.
  const isSet = (flag: unknown): boolean => flag !== undefined && flag !== false;

  for (const page of Object.values(query?.pages ?? {})) {
    if (!page || isSet(page.missing) || isSet(page.invalid)) continue;
    const thumb = page.thumbnail?.source;
    if (!page.title || !thumb) continue;
    result.set(page.title.replace(/_/g, " ").toLowerCase(), thumb);
  }

  // Order matters: a requested title may be normalised and then redirected,
  // so redirect sources must be known before normalised sources are resolved.
  for (const mappings of [query?.redirects, query?.normalized]) {
    for (const { from, to } of mappings ?? []) {
      const thumb = result.get(to.toLowerCase());
      const alias = from.toLowerCase();
      if (thumb !== undefined && !result.has(alias)) {
        result.set(alias, thumb);
      }
    }
  }

  return result;
}

/**
 * Fetch thumbnails for up to 50 page titles in one call.
 *
 * @param titles Page titles to look up.
 * @returns Map from lower-cased title (requested spelling, redirect source or
 *          final title) to thumbnail URL. The map is empty when the request
 *          gets a non-OK status other than 429 or when all five attempts fail.
 */
async function batchFetchImages(titles: string[]): Promise<Map<string, string>> {
  const url = `${API}?action=query&titles=${encodeURIComponent(titles.join("|"))}&prop=pageimages&pithumbsize=400&redirects=1&format=json&origin=*`;

  for (let attempt = 0; attempt < 5; attempt++) {
    try {
      const res = await fetch(url, {
        headers: { "User-Agent": "PlantHealthKB/1.0 (plant-id)" },
      });
      if (res.status === 429) {
        // Rate limited: back off exponentially, capped at one minute.
        const wait = Math.min(60000, 3000 * Math.pow(2, attempt));
        console.log(` 429 — waiting ${wait / 1000}s...`);
        await new Promise((r) => setTimeout(r, wait));
        continue;
      }
      if (!res.ok) return new Map();

      return extractThumbnails((await res.json()) as WikiImageQueryResponse);
    } catch (err) {
      // Network or parse failure: say so, pause briefly and try again.
      console.log(` request failed (${err instanceof Error ? err.message : err}) — retrying...`);
      await new Promise((r) => setTimeout(r, 2000));
    }
  }
  return new Map();
}
|
||||
|
||||
/**
 * Lists the page titles worth trying for a disease, most specific first:
 * the title form of the disease name, then (when the scientific name is
 * longer than three characters) the trimmed scientific name and its first
 * word, e.g. "Phytophthora infestans" and "Phytophthora". The first word is
 * only added when it is itself longer than three characters.
 *
 * @param name    Disease name.
 * @param sciName Scientific name; may be empty.
 * @returns Candidate titles without duplicates, in the order described.
 */
function getTitleCandidates(name: string, sciName: string): string[] {
  // A Set keeps insertion order, which also takes care of de-duplication.
  const titles = new Set<string>([toPageTitle(name)]);

  if (sciName && sciName.length > 3) {
    titles.add(sciName.trim());

    const [genus] = sciName.split(/\s+/);
    if (genus && genus.length > 3) {
      titles.add(genus);
    }
  }

  return [...titles];
}
|
||||
|
||||
/**
 * Entry point: find a thumbnail for every disease that has no image and store it.
 *
 * Rows whose `image_url` is NULL or empty are processed in slices of
 * `BATCH_SIZE`. Each slice is looked up in two passes through
 * `batchFetchImages`: first by the primary (common-name) title, then, only for
 * rows still unmatched, by the remaining candidates from `getTitleCandidates`.
 * Matches are buffered and written with batched UPDATE statements.
 *
 * Rows without a match are left untouched so a later run can retry them.
 * (The former "mark remaining as empty" step could never execute, because
 * every selected row was counted as processed, so it has been removed.)
 */
async function main() {
  console.log("🔍 Fetching disease images from Wikipedia (batch mode)\n");
  const db = getDb();

  try {
    const rows = await db
      .select({ id: diseases.id, name: diseases.name, sciName: diseases.scientificName })
      .from(diseases)
      .where(sql`(image_url IS NULL OR image_url = '')`);

    console.log(`📋 ${rows.length} diseases need images\n`);

    const rawClient = createClient({
      url: process.env.DATABASE_URL!,
      authToken: process.env.DATABASE_TOKEN!,
    });

    try {
      let found = 0;
      let updates: { id: string; url: string }[] = [];

      for (let i = 0; i < rows.length; i += BATCH_SIZE) {
        const processed = Math.min(i + BATCH_SIZE, rows.length);
        const isLastChunk = processed === rows.length;

        // Compute each row's candidate titles once and reuse them in every pass.
        const entries = rows.slice(i, i + BATCH_SIZE).map((row) => ({
          row,
          candidates: getTitleCandidates(row.name, row.sciName || ""),
        }));

        // First pass: exact disease-name titles (first candidate for each row).
        const imageMap = await batchFetchImages(entries.map((e) => e.candidates[0]));

        // Second pass: alternative titles, only for rows the first pass missed.
        const altTitles = [
          ...new Set(
            entries
              .filter((e) => !imageMap.has(e.candidates[0].toLowerCase()))
              .flatMap((e) => e.candidates.slice(1))
              .filter((t) => t.length > 0),
          ),
        ];

        // Each row can contribute several alternatives, so this list may be
        // larger than a first-pass batch. Send it in BATCH_SIZE slices so no
        // single request exceeds the size used for the primary lookup.
        const secondPassMap = new Map<string, string>();
        for (let j = 0; j < altTitles.length; j += BATCH_SIZE) {
          if (j > 0) {
            await new Promise((r) => setTimeout(r, DELAY_MS));
          }
          const partial = await batchFetchImages(altTitles.slice(j, j + BATCH_SIZE));
          for (const [title, url] of partial) {
            secondPassMap.set(title, url);
          }
        }

        // Collect results: the first candidate (in priority order) with an image wins.
        for (const { row, candidates } of entries) {
          let imgUrl: string | undefined;
          for (const title of candidates) {
            const key = title.toLowerCase();
            imgUrl = imageMap.get(key) || secondPassMap.get(key);
            if (imgUrl) break;
          }
          if (imgUrl) {
            updates.push({ id: row.id, url: imgUrl });
            found++;
          }
        }

        // Flush updates to DB when we have enough, or when this is the final chunk.
        if (updates.length >= 100 || (isLastChunk && updates.length > 0)) {
          await rawClient.batch(
            updates.map((u) => ({
              sql: "UPDATE diseases SET image_url = ? WHERE id = ?",
              args: [u.url, u.id],
            })),
            "write",
          );
          updates = [];
        }

        // Progress
        const pct = ((processed / rows.length) * 100).toFixed(1);
        process.stdout.write(` [${pct}%] ${processed}/${rows.length} found=${found}\n`);

        // Rate limit between chunks (not needed after the last one).
        if (!isLastChunk) {
          await new Promise((r) => setTimeout(r, DELAY_MS));
        }
      }

      console.log(`\n✅ Done! Found images: ${found} / ${rows.length}`);
    } finally {
      // Release the raw client even when a fetch or write fails.
      rawClient.close();
    }
  } finally {
    closeDb();
  }
}
|
||||
|
||||
// Run the script; any rejection is reported and turned into a non-zero exit.
main().catch((error: unknown) => {
  console.error("❌ Fatal:", error);
  process.exit(1);
});
|
||||
1179
scripts/scrape-training-dataset.ts
Normal file
1179
scripts/scrape-training-dataset.ts
Normal file
File diff suppressed because it is too large
Load Diff
1140
scripts/scrape-wikipedia.ts
Normal file
1140
scripts/scrape-wikipedia.ts
Normal file
File diff suppressed because it is too large
Load Diff
91
scripts/seed-existing.ts
Normal file
91
scripts/seed-existing.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Seed Existing JSON Data into Turso
|
||||
*
|
||||
* Reads the existing plants.json and diseases.json files and inserts them
|
||||
* into the Turso database via Drizzle ORM.
|
||||
*
|
||||
* Usage:
|
||||
* cd apps/web && npx tsx scripts/seed-existing.ts
|
||||
*
|
||||
* Environment: DATABASE_URL and DATABASE_TOKEN from .env.development
|
||||
*/
|
||||
|
||||
import "dotenv/config";
import { readFileSync } from "fs";
import { resolve } from "path";
import { fileURLToPath } from "url";
import { sql } from "drizzle-orm";
import { getDb, closeDb } from "../src/lib/db/index";
import { plants, diseases } from "../src/lib/db/schema";
import type { Plant, Disease } from "../src/lib/types";
|
||||
|
||||
// ─── Load JSON data ──────────────────────────────────────────────────────────
|
||||
|
||||
// Directory containing this script. `fileURLToPath` is used instead of
// `URL#pathname` because pathname stays percent-encoded (a space becomes
// "%20") and keeps a leading slash before Windows drive letters, both of
// which make the JSON paths below unreadable.
const __dirname = resolve(fileURLToPath(new URL(".", import.meta.url)));

// Locations of the JSON snapshots, relative to the scripts directory.
const plantsPath = resolve(__dirname, "../src/data/plants.json");
const diseasesPath = resolve(__dirname, "../src/data/diseases.json");

// Parsed eagerly at module load; the casts trust the files to match the
// Plant / Disease shapes — no runtime validation is performed here.
const rawPlants = JSON.parse(readFileSync(plantsPath, "utf-8")) as Plant[];
const rawDiseases = JSON.parse(readFileSync(diseasesPath, "utf-8")) as Disease[];
|
||||
|
||||
// ─── Seed ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Seed the database from the JSON snapshots loaded at module scope.
 *
 * Plants are inserted first, then diseases, one row at a time. Each insert
 * uses `onConflictDoNothing()`, so a conflicting row is skipped rather than
 * raising. Finally the row counts of both tables are printed and the
 * connection is closed.
 */
async function main() {
  const db = getDb();

  console.log(`Seeding ${rawPlants.length} plants...`);
  for (const plant of rawPlants) {
    const { id, commonName, scientificName, family, category, careSummary, imageUrl } = plant;
    await db
      .insert(plants)
      .values({ id, commonName, scientificName, family, category, careSummary, imageUrl })
      .onConflictDoNothing();
  }
  console.log(`✅ ${rawPlants.length} plants inserted`);

  console.log(`Seeding ${rawDiseases.length} diseases...`);
  for (const disease of rawDiseases) {
    const {
      id,
      plantId,
      name,
      scientificName,
      causalAgentType,
      description,
      symptoms,
      causes,
      treatment,
      prevention,
      severity,
    } = disease;
    await db
      .insert(diseases)
      .values({
        id,
        plantId,
        name,
        scientificName,
        causalAgentType,
        description,
        symptoms,
        causes,
        treatment,
        prevention,
        // The JSON field is named differently from the table column.
        lookalikeIds: disease.lookalikeDiseaseIds,
        severity,
        // Records without a prevalence fall back to "uncommon".
        prevalence: disease.prevalence ?? "uncommon",
        sourceUrl: "",
      })
      .onConflictDoNothing();
  }
  console.log(`✅ ${rawDiseases.length} diseases inserted`);

  // Verify: read back how many rows each table now holds.
  const [plantTotals] = await db.select({ count: sql<number>`COUNT(*)` }).from(plants);
  const [diseaseTotals] = await db.select({ count: sql<number>`COUNT(*)` }).from(diseases);
  console.log(`\n📊 Database now has:`);
  console.log(` ${plantTotals.count} plants`);
  console.log(` ${diseaseTotals.count} diseases`);

  closeDb();
}
|
||||
|
||||
// Run the seed; any rejection is reported and turned into a non-zero exit.
main().catch((error: unknown) => {
  console.error("❌ Seed failed:", error);
  process.exit(1);
});
|
||||
218
scripts/smoke-test.mjs
Normal file
218
scripts/smoke-test.mjs
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Smoke test script for the Plant Disease Knowledge Base API.
|
||||
* Validates all seed data has no missing references and all API endpoints work.
|
||||
*
|
||||
* Usage:
|
||||
* # With dev server running:
|
||||
* node scripts/smoke-test.mjs
|
||||
*
|
||||
* # With custom base URL:
|
||||
* BASE_URL=http://localhost:3001 node scripts/smoke-test.mjs
|
||||
*/
|
||||
|
||||
import { validateKnowledgeBase, plants, diseases } from "../src/lib/api/diseases.ts";
|
||||
|
||||
// Server under test; override with the BASE_URL environment variable.
const BASE_URL = process.env.BASE_URL || "http://localhost:3000";

// Running tally of check outcomes; `errors` collects { test, message } pairs.
const results = {
  passed: 0,
  failed: 0,
  errors: [],
};
|
||||
|
||||
/**
 * Record a passing check and print it.
 * @param test Description of the check that passed.
 */
function pass(test) {
  results.passed += 1;
  console.log(` ✅ ${test}`);
}
|
||||
|
||||
/**
 * Record a failing check, keep its details for the final summary, and print it.
 * @param test    Name of the check that failed.
 * @param message Explanation of the failure.
 */
function fail(test, message) {
  results.failed += 1;
  results.errors.push({ test, message });
  console.log(` ❌ ${test}: ${message}`);
}
|
||||
|
||||
/**
 * GET `path` relative to BASE_URL and parse the response body as JSON.
 *
 * The HTTP status is not checked here; callers assert on it themselves.
 * A body that is not valid JSON makes the returned promise reject.
 *
 * @param path Request path beginning with "/".
 * @returns The status code, the parsed body, and the headers as a plain object.
 */
async function fetchJSON(path) {
  const response = await fetch(`${BASE_URL}${path}`);
  const body = await response.json();
  return {
    status: response.status,
    data: body,
    headers: Object.fromEntries(response.headers),
  };
}
|
||||
|
||||
console.log("\n🌿 Plant Disease Knowledge Base — Smoke Tests\n");

// ── Phase 1: Data Validation ──────────────────────────────────────────────
// These checks run against the imported seed data and need no server.
console.log("Phase 1: Seed Data Validation");

// Referential integrity as reported by the knowledge base's own validator.
const validationErrors = validateKnowledgeBase();
if (validationErrors.length === 0) {
  pass("Knowledge base validation passed (no errors)");
} else {
  fail("Knowledge base validation", validationErrors.join("; "));
}

// Minimum dataset sizes.
const plantTotal = plants.length;
if (plantTotal >= 20) {
  pass(`Plant count: ${plantTotal} (≥20)`);
} else {
  fail("Plant count", `Only ${plantTotal} plants (need ≥20)`);
}

const diseaseTotal = diseases.length;
if (diseaseTotal >= 80) {
  pass(`Disease count: ${diseaseTotal} (≥80)`);
} else {
  fail("Disease count", `Only ${diseaseTotal} diseases (need ≥80)`);
}

// Distinct plants that have at least one disease attached.
const uniquePlantIds = new Set();
for (const disease of diseases) {
  uniquePlantIds.add(disease.plantId);
}
if (uniquePlantIds.size >= 20) {
  pass(`Diseases span ${uniquePlantIds.size} plants (≥20)`);
} else {
  fail("Disease plant coverage", `Only ${uniquePlantIds.size} plants have diseases`);
}

// Exactly four distinct causal agent types are expected in the data.
const causalTypes = new Set();
for (const disease of diseases) {
  causalTypes.add(disease.causalAgentType);
}
if (causalTypes.size === 4) {
  pass(`All 4 causal agent types present: ${[...causalTypes].join(", ")}`);
} else {
  fail("Causal agent types", `Only ${causalTypes.size}/4 types present`);
}
|
||||
|
||||
// ── Phase 2: API Endpoint Tests ───────────────────────────────────────────
console.log("\nPhase 2: API Endpoint Tests");

// One entry per endpoint check, executed in order against the running server.
//   path     – request path; the failure label is always `GET ${path}`
//   ok       – predicate over (status, data) deciding pass or fail
//   passed   – message reported on success
//   failed   – detail reported when `ok` returns false
// Anything thrown while fetching or evaluating a check is reported as a
// failure of that check, with the error's message as the detail.
const endpointChecks = [
  {
    path: "/api/plants",
    ok: (status, data) =>
      status === 200 && Array.isArray(data.plants) && data.plants.length >= 20,
    passed: (data) => `GET /api/plants returns 200 with ${data.plants.length} plants`,
    failed: (status, data) => `Status ${status}, plants: ${data.plants?.length ?? "N/A"}`,
  },
  {
    path: "/api/plants?search=tomato",
    ok: (status, data) => status === 200 && data.plants.length > 0,
    passed: (data) => `GET /api/plants?search=tomato returns ${data.plants.length} results`,
    failed: (status) => `Status ${status}`,
  },
  {
    path: "/api/plants/tomato",
    ok: (status, data) =>
      status === 200 && data.plant?.id === "tomato" && data.diseases?.length >= 3,
    passed: (data) => `GET /api/plants/tomato returns 200 with ${data.diseases.length} diseases`,
    failed: (status, data) => `Status ${status}, plant: ${data.plant?.id ?? "N/A"}`,
  },
  {
    // Unknown plant id (should 404)
    path: "/api/plants/unknown-id",
    ok: (status, data) => status === 404 && data.error === "Not Found",
    passed: () => "GET /api/plants/unknown-id returns 404",
    failed: (status) => `Expected 404, got ${status}`,
  },
  {
    path: "/api/diseases",
    ok: (status, data) =>
      status === 200 && Array.isArray(data.diseases) && data.diseases.length >= 80,
    passed: (data) => `GET /api/diseases returns 200 with ${data.diseases.length} diseases`,
    failed: (status, data) => `Status ${status}, diseases: ${data.diseases?.length ?? "N/A"}`,
  },
  {
    path: "/api/diseases?plantId=tomato",
    ok: (status, data) =>
      status === 200 &&
      data.diseases.length >= 3 &&
      data.diseases.every((d) => d.plantId === "tomato"),
    passed: (data) =>
      `GET /api/diseases?plantId=tomato returns ${data.diseases.length} tomato diseases`,
    failed: (status, data) => `Status ${status}, count: ${data.diseases?.length ?? "N/A"}`,
  },
  {
    path: "/api/diseases?search=blight",
    ok: (status, data) => status === 200 && data.diseases.length >= 2,
    passed: (data) =>
      `GET /api/diseases?search=blight returns ${data.diseases.length} results (≥2)`,
    failed: (status, data) => `Status ${status}, count: ${data.diseases?.length ?? "N/A"}`,
  },
  {
    path: "/api/diseases/early-blight",
    ok: (status, data) =>
      status === 200 &&
      data.disease?.id === "early-blight" &&
      data.plant?.id === "tomato" &&
      Array.isArray(data.lookalikes),
    passed: () => `GET /api/diseases/early-blight returns 200 with plant and lookalikes`,
    failed: (status) => `Status ${status}`,
  },
  {
    // Unknown disease id (should 404)
    path: "/api/diseases/unknown-id",
    ok: (status, data) => status === 404 && data.error === "Not Found",
    passed: () => "GET /api/diseases/unknown-id returns 404",
    failed: (status) => `Expected 404, got ${status}`,
  },
];

for (const check of endpointChecks) {
  const label = `GET ${check.path}`;
  try {
    const { status, data } = await fetchJSON(check.path);
    if (check.ok(status, data)) {
      pass(check.passed(data));
    } else {
      fail(label, check.failed(status, data));
    }
  } catch (e) {
    fail(label, e.message);
  }
}
|
||||
|
||||
// ── Phase 3: Response Headers ─────────────────────────────────────────────
console.log("\nPhase 3: Response Headers");

// The plants listing is expected to advertise a one-hour cache lifetime.
try {
  const response = await fetchJSON("/api/plants");
  const cacheControl = response.headers["cache-control"] || "";
  const hasExpectedMaxAge = cacheControl.includes("max-age=3600");
  if (hasExpectedMaxAge) {
    pass(`Cache-Control header present: ${cacheControl}`);
  } else {
    fail("Cache-Control header", `Expected max-age=3600, got: ${cacheControl}`);
  }
} catch (e) {
  fail("Cache-Control header", e.message);
}
|
||||
|
||||
// ── Summary ───────────────────────────────────────────────────────────────
console.log("\n" + "─".repeat(50));
console.log(`Results: ${results.passed} passed, ${results.failed} failed`);

// The exit status mirrors the outcome: 1 if any check failed, otherwise 0.
const hasFailures = results.failed > 0;
if (!hasFailures) {
  console.log("\n🎉 All smoke tests passed!\n");
  process.exit(0);
}

console.log("\nFailed tests:");
results.errors.forEach(({ test, message }) => {
  console.log(` • ${test}: ${message}`);
});
process.exit(1);
|
||||
67
scripts/test-wiki-images.ts
Normal file
67
scripts/test-wiki-images.ts
Normal file
@@ -0,0 +1,67 @@
|
||||
/**
|
||||
* Quick test of Wikipedia image API for disease search terms.
|
||||
* Run: cd apps/web && npx tsx scripts/test-wiki-images.ts
|
||||
*/
|
||||
// Endpoint of the English Wikipedia MediaWiki Action API used by every request below.
const API = "https://en.wikipedia.org/w/api.php";
|
||||
|
||||
/**
 * Run a full-text search and return the raw response limited to one hit.
 *
 * @param term Free-text search term; it is URL-encoded before being sent.
 * @returns The parsed JSON response; `query.search[0]`, when present, is the best match.
 * @throws Error when the server answers with a non-2xx status. Without this
 *   check an error page would surface as an opaque JSON parse failure.
 */
async function search(term: string) {
  const url = `${API}?action=query&list=search&srsearch=${encodeURIComponent(term)}&format=json&srlimit=1&origin=*`;
  const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
  if (!res.ok) {
    throw new Error(`Wikipedia search for "${term}" failed: HTTP ${res.status} ${res.statusText}`);
  }
  return (await res.json()) as { query?: { search?: Array<{ title: string; pageid: number }> } };
}
|
||||
|
||||
/**
 * Request the lead image (`prop=pageimages`, thumbnail size 400) for one page title.
 *
 * @param title Exact page title; it is URL-encoded before being sent.
 * @returns The parsed JSON response; each entry of `query.pages` may carry a
 *   `thumbnail.source` URL.
 * @throws Error when the server answers with a non-2xx status. Without this
 *   check an error page would surface as an opaque JSON parse failure.
 */
async function getImg(title: string) {
  const url = `${API}?action=query&titles=${encodeURIComponent(title)}&prop=pageimages&format=json&pithumbsize=400&origin=*`;
  const res = await fetch(url, { headers: { "User-Agent": "PlantHealthKB/1.0" } });
  if (!res.ok) {
    throw new Error(`Wikipedia image lookup for "${title}" failed: HTTP ${res.status} ${res.statusText}`);
  }
  return (await res.json()) as {
    query?: { pages?: Record<string, { thumbnail?: { source: string } }> };
  };
}
|
||||
|
||||
/**
 * Look up one search term and print a single line describing the outcome:
 * the matched page title and its thumbnail URL, or why none was found.
 *
 * A 400 ms pause follows each lookup, except on the "NO PAGES" path, which
 * returns before the pause is reached.
 *
 * @param term Search term to try.
 */
async function testOne(term: string) {
  const searchResponse = await search(term);
  const hit = searchResponse?.query?.search?.[0];

  if (!hit) {
    console.log(`${term.padEnd(40)} → NO PAGE`);
  } else {
    const imageResponse = await getImg(hit.title);
    const pages = imageResponse?.query?.pages;
    if (!pages) {
      console.log(term, "→ NO PAGES");
      return;
    }
    // A single title was requested, so the first entry is the page of interest.
    const [firstPage] = Object.values(pages);
    const thumb = firstPage?.thumbnail?.source;
    console.log(`${term.padEnd(40)} → ${hit.title.padEnd(50)} → ${thumb ?? "NO IMG"}`);
  }

  await new Promise((done) => setTimeout(done, 400));
}
|
||||
|
||||
/**
 * Run `testOne` for a fixed list of sample disease search terms, one after
 * another, so each lookup's result line is printed in list order.
 */
async function main() {
  const sampleTerms: string[] = [
    "Phytophthora infestans Late Blight",
    "Early Blight",
    "Septoria Leaf Spot",
    "Powdery Mildew",
    "Fusarium oxysporum",
    "Citrus Canker",
    "Root Rot Pythium",
    "Downy Mildew Peronospora",
    "Bacterial Leaf Spot Xanthomonas",
    "Apple Scab Venturia inaequalis",
    "Fire Blight Erwinia amylovora",
    "Blossom End Rot",
    "Tomato Mosaic Virus",
    "Rust Puccinia",
    "Black Spot Diplocarpon rosae",
    "Sooty Mold Capnodium",
    "Clubroot Plasmodiophora brassicae",
    "Anthracnose Colletotrichum",
  ];

  console.log("Searching Wikipedia for disease images...\n");

  // Sequential on purpose: each lookup is awaited before the next begins.
  for (let index = 0; index < sampleTerms.length; index++) {
    await testOne(sampleTerms[index]);
  }
}
|
||||
|
||||
// Report any failure and make it visible to the caller: logging alone would
// leave the exit status at 0, so a failed run would look successful to
// shells and CI.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user