#!/usr/bin/env node
/**
 * scrape-training-dataset.ts
 *
 * Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
 *
 * Targets (tiered by plant type):
 *   - Core plants (houseplants + common garden): 100 images per disease
 *   - Full set (all 11,498 DB diseases): 10 images per disease
 *   - Healthy: 400 images
 *
 * Sources (all free, no API keys):
 *   1. DB image_url — existing images already found
 *   2. DuckDuckGo — general web image search
 *   3. iNaturalist — real-world plant observation photos
 *   4. Wikimedia Commons — curated scientific/educational images
 *
 * Usage:    cd apps/web && npx tsx scripts/scrape-training-dataset.ts
 * Progress: data/dataset/.progress.json — interrupt and resume safely.
 */

import "dotenv/config";
import { readFileSync, writeFileSync, renameSync, existsSync, mkdirSync } from "fs";
import { resolve, extname } from "path";

// Load .env.development for DB creds. Values already present in the
// environment win over values from the file.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    if (trimmed && !trimmed.startsWith("#")) {
      const eqIdx = trimmed.indexOf("=");
      if (eqIdx > 0) {
        const key = trimmed.slice(0, eqIdx).trim();
        const val = trimmed.slice(eqIdx + 1).trim();
        if (!process.env[key]) process.env[key] = val;
      }
    }
  }
} catch {
  // Best-effort: the file is optional, so a missing/unreadable file is ignored.
}

import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";

// ─── Config ─────────────────────────────────────────────────────────────────

const DATASET_DIR = resolve(__dirname, "../data/dataset");
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

/** Target images per disease for CORE plants */
const TARGET_CORE = 100;

/** Target images per disease for the FULL set */
const TARGET_FULL = 10;

/** Target images for the "healthy" class */
const TARGET_HEALTHY = 400;

/** Core plants that get higher image targets */
const CORE_PLANTS = new Set<string>([
  // Houseplants
  "monstera",
  "pothos",
  "snake-plant",
  "peace-lily",
  "orchid",
  "succulent",
  "fiddle-leaf-fig",
  "aloe-vera",
  "cactus",
  "fern",
  // Garden plants
  "tomato",
  "basil",
  "rose",
  "pepper",
  "strawberry",
  "cucumber",
  "squash",
  "lettuce",
  "spinach",
  "cabbage",
  "lavender",
  "mint",
  "jasmine",
  "sunflower",
  "daisy",
  "zucchini",
  "bean",
  "eggplant",
  "chili",
  // General disease patterns
  "general",
]);

/** Delay between DuckDuckGo search API calls (ms) */
const SEARCH_DELAY = 1500;

/** Delay between image downloads (ms) */
const DOWNLOAD_DELAY = 100;

/** Max concurrent downloads */
const CONCURRENT_DOWNLOADS = 10;

/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

/** Allowed file extensions */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User agent for requests */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";

/** How many times a rate-limited (HTTP 429) DuckDuckGo request is retried */
const MAX_RATE_LIMIT_RETRIES = 3;

/** Matches the size segment of an iNaturalist photo URL, e.g. ".../medium.jpg" */
const INAT_SIZE_SEGMENT = /\/(?:square|small|medium|large)\./;

// ─── Types ──────────────────────────────────────────────────────────────────

interface DbDisease {
  id: string;
  plantId: string;
  name: string;
  imageUrl: string | null;
}

interface DuckDuckGoImageResult {
  image: string;
  title: string;
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}

interface ClassProgress {
  count: number;
  downloaded: number;
  failed: number;
  seenUrls: string[];
  exhausted: boolean;
}

interface Progress {
  lastUpdated: string;
  classes: Record<string, ClassProgress>;
}

/** Shape returned by every image-URL collector. */
interface CollectResult {
  urls: string[];
  exhausted: boolean;
}

// ─── DB Loading ──────────────────────────────────────────────────────────────

/**
 * Load all diseases from the database with their existing image URLs.
 *
 * @returns Rows ordered by disease id.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const db = getDb();
  const rows = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
      imageUrl: diseases.imageUrl,
    })
    .from(diseases)
    .orderBy(diseases.id);
  return rows;
}

// ─── DuckDuckGo API ─────────────────────────────────────────────────────────

/**
 * Fetch the DuckDuckGo search page for `query` and extract the `vqd` token
 * that the image endpoint requires.
 *
 * @throws Error when the page request fails or no token is found in the HTML.
 */
async function getVqdToken(query: string): Promise<string> {
  const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const res = await fetch(url, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);

  const html = await res.text();
  const token = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/)?.[1];
  if (!token) throw new Error(`Could not extract vqd token for "${query}"`);
  return token;
}

/**
 * Request one page of DuckDuckGo image results.
 *
 * @param query       Search text.
 * @param vqd         Token obtained from {@link getVqdToken}.
 * @param page        Value sent as the `p` query parameter.
 * @param retriesLeft Remaining retries after an HTTP 429; once it reaches 0
 *                    the 429 is reported as an error instead of retrying
 *                    forever.
 * @returns The result list (empty on HTTP 403 or when the payload has none).
 * @throws Error on any other non-OK status.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
  retriesLeft: number = MAX_RATE_LIMIT_RETRIES,
): Promise<DuckDuckGoImageResult[]> {
  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(query)}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
  const res = await fetch(url, {
    headers: {
      "User-Agent": UA,
      Accept: "application/json",
      Referer: `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`,
    },
    signal: AbortSignal.timeout(15_000),
  });

  if (!res.ok) {
    if (res.status === 429 && retriesLeft > 0) {
      console.warn(" ⚠ Rate limited (429). Waiting 10s...");
      await sleep(10_000);
      return searchImagesDuckDuckGo(query, vqd, page, retriesLeft - 1);
    }
    if (res.status === 403) return [];
    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }

  const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
  return data.results ?? [];
}

/**
 * Collect up to `target` previously unseen image URLs for `query` from
 * DuckDuckGo, walking at most five result pages.
 *
 * @param seenUrls Mutated: every accepted URL is added to it.
 * @returns `exhausted` is true when the token could not be obtained or three
 *          consecutive pages came back empty.
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const results: string[] = [];
  let page = 1;
  let exhausted = false;
  let consecutiveEmpty = 0;

  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  const MAX_PAGES = 5;
  let lowNoveltyCount = 0;

  while (results.length < target && page <= MAX_PAGES) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(` ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    if (!pageResults || pageResults.length === 0) {
      consecutiveEmpty++;
      if (consecutiveEmpty >= 3) {
        exhausted = true;
        break;
      }
      page++;
      continue;
    }
    consecutiveEmpty = 0;

    let newCount = 0;
    for (const r of pageResults) {
      if (results.length >= target) break;
      const imgUrl = r.image || r.url;
      if (!imgUrl || typeof imgUrl !== "string") continue;
      if (seenUrls.has(imgUrl)) continue;
      try {
        new URL(imgUrl); // reject anything that is not a parseable absolute URL
      } catch {
        continue;
      }
      seenUrls.add(imgUrl);
      results.push(imgUrl);
      newCount++;
    }

    // Stop early when two pages in a row yield almost nothing new.
    const newRatio = newCount / pageResults.length;
    if (newRatio < 0.05) {
      lowNoveltyCount++;
      if (lowNoveltyCount >= 2) break;
    } else {
      lowNoveltyCount = 0;
    }

    if (results.length < target) page++;
  }

  return { urls: results.slice(0, target), exhausted };
}

// ─── iNaturalist API ─────────────────────────────────────────────────────────

/**
 * Collect up to `target` photo URLs from research-grade iNaturalist
 * observations matching `query` (single request, at most 200 observations).
 *
 * Each photo URL is rewritten to its "original" size variant. De-duplication
 * is done on the rewritten URL, because that is the form stored in `seenUrls`.
 *
 * @param seenUrls Mutated: every accepted (rewritten) URL is added to it.
 * @returns `exhausted` is true when the request succeeded but yielded fewer
 *          than `target` URLs; request failures report `exhausted: false`.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);
  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        const url = photo.url;
        if (!url) continue;
        // Normalise whichever size variant the API returned to the full-size file.
        const fullUrl = url.replace(INAT_SIZE_SEGMENT, "/original.");
        if (seenUrls.has(url) || seenUrls.has(fullUrl)) continue;
        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network/parse failure: keep whatever was gathered, allow a later retry.
    return { urls: results, exhausted: false };
  }
}

// ─── Wikimedia Commons API ──────────────────────────────────────────────────

/**
 * Collect up to `target` file URLs from Wikimedia Commons by searching the
 * File namespace (6) for `query`, 50 hits per request, until the target is
 * met, a request fails, or a page returns no hits.
 *
 * @param seenUrls Mutated: every accepted URL is added to it.
 * @returns `exhausted` is true when fewer than `target` URLs were gathered.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const results: string[] = [];
  let sroffset = 0;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
      origin: "*", // server-side API call
    });
    const url = `https://commons.wikimedia.org/w/api.php?${params.toString()}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };
      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // Prefer the server-provided continuation offset; otherwise advance manually.
      sroffset = data.continue?.sroffset ?? sroffset + hits.length;
    } catch {
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}

// ─── Image Download ─────────────────────────────────────────────────────────

/**
 * Download one image and write it to disk.
 *
 * The response is rejected when it is not OK, is served as `text/html`, or
 * its size falls outside [MIN_IMAGE_SIZE, MAX_IMAGE_SIZE]. The extension of
 * `destPath` is replaced by the URL's extension when that is allowed,
 * otherwise by one derived from the content type (defaulting to ".jpg").
 *
 * @returns true when a file was written, false on any rejection or error.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
    if (contentType.includes("text/html")) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    let ext = extname(new URL(url).pathname).toLowerCase();
    if (!ALLOWED_EXTENSIONS.includes(ext)) {
      if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
      else if (contentType.includes("png")) ext = ".png";
      else if (contentType.includes("webp")) ext = ".webp";
      else ext = ".jpg";
    }

    const filePath = destPath.replace(/\.\w+$/, ext);
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    return false;
  }
}

/**
 * Download `urls` into `classDir`, CONCURRENT_DOWNLOADS at a time.
 *
 * Every URL gets its own file index (`startIndex` + position in `urls`),
 * fixed before the request starts, so concurrent downloads in one chunk can
 * never write to the same `img_NNNN` file. Failed downloads still consume
 * their index.
 *
 * @returns Success/failure counts and `lastIndex`, the first index not used.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;

  for (let offset = 0; offset < urls.length; offset += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(offset, offset + CONCURRENT_DOWNLOADS);
    const results = await Promise.all(
      chunk.map(async (url, position) => {
        const fileIndex = startIndex + offset + position;
        const paddedIndex = String(fileIndex).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
        await sleep(DOWNLOAD_DELAY);
        return { success, index: fileIndex, url: url.substring(0, 50) };
      }),
    );

    for (const r of results) {
      if (r.success) {
        downloaded++;
      } else {
        failed++;
        // Log only a sample of failures (1st, 21st, 41st, ...).
        if (failed % 20 === 1) console.log(` ⚠ Failed: ${r.url}...`);
      }
    }

    const total = downloaded + failed;
    if (total % 30 === 0 || total === urls.length) {
      console.log(` Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }

  return { downloaded, failed, lastIndex: startIndex + urls.length };
}

// ─── Progress Tracking ──────────────────────────────────────────────────────

/**
 * Read the progress file, or return an empty progress record when it does
 * not exist. Missing top-level fields are filled with defaults.
 *
 * @throws SyntaxError when the file exists but is not valid JSON.
 */
function loadProgress(): Progress {
  if (!existsSync(PROGRESS_FILE)) {
    return { lastUpdated: new Date().toISOString(), classes: {} };
  }
  const parsed = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
  return {
    lastUpdated: parsed.lastUpdated ?? new Date().toISOString(),
    classes: parsed.classes ?? {},
  };
}

/**
 * Stamp `progress` with the current time and persist it.
 *
 * The data is written to a sibling temp file and then renamed over the real
 * file, so an interrupt during the write cannot leave a truncated
 * `.progress.json` behind.
 */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();
  const tmpFile = `${PROGRESS_FILE}.tmp`;
  writeFileSync(tmpFile, JSON.stringify(progress, null, 2));
  renameSync(tmpFile, PROGRESS_FILE);
}

/**
 * Return the progress entry for `classId`, creating an empty one inside
 * `progress` on first access.
 */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  let entry = progress.classes[classId];
  if (!entry) {
    entry = {
      count: 0,
      downloaded: 0,
      failed: 0,
      seenUrls: [],
      exhausted: false,
    };
    progress.classes[classId] = entry;
  }
  return entry;
}

/**
 * First file index that no earlier download attempt for this class has used.
 *
 * Failed attempts consume an index too, so the next index is the number of
 * attempts (downloaded + failed), never less than the stored image count.
 * Starting from `count` alone would overwrite files on a resumed run.
 */
function nextFileIndex(cp: ClassProgress): number {
  return Math.max(cp.count, (cp.downloaded ?? 0) + (cp.failed ?? 0));
}

// ─── Query Building ─────────────────────────────────────────────────────────

/** Build the search queries for one disease (falls back to the id for the name). */
function buildSearchQueries(disease: DbDisease): string[] {
  const name = disease.name || disease.id.replace(/-/g, " ");
  const plant = disease.plantId.replace(/-/g, " ");
  // Every query keeps the disease NAME to avoid noisy labels
  return [`${name} ${plant} leaf disease`, `${plant} ${name} symptoms`, `${name} ${plant}`];
}

/** Build the "healthy plant" search queries for one plant id. */
function buildHealthyQueries(plant: string): string[] {
  const name = plant.replace(/-/g, " ");
  return [
    `healthy ${name} leaf`,
    `${name} leaf closeup`,
    `healthy ${name} plant`,
    `${name} foliage`,
  ];
}

// ─── Dataset Collection ─────────────────────────────────────────────────────

/**
 * Gather image URLs for one class from all sources, then download them.
 *
 * Sources are tried in order (DB URLs, DuckDuckGo, iNaturalist, Commons) and
 * each is only consulted while the URL list is still short of `target`.
 * Classes that already met the target or are flagged exhausted are skipped.
 * Progress is saved before and after the download step.
 *
 * NOTE(review): the class is flagged exhausted as soon as ANY consulted source
 * reports exhaustion, whereas the healthy phase in main() requires ALL
 * sources — confirm which behaviour is intended.
 *
 * @param existingUrls URLs already known from the database.
 * @param fastMode     Skip slow DuckDuckGo, use iNat + Commons only.
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
  existingUrls: string[] = [],
  fastMode = false,
): Promise<void> {
  const cp = getClassProgress(progress, classId);
  const seenUrls = new Set<string>(cp.seenUrls);

  if (cp.count >= target) {
    console.log(` ✓ Already have ${cp.count}/${target}`);
    return;
  }
  if (cp.exhausted) {
    console.log(` ✓ Exhausted (${cp.count}/${target})`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  const allUrls: string[] = [];
  let exhausted = false;

  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
  const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
  if (freshDbUrls.length > 0) {
    console.log(` DB: ${freshDbUrls.length} existing URLs`);
    for (const url of freshDbUrls) {
      if (allUrls.length >= target) break;
      seenUrls.add(url);
      allUrls.push(url);
    }
  }

  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
  // Skip DDG in fast mode (full set — DDG is slowest source)
  if (!fastMode && allUrls.length < target) {
    for (const query of queries) {
      if (allUrls.length >= target) break;
      process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
      const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
      allUrls.push(...result.urls);
      if (result.exhausted) exhausted = true;
      console.log(`${result.urls.length} new`);
    }
  }

  // iNaturalist and Commons are queried with the first (most specific) query only.
  const primaryQuery = queries[0];

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
  if (primaryQuery !== undefined && allUrls.length < target) {
    console.log(` iNat: Searching...`);
    const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) exhausted = true;
    console.log(` iNat: ${result.urls.length} images`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
  if (primaryQuery !== undefined && allUrls.length < target) {
    console.log(` Commons: Searching...`);
    const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) exhausted = true;
    console.log(` Commons: ${result.urls.length} images`);
  }

  if (allUrls.length === 0) {
    cp.exhausted = exhausted;
    saveProgress(progress);
    console.log(` ✗ No images found`);
    return;
  }

  // Save progress with seen URLs BEFORE downloading
  cp.seenUrls = Array.from(seenUrls);
  cp.exhausted = exhausted;
  saveProgress(progress);

  console.log(` Downloading ${allUrls.length} images...`);
  const { downloaded, failed } = await downloadBatch(allUrls, classDir, nextFileIndex(cp));

  cp.count += downloaded;
  cp.downloaded += downloaded;
  cp.failed += failed;
  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}

// ─── Main ───────────────────────────────────────────────────────────────────

/**
 * Entry point: loads diseases from the DB, then runs three phases —
 * core diseases, the remaining diseases (fast mode), and the healthy class —
 * and prints a summary before closing the DB connection.
 */
async function main(): Promise<void> {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
  console.log("=".repeat(60));

  // Load diseases from DB
  console.log("\nLoading diseases from database...");
  const dbDiseases = await loadDiseasesFromDb();
  console.log(` ${dbDiseases.length} diseases loaded`);

  const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
  const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
  console.log(` Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
  console.log(` Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);

  // Load progress
  mkdirSync(DATASET_DIR, { recursive: true });
  const progress = loadProgress();
  const startTime = Date.now();

  // ── Phase 1: Core set ──────────────────────────────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log("PHASE 1: Core Diseases (100 images each)");
  console.log("─".repeat(60));

  for (const [i, d] of coreDiseases.entries()) {
    const classDir = resolve(DATASET_DIR, d.id);
    const queries = buildSearchQueries(d);
    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
    const pct = Math.round((i / coreDiseases.length) * 100);
    console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
    await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
  }

  // ── Phase 2: Full set ──────────────────────────────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log("PHASE 2: Full Disease Set (10 images each)");
  console.log("─".repeat(60));

  for (const [i, d] of fullDiseases.entries()) {
    const classDir = resolve(DATASET_DIR, d.id);
    const queries = buildSearchQueries(d);
    const existingUrls = d.imageUrl ? [d.imageUrl] : [];
    const pct = Math.round((i / fullDiseases.length) * 100);
    console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
    await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
  }

  // ── Phase 3: Healthy class ──────────────────────────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log("PHASE 3: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const healthyCp = getClassProgress(progress, HEALTHY_CLASS);
  const healthySeen = new Set<string>(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
    console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
  } else {
    // downloadBatch writes into this directory; make sure it exists first.
    mkdirSync(healthyDir, { recursive: true });

    // Collect all unique plants
    const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    const healthySources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    const totalHealthyUrls: string[] = [];
    let anyRemaining = false;

    for (const source of healthySources) {
      if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
      console.log(`\n Source: ${source.name}`);

      // Only the first 20 queries are used per source.
      for (const query of allHealthyQueries.slice(0, 20)) {
        if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
        process.stdout.write(` "${query}"... `);
        const result = await source.collector(
          query,
          TARGET_HEALTHY - totalHealthyUrls.length,
          healthySeen,
        );
        totalHealthyUrls.push(...result.urls);
        if (!result.exhausted) anyRemaining = true;
        console.log(`${result.urls.length} new`);
      }
    }

    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
      healthyCp.exhausted = !anyRemaining;
      saveProgress(progress);

      console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
        nextFileIndex(healthyCp),
      );

      healthyCp.count += downloaded;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
        ` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
      console.log(` ✗ No healthy images found`);
    }

    saveProgress(progress);
  }

  // ── Summary ────────────────────────────────────────────────────────────────
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  let totalDownloaded = 0;
  let totalFailed = 0;
  for (const cp of Object.values(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
  }

  console.log("\n" + "=".repeat(60));
  console.log("COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${hrs}h ${mins % 60}m`);
  console.log(` Downloaded: ${totalDownloaded} images`);
  console.log(` Failed: ${totalFailed} images`);
  console.log(` Dataset: ${DATASET_DIR}/`);

  await closeDb();
  console.log("=".repeat(60));
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => setTimeout(done, ms));
}

main().catch((err: unknown) => {
  console.error("Fatal error:", err);
  process.exit(1);
});