#!/usr/bin/env node
/**
 * scrape-training-dataset.ts
 *
 * Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
 *
 * Target: Top 200 most common plant diseases (ranked by iNaturalist observation counts)
 *   - 200 images per disease (TARGET_PER_DISEASE)
 *   - 400 healthy plant images (TARGET_HEALTHY)
 *   - Processes 5 diseases in parallel with 30 concurrent downloads each
 *
 * Sources (all free, no API keys):
 *   1. DB image_url      — existing images already found
 *   2. DuckDuckGo        — general web image search
 *   3. iNaturalist       — real-world plant observation photos
 *   4. Wikimedia Commons — curated scientific/educational images
 *
 * Usage:    cd apps/web && npx tsx scripts/scrape-training-dataset.ts
 * Progress: data/dataset/.progress.json — interrupt and resume safely.
 */

import "dotenv/config";
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "fs";
import { resolve, extname } from "path";

// Load .env.development for DB creds.
// Variables that are already set in the environment are never overwritten.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    const eqIdx = trimmed.indexOf("=");
    if (eqIdx <= 0) continue; // no key before "=", or no "=" at all

    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();

    // KEY="value" / KEY='value' — drop one pair of matching surrounding quotes
    // so the quotes do not end up inside the value.
    const quote = val[0];
    if ((quote === '"' || quote === "'") && val.length >= 2 && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!process.env[key]) process.env[key] = val;
  }
} catch (err) {
  // The file is optional, so a missing file is not worth reporting.
  // Anything else (permissions, I/O) is surfaced but still non-fatal.
  const code = (err as NodeJS.ErrnoException | undefined)?.code;
  if (code !== "ENOENT") {
    console.warn(
      ` ⚠ Could not read ${envPath}: ${err instanceof Error ? err.message : "unknown"}`,
    );
  }
}

import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
import { sql } from "drizzle-orm";

// ─── Config ─────────────────────────────────────────────────────────────────

const DATASET_DIR = resolve(__dirname, "../data/dataset");
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

/** Target images per disease */
const TARGET_PER_DISEASE = 200;
/** Number of diseases to target (most common first) */
const TARGET_DISEASE_COUNT = 200;
/** Target images for the "healthy" class */
const TARGET_HEALTHY = 400;
/** Delay between DuckDuckGo search API calls (ms) */
const SEARCH_DELAY = 1500;
/** Max concurrent image downloads per disease */
const CONCURRENT_DOWNLOADS = 30;
/** Number of diseases to process in parallel */
const DISEASE_CONCURRENCY = 5;
/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB
/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB
/** Allowed file extensions */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
/** User agent for requests */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";

// ─── Types ──────────────────────────────────────────────────────────────────

/** One row of the diseases table, restricted to the columns this script reads. */
interface DbDisease {
  id: string;
  plantId: string;
  name: string;
  imageUrl: string | null;
}

/** One entry of the `results` array returned by DuckDuckGo's image endpoint. */
interface DuckDuckGoImageResult {
  image: string;
  title: string;
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}

interface SourceState {
  exhausted: boolean;
}

/** Persisted collection state for one dataset class (a disease id or "healthy"). */
interface ClassProgress {
  count: number;
  downloaded: number;
  failed: number;
  seenUrls: string[];
  exhausted: boolean;
  /** Per-source exhaustion tracking — prevents re-scraping exhausted sources on resume */
  sources: {
    db: SourceState;
    duckduckgo: SourceState;
    inaturalist: SourceState;
    wikimedia: SourceState;
  };
}

/** Shape of the JSON document stored in the progress file. */
interface Progress {
  lastUpdated: string;
  /** Keyed by class id. */
  classes: Record<string, ClassProgress>;
  /**
   * Phase checkpoint. This script writes 0 while the disease phase is running
   * and 3 once every phase has finished.
   */
  phase: number;
  /** Index within the current phase's disease array. On resume, skip to this index. */
  phaseIndex: number;
}

// ─── DB Loading ──────────────────────────────────────────────────────────────

const INAT_CACHE_FILE = resolve(DATASET_DIR, ".inat-prevalence-cache.json");

/**
 * Query iNaturalist for real-world prevalence of a disease.
 * Returns observation count (higher = more common in the real world).
*
* @param diseaseName Disease name used as the free-text query.
* @param plantName Optional host plant; when given, a narrower query is tried first.
* @returns The API's `total_results`, or 0 on any HTTP, network, timeout or parse failure.
*/
async function getInatPrevalence(diseaseName: string, plantName?: string): Promise<number> {
  const headers = { "User-Agent": UA, Accept: "application/json" };
  const baseUrl = "https://api.inaturalist.org/v1/observations";
  // AbortSignal.timeout() starts counting when it is created, so each request
  // gets its own signal; sharing one would leave the fallback request with
  // only whatever time the first request did not use.
  const timeoutMs = 10_000;

  try {
    // Tier 1: disease + plant name, research-grade, Plantae/Fungi/Chromista
    // This is the most specific and reliable query — filters to relevant kingdoms
    // and only counts community-verified observations.
    if (plantName) {
      const q = `${diseaseName} ${plantName}`;
      const url =
        `${baseUrl}?q=${encodeURIComponent(q)}` +
        `&quality_grade=research` +
        `&iconic_taxon_id=47126,47158,47686` +
        `&photos_only=true&per_page=1`;
      const res = await fetch(url, { headers, signal: AbortSignal.timeout(timeoutMs) });
      if (res.ok) {
        const data = (await res.json()) as { total_results?: number };
        const total = data.total_results ?? 0;
        if (total > 0) return total;
      }
    }

    // Fallback: disease name only, all quality grades (original behavior)
    const url = `${baseUrl}?q=${encodeURIComponent(
      diseaseName.toLowerCase(),
    )}&photos_only=true&per_page=1`;
    const res = await fetch(url, { headers, signal: AbortSignal.timeout(timeoutMs) });
    if (!res.ok) return 0;
    const data = (await res.json()) as { total_results?: number };
    return data.total_results ?? 0;
  } catch {
    // Best-effort lookup: callers treat 0 as "no observations found".
    return 0;
  }
}

/**
 * Load prevalence data from cache or build it by querying iNaturalist.
 * Caches results to avoid re-querying on every run.
*
* @param uniqueNames Disease names to score.
* @param plantMap Optional disease name → host plant, forwarded to the iNaturalist query.
* @returns Map of disease name → observation count.
*/
async function loadPrevalenceData(
  uniqueNames: string[],
  plantMap?: Map<string, string>,
): Promise<Map<string, number>> {
  // Load cache if exists
  let cache: Record<string, number> = {};
  if (existsSync(INAT_CACHE_FILE)) {
    try {
      cache = JSON.parse(readFileSync(INAT_CACHE_FILE, "utf-8"));
    } catch {
      // Unreadable cache — rebuild it from scratch below.
    }
  }

  const prevalenceMap = new Map<string, number>();
  const toQuery: string[] = [];

  // Check which names need querying. The typeof test (rather than `in`) keeps
  // inherited object keys and non-numeric junk from counting as cache hits.
  for (const name of uniqueNames) {
    const cached = cache[name.toLowerCase()];
    if (typeof cached === "number") {
      prevalenceMap.set(name, cached);
    } else {
      toQuery.push(name);
    }
  }

  if (toQuery.length > 0) {
    console.log(`\n Querying iNaturalist for ${toQuery.length} disease prevalence scores...`);
    let queried = 0;
    for (const name of toQuery) {
      // NOTE: getInatPrevalence reports failures as 0, so a transient network
      // error is cached as "no observations" until the cache file is deleted.
      const count = await getInatPrevalence(name, plantMap?.get(name));
      cache[name.toLowerCase()] = count;
      prevalenceMap.set(name, count);
      queried++;

      // Save cache every 10 queries
      if (queried % 10 === 0) {
        writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
        console.log(` Queried ${queried}/${toQuery.length}...`);
      }

      // Rate limit: ~100 req/min
      await sleep(600);
    }

    // Final cache save
    writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
    console.log(` ✓ Queried ${queried} diseases, cached to ${INAT_CACHE_FILE}`);
  }

  return prevalenceMap;
}

/**
 * Persist prevalence scores to the database and update prevalence enum.
 * Maps observation counts to common/uncommon/rare/very_rare using thresholds
 * derived from the score distribution.
 *
 * @param db Database handle returned by getDb().
 * @param prevalenceMap Disease name → observation count; missing names count as 0.
 */
async function persistPrevalenceData(
  db: ReturnType<typeof getDb>,
  prevalenceMap: Map<string, number>,
): Promise<void> {
  // Load all diseases to update
  const allDiseases = await db
    .select({
      id: diseases.id,
      name: diseases.name,
    })
    .from(diseases);

  // Compute percentile-based thresholds from actual score distribution.
  // Top 25% → common, bottom 25% → rare, middle 50% → uncommon.
  // This guarantees meaningful classification regardless of absolute scale.
  const scores = Array.from(prevalenceMap.values())
    .filter((s) => s > 0)
    .sort((a, b) => a - b);
  const n = scores.length;
  // With no non-zero scores the lookups are undefined and the fixed defaults apply.
  const commonThreshold = scores[Math.floor(n * 0.75)] ?? 1000;
  const rareThreshold = scores[Math.floor(n * 0.25)] ?? 10;

  console.log(
    `\n Prevalence distribution: ${n} non-zero scores` +
      `, p25=${rareThreshold.toLocaleString()}` +
      `, p75=${commonThreshold.toLocaleString()}`,
  );
  console.log(` Persisting prevalence data for ${allDiseases.length} diseases...`);

  let updated = 0;
  for (const disease of allDiseases) {
    const score = prevalenceMap.get(disease.name) ?? 0;

    // Map score to prevalence enum using distribution-based thresholds.
    // Score of 0 means no iNaturalist observations found — genuinely rare.
    let prevalence: "common" | "uncommon" | "rare" | "very_rare";
    if (score === 0) {
      prevalence = "very_rare";
    } else if (score >= commonThreshold) {
      prevalence = "common";
    } else if (score > rareThreshold) {
      prevalence = "uncommon";
    } else {
      prevalence = "rare";
    }

    await db
      .update(diseases)
      .set({
        prevalenceScore: score,
        prevalence,
        updatedAt: sql`(datetime('now'))`,
      })
      .where(sql`${diseases.id} = ${disease.id}`);

    updated++;
    if (updated % 100 === 0) {
      console.log(` Updated ${updated}/${allDiseases.length}...`);
    }
  }

  console.log(` ✓ Updated ${updated} diseases with prevalence data`);
}

/**
 * Load the top TARGET_DISEASE_COUNT most common diseases from the database.
 * Ranks by iNaturalist observation counts (real-world prevalence data), using
 * how often a disease name occurs in the table as the tiebreaker.
 *
 * Side effects: may query iNaturalist and write the prevalence cache, and
 * always rewrites the prevalence columns via persistPrevalenceData.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const db = getDb();

  // Get unique disease names and their most common host plant for better iNaturalist queries.
  const nameStats = await db
    .select({
      name: diseases.name,
      plantId: diseases.plantId,
      count: sql`COUNT(*)`.mapWith(Number),
    })
    .from(diseases)
    .groupBy(diseases.name, diseases.plantId);

  // Aggregate: name frequency (across all plants) and row count per plant for each name
  const nameFrequency = new Map<string, number>();
  const plantFreq = new Map<string, Map<string, number>>();
  let totalDiseases = 0;
  for (const row of nameStats) {
    nameFrequency.set(row.name, (nameFrequency.get(row.name) ?? 0) + row.count);
    totalDiseases += row.count;

    let perPlant = plantFreq.get(row.name);
    if (!perPlant) {
      perPlant = new Map<string, number>();
      plantFreq.set(row.name, perPlant);
    }
    perPlant.set(row.plantId, row.count);
  }
  const uniqueNames = [...nameFrequency.keys()];

  // For each disease name, pick the most frequent host plant for more specific iNaturalist queries
  const plantMap = new Map<string, string>();
  for (const [name, freq] of plantFreq) {
    let topPlant: string | undefined;
    let topCount = -1;
    for (const [plantId, count] of freq) {
      if (count > topCount) {
        topPlant = plantId;
        topCount = count;
      }
    }
    // buildSearchQueries/buildHealthyQueries turn plantId hyphens into spaces
    // before searching; do the same here so the iNaturalist query is plain text.
    if (topPlant !== undefined) plantMap.set(name, topPlant.replace(/-/g, " "));
  }

  console.log(
    ` Found ${uniqueNames.length} unique disease names across ${totalDiseases} diseases`,
  );

  // Load or build prevalence data from iNaturalist (with plant context for better queries)
  const prevalenceMap = await loadPrevalenceData(uniqueNames, plantMap);

  // Persist prevalence scores to database
  await persistPrevalenceData(db, prevalenceMap);

  // Load all diseases
  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
      imageUrl: diseases.imageUrl,
    })
    .from(diseases);

  // Sort by iNaturalist prevalence (descending), then by name frequency as tiebreaker
  allDiseases.sort((a, b) => {
    const prevA = prevalenceMap.get(a.name) ?? 0;
    const prevB = prevalenceMap.get(b.name) ?? 0;
    if (prevA !== prevB) return prevB - prevA;
    // Tiebreaker: name frequency
    const freqA = nameFrequency.get(a.name) ?? 0;
    const freqB = nameFrequency.get(b.name) ?? 0;
    return freqB - freqA;
  });

  // Return top TARGET_DISEASE_COUNT
  return allDiseases.slice(0, TARGET_DISEASE_COUNT);
}

// ─── DuckDuckGo API ─────────────────────────────────────────────────────────

/**
 * Fetch the DuckDuckGo search page for `query` and extract the `vqd` token
 * that the image endpoint requires.
 *
 * @throws Error when the page request fails or no token is found in the HTML.
 */
async function getVqdToken(query: string): Promise<string> {
  const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const res = await fetch(url, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);

  const html = await res.text();
  const token = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/)?.[1];
  if (!token) throw new Error(`Could not extract vqd token for "${query}"`);
  return token;
}

/**
 * Request one page of DuckDuckGo image results.
 *
 * HTTP 429 is retried after a 10s pause, a bounded number of times; HTTP 403
 * yields an empty result list.
 *
 * @throws Error on any other non-OK status, or when rate limiting persists.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  // NOTE(review): this is an undocumented endpoint. `p` may be its safe-search
  // flag rather than a page index — confirm. The caller's low-novelty check
  // already stops the loop when successive pages repeat.
  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
    query,
  )}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
  const referer = `https://duckduckgo.com/?q=${encodeURIComponent(
    query,
  )}&t=h_&iax=images&ia=images`;

  // Bounded so that persistent rate limiting cannot retry forever.
  const maxRateLimitRetries = 3;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "application/json", Referer: referer },
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
      return data.results ?? [];
    }
    if (res.status === 403) return [];
    if (res.status === 429 && attempt < maxRateLimitRetries) {
      console.warn(" ⚠ Rate limited (429). Waiting 10s...");
      await sleep(10_000);
      continue;
    }
    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }
}

/**
 * Collect up to `target` previously unseen image URLs for `query` from DuckDuckGo.
 *
 * @param seenUrls Shared de-duplication set; every returned URL is added to it.
 * @returns The new URLs, plus `exhausted: true` when the token request failed
 *          or three consecutive pages came back empty.
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  let page = 1;
  let exhausted = false;
  let consecutiveEmpty = 0;

  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  const MAX_PAGES = 5;
  let lowNoveltyCount = 0;

  while (results.length < target && page <= MAX_PAGES) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(` ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    if (!pageResults || pageResults.length === 0) {
      consecutiveEmpty++;
      if (consecutiveEmpty >= 3) {
        exhausted = true;
        break;
      }
      page++;
      continue;
    }
    consecutiveEmpty = 0;

    let newCount = 0;
    for (const r of pageResults) {
      if (results.length >= target) break;
      const imgUrl = r.image || r.url;
      if (!imgUrl || typeof imgUrl !== "string") continue;
      if (seenUrls.has(imgUrl)) continue;
      try {
        new URL(imgUrl); // skip anything that is not an absolute URL
      } catch {
        continue;
      }
      seenUrls.add(imgUrl);
      results.push(imgUrl);
      newCount++;
    }

    // Stop after two pages in a row that are almost entirely repeats.
    const newRatio = newCount / pageResults.length;
    if (newRatio < 0.05) {
      lowNoveltyCount++;
      if (lowNoveltyCount >= 2) break;
    } else {
      lowNoveltyCount = 0;
    }

    if (results.length < target) page++;
  }

  return { urls: results.slice(0, target), exhausted };
}

// ─── iNaturalist API ─────────────────────────────────────────────────────────

/**
 * Collect up to `target` unseen photo URLs from research-grade iNaturalist
 * observations matching `query` (single request, newest observations first).
 *
 * @param seenUrls Shared de-duplication set; every returned URL is added to it.
 * @returns The new URLs. `exhausted` is true when the response yielded fewer
 *          than `target` new URLs, and false on HTTP or network failure so the
 *          source is retried on a later run.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);
  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;
        // Photo URLs name a size variant (".../square.jpg", ".../medium.jpg", …);
        // rewrite whichever one was returned to the full-size file.
        const fullUrl = photo.url.replace(
          /\/(?:square|thumb|small|medium|large)\./,
          "/original.",
        );
        // De-duplicate on the rewritten URL — that is the form stored in seenUrls.
        if (seenUrls.has(fullUrl)) continue;
        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    return { urls: results, exhausted: false };
  }
}

// ─── Wikimedia Commons API ──────────────────────────────────────────────────

/**
 * Collect up to `target` unseen file URLs from Wikimedia Commons by searching
 * the File namespace (6) and paging through the results 50 at a time.
 *
 * @param seenUrls Shared de-duplication set; every returned URL is added to it.
 * @returns Special:FilePath URLs for the new hits, and `exhausted: true` when
 *          fewer than `target` were found.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  let sroffset = 0;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
      // No origin needed — server-side fetch, Wikimedia ignores CORS headers on API calls
    });
    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };
      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          filename,
        )}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // The API omits `continue` on the last page, so stop there instead of
      // spending one more request to receive an empty result set.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined) break;
      sroffset = nextOffset;
    } catch {
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}

// ─── Image Download ─────────────────────────────────────────────────────────

/**
 * Identify JPEG, PNG or WebP data from its leading signature bytes.
 * Returns the matching file extension, or null for anything else.
 */
function sniffImageExtension(buffer: Buffer): string | null {
  if (buffer.length >= 3 && buffer[0] === 0xff && buffer[1] === 0xd8 && buffer[2] === 0xff) {
    return ".jpg";
  }
  const pngSignature = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);
  if (buffer.length >= 8 && buffer.subarray(0, 8).equals(pngSignature)) return ".png";
  if (
    buffer.length >= 12 &&
    buffer.toString("latin1", 0, 4) === "RIFF" &&
    buffer.toString("latin1", 8, 12) === "WEBP"
  ) {
    return ".webp";
  }
  return null;
}

/**
 * Download one image and write it next to `destPath`, replacing the
 * extension of `destPath` with one that matches the downloaded data.
 *
 * A download is rejected when the response is not OK, is HTML, is outside
 * MIN_IMAGE_SIZE..MAX_IMAGE_SIZE, or is not actually JPEG/PNG/WebP data.
 *
 * @returns true when a file was written; false on rejection or any error.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
    if (contentType.includes("text/html")) return false;

    // Skip buffering bodies that already declare themselves too large.
    const declaredLength = Number(res.headers.get("content-length"));
    if (Number.isFinite(declaredLength) && declaredLength > MAX_IMAGE_SIZE) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    // Trust the bytes rather than the URL or headers: Commons file pages and
    // generic search hits can resolve to PDFs, SVGs, GIFs and similar, which
    // must not be stored as ".jpg" training images.
    const sniffed = sniffImageExtension(buffer);
    if (!sniffed) return false;

    // Keep the URL's own spelling (e.g. ".jpeg") when it agrees with the data.
    const urlExt = extname(new URL(url).pathname).toLowerCase();
    const agrees = urlExt === sniffed || (sniffed === ".jpg" && urlExt === ".jpeg");
    const ext = ALLOWED_EXTENSIONS.includes(urlExt) && agrees ? urlExt : sniffed;

    const filePath = destPath.replace(/\.\w+$/, ext);
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    return false;
  }
}

/**
 * First index at or after `startIndex` that is also above every existing
 * `img_<number>.*` file in `classDir`, so new files never overwrite old ones
 * even when earlier failures left gaps in the numbering.
 */
function firstFreeImageIndex(classDir: string, startIndex: number): number {
  let next = startIndex;
  try {
    for (const file of readdirSync(classDir)) {
      const digits = /^img_(\d+)\./.exec(file)?.[1];
      if (digits !== undefined) next = Math.max(next, Number(digits) + 1);
    }
  } catch {
    // Directory missing or unreadable — fall back to the caller's index.
  }
  return next;
}

/**
 * Download `urls` into `classDir` as `img_NNNN.<ext>`, CONCURRENT_DOWNLOADS at a time.
 *
 * @param startIndex Lowest file index to use; raised automatically when files
 *                   with higher indices already exist.
 * @returns Success and failure counts, and the next unused file index.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
  let index = firstFreeImageIndex(classDir, startIndex);

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
    const results = await Promise.all(
      chunk.map(async (url) => {
        // Reserve the index synchronously, before the first await. Incrementing
        // after the download would hand every concurrent download in the chunk
        // the same filename, and they would overwrite one another.
        const fileIndex = index++;
        const paddedIndex = String(fileIndex).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
        return { success, index: fileIndex, url: url.substring(0, 50) };
      }),
    );

    for (const r of results) {
      if (r.success) downloaded++;
      else {
        failed++;
        // Log only the 1st, 21st, 41st… failure to keep output readable.
        if (failed % 20 === 1) console.log(` ⚠ Failed: ${r.url}...`);
      }
    }

    const total = downloaded + failed;
    if (total % 30 === 0 || total === urls.length) {
      console.log(` Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }

  return { downloaded, failed, lastIndex: index };
}

// ─── Progress Tracking ──────────────────────────────────────────────────────

/**
 * Read the progress file, migrating older layouts in place.
 * Returns a fresh, empty progress object when the file is missing or corrupt.
 */
function loadProgress(): Progress {
  const fresh = (): Progress => ({
    lastUpdated: new Date().toISOString(),
    classes: {},
    phase: 0,
    phaseIndex: 0,
  });

  if (!existsSync(PROGRESS_FILE)) return fresh();

  try {
    const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
    raw.classes ??= {};

    // Migration: detect old tiered system. A missing phase, or a phaseIndex
    // beyond the disease count, can only come from the old core/full layout.
    // `phase === 0` is the normal in-progress value and must NOT count as old,
    // otherwise the resume checkpoint would be thrown away on every start.
    const isOldFormat = raw.phase == null || (raw.phaseIndex ?? 0) > TARGET_DISEASE_COUNT;
    if (isOldFormat) {
      console.warn(" ↻ Migrating progress file from old tiered system to new format");
      console.warn(" Phase checkpoint reset to 0 (will re-scan all 200 diseases)");
      console.warn(" Per-class progress (seenUrls, counts) preserved");
      raw.phase = 0;
      raw.phaseIndex = 0;
    } else {
      raw.phaseIndex ??= 0;
    }

    // Ensure each class has the sources field
    for (const key of Object.keys(raw.classes)) {
      const cp = raw.classes[key] as Partial<ClassProgress>;
      // Migrate class-level exhausted to per-source exhausted if needed
      if (!cp.sources) {
        const classExhausted = cp.exhausted ?? false;
        cp.sources = {
          db: { exhausted: classExhausted },
          duckduckgo: { exhausted: classExhausted },
          inaturalist: { exhausted: classExhausted },
          wikimedia: { exhausted: classExhausted },
        };
      }
      cp.seenUrls ??= [];
      // Counters are used in arithmetic later; make sure they are numbers.
      cp.count ??= 0;
      cp.downloaded ??= 0;
      cp.failed ??= 0;
      cp.exhausted ??= false;
    }

    return raw as Progress;
  } catch {
    console.warn(" ⚠ Corrupt progress file, starting fresh");
    return fresh();
  }
}

/** Stamp `progress` with the current time and write it to the progress file. */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();
  writeFileSync(PROGRESS_FILE, JSON.stringify(progress, null, 2));
}

/**
 * Return the progress entry for `classId`, creating an empty one inside
 * `progress.classes` on first use.
 */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  let entry = progress.classes[classId];
  if (!entry) {
    entry = {
      count: 0,
      downloaded: 0,
      failed: 0,
      seenUrls: [],
      exhausted: false,
      sources: {
        db: { exhausted: false },
        duckduckgo: { exhausted: false },
        inaturalist: { exhausted: false },
        wikimedia: { exhausted: false },
      },
    };
    progress.classes[classId] = entry;
  }
  return entry;
}

// ─── Query Building ─────────────────────────────────────────────────────────

/**
 * Build the search queries for one disease, most specific first.
 * Hyphens in the plant id (and in the disease id, when the name is empty)
 * are turned into spaces.
 */
function buildSearchQueries(disease: DbDisease): string[] {
  const name = disease.name || disease.id.replace(/-/g, " ");
  const plant = disease.plantId.replace(/-/g, " ");
  // Every query keeps the disease NAME to avoid noisy labels
  return [`${name} ${plant} leaf disease`, `${plant} ${name} symptoms`, `${name} ${plant}`];
}

/** Build the "healthy plant" search queries for one plant id. */
function buildHealthyQueries(plant: string): string[] {
  const name = plant.replace(/-/g, " ");
  return [
    `healthy ${name} leaf`,
    `${name} leaf closeup`,
    `healthy ${name} plant`,
    `${name} foliage`,
  ];
}

// ─── File Reconciliation ───────────────────────────────────────────────────

/**
 * Count actual image files in a class directory.
 * Returns the count of files matching img_* pattern, OR 0 if dir doesn't exist.
* A directory that cannot be read also counts as 0.
*/
function countImagesInDir(classDir: string): number {
  if (!existsSync(classDir)) return 0;
  try {
    return readdirSync(classDir).filter((f) => f.startsWith("img_")).length;
  } catch {
    return 0;
  }
}

/**
 * Reconcile a class's progress count with actual files on disk.
 * If files were deleted after the progress file was saved, this
 * adjusts the count downward so we re-download the missing ones.
 * Returns the reconciled count (never higher than `progressCount`).
 */
function reconcileClassCount(classDir: string, progressCount: number): number {
  const fileCount = countImagesInDir(classDir);
  if (fileCount < progressCount) {
    console.log(
      ` ↻ File count (${fileCount}) < progress count (${progressCount}) — reconciling`,
    );
    return fileCount;
  }
  return progressCount;
}

// ─── Dataset Collection ─────────────────────────────────────────────────────

/**
 * Bring one class up to `target` images: gather candidate URLs from the DB,
 * DuckDuckGo, iNaturalist and Wikimedia Commons (in that order, skipping
 * sources already marked exhausted), download them into `classDir`, and
 * record the outcome in `progress` (saved to disk along the way).
 *
 * @param classId      Key of the class inside `progress.classes`.
 * @param queries      Search queries; all are used for DuckDuckGo, only the
 *                     first for iNaturalist and Commons.
 * @param target       Desired number of image files for the class.
 * @param progress     Shared progress object; mutated in place.
 * @param classDir     Output directory; created when missing.
 * @param existingUrls Image URLs already known from the database.
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
  existingUrls: string[] = [],
): Promise<void> {
  const cp = getClassProgress(progress, classId);

  // ── Reconcile with actual files on disk ─────────────────────────────────
  const actualCount = reconcileClassCount(classDir, cp.count);
  if (actualCount !== cp.count) {
    cp.count = actualCount;
    saveProgress(progress);
  }

  const seenUrls = new Set<string>(cp.seenUrls);
  const sources = cp.sources;
  const allSourcesExhausted = (): boolean =>
    sources.db.exhausted &&
    sources.duckduckgo.exhausted &&
    sources.inaturalist.exhausted &&
    sources.wikimedia.exhausted;

  if (cp.count >= target) {
    console.log(` ✓ Already have ${cp.count}/${target}`);
    return;
  }

  // Check if ALL sources are exhausted
  if (allSourcesExhausted()) {
    cp.exhausted = true;
    saveProgress(progress);
    console.log(` ✓ Exhausted (${cp.count}/${target})`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  const allUrls: string[] = [];
  const needed = target - cp.count;

  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
  if (!sources.db.exhausted) {
    const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
    if (freshDbUrls.length > 0) {
      console.log(` DB: ${freshDbUrls.length} existing URLs`);
      for (const url of freshDbUrls) {
        if (allUrls.length >= needed) break;
        seenUrls.add(url);
        allUrls.push(url);
      }
    }
    // DB source is always "exhausted" after processing its initial URLs
    sources.db.exhausted = true;
  }

  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
  if (!sources.duckduckgo.exhausted && allUrls.length < needed) {
    for (const query of queries) {
      if (allUrls.length >= needed) break;
      process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
      const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
      allUrls.push(...result.urls);
      // A single exhausted query marks the whole source as exhausted.
      if (result.exhausted) sources.duckduckgo.exhausted = true;
      console.log(`${result.urls.length} new`);
    }
  }

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
  if (!sources.inaturalist.exhausted && allUrls.length < needed) {
    const primaryQuery = queries[0];
    console.log(` iNat: Searching...`);
    const result = await searchImagesInaturalist(primaryQuery, needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) sources.inaturalist.exhausted = true;
    console.log(` iNat: ${result.urls.length} images`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
  if (!sources.wikimedia.exhausted && allUrls.length < needed) {
    const primaryQuery = queries[0];
    console.log(` Commons: Searching...`);
    const result = await searchImagesCommons(primaryQuery, needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) sources.wikimedia.exhausted = true;
    console.log(` Commons: ${result.urls.length} images`);
  }

  if (allUrls.length === 0) {
    cp.exhausted = true;
    saveProgress(progress);
    console.log(` ✗ No images found — exhausted`);
    return;
  }

  // Save progress with seen URLs BEFORE downloading
  cp.seenUrls = Array.from(seenUrls);
  saveProgress(progress);

  console.log(` Downloading ${allUrls.length} images...`);
  // Number new files starting from the current file count
  const startIndex = countImagesInDir(classDir);
  const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);

  // Re-count actual files on disk after download (more reliable than tracking)
  cp.count = countImagesInDir(classDir);
  cp.downloaded += downloaded;
  cp.failed += failed;

  // The class is finished when nothing is left to search or the target is met.
  if (allSourcesExhausted() || cp.count >= target) {
    cp.exhausted = true;
  }

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${
      allUrls.length
    } (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}

// ─── Main ───────────────────────────────────────────────────────────────────

/**
 * Entry point: rank diseases, collect images for each disease in parallel
 * batches (checkpointing after every batch), collect the "healthy" class,
 * then print a summary and close the database.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR — TOP 200 COMMON DISEASES");
  console.log("=".repeat(60));

  // Ensure dataset directory exists before any cache writes
  mkdirSync(DATASET_DIR, { recursive: true });

  // Load diseases from DB
  console.log("\nLoading top 200 most common diseases from database...");
  const dbDiseases = await loadDiseasesFromDb();
  console.log(` ${dbDiseases.length} diseases loaded`);

  // Load progress
  const progress = loadProgress();

  // If all phases complete, exit early
  if (progress.phase === 3) {
    console.log(" ✓ All phases already complete. Delete .progress.json to re-run.");
    await closeDb();
    return;
  }

  const startTime = Date.now();

  // ── Phase 1: Common diseases (200 images each) ──────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log("PHASE 1: Common Diseases (200 images each)");
  console.log("─".repeat(60));

  const diseaseStart = progress.phase === 0 ? progress.phaseIndex : 0;
  if (diseaseStart > 0) {
    console.log(
      ` Resuming from disease #${diseaseStart + 1} (${(
        (diseaseStart / dbDiseases.length) *
        100
      ).toFixed(0)}% done)`,
    );
  }

  // Process diseases in parallel batches
  for (let i = diseaseStart; i < dbDiseases.length; i += DISEASE_CONCURRENCY) {
    const batch = dbDiseases.slice(i, i + DISEASE_CONCURRENCY);
    const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
    const totalBatches = Math.ceil(dbDiseases.length / DISEASE_CONCURRENCY);
    const pct = Math.round((i / dbDiseases.length) * 100);

    console.log(
      `\n[Batch ${batchNum}/${totalBatches}] (${pct}%) Processing ${batch.length} diseases in parallel...`,
    );

    // Process all diseases in this batch concurrently
    await Promise.all(
      batch.map(async (d, batchIdx) => {
        const diseaseIdx = i + batchIdx;
        const classDir = resolve(DATASET_DIR, d.id);
        const queries = buildSearchQueries(d);
        const existingUrls = d.imageUrl ? [d.imageUrl] : [];

        console.log(` [${diseaseIdx + 1}/${dbDiseases.length}] ${d.name || d.id} (${d.plantId})`);

        await collectClassImages(
          d.id,
          queries,
          TARGET_PER_DISEASE,
          progress,
          classDir,
          existingUrls,
        );
      }),
    );

    // Save checkpoint: phase 0, at index i + batch.length
    progress.phase = 0;
    progress.phaseIndex = i + batch.length;
    saveProgress(progress);
  }

  // ── Phase 3: Healthy class ──────────────────────────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log("PHASE 3: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const healthyCp = getClassProgress(progress, HEALTHY_CLASS);

  // Reconcile healthy class with files on disk
  const healthyActualCount = reconcileClassCount(healthyDir, healthyCp.count);
  if (healthyActualCount !== healthyCp.count) {
    healthyCp.count = healthyActualCount;
    saveProgress(progress);
  }

  const healthySeen = new Set<string>(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
    console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
  } else {
    // Collect all unique plants
    const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
    const allHealthyQueries = allPlants.flatMap((plant) => buildHealthyQueries(plant));

    const healthySources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    const totalHealthyUrls: string[] = [];
    let anyRemaining = false;

    for (const source of healthySources) {
      if (totalHealthyUrls.length >= TARGET_HEALTHY) break;

      console.log(`\n Source: ${source.name}`);
      // Only the first 20 queries are tried per source.
      for (const query of allHealthyQueries.slice(0, 20)) {
        if (totalHealthyUrls.length >= TARGET_HEALTHY) break;

        process.stdout.write(` "${query}"... `);
        const result = await source.collector(
          query,
          TARGET_HEALTHY - totalHealthyUrls.length,
          healthySeen,
        );
        totalHealthyUrls.push(...result.urls);
        if (!result.exhausted) anyRemaining = true;
        console.log(`${result.urls.length} new`);
      }
    }

    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
      healthyCp.exhausted = !anyRemaining;
      saveProgress(progress);

      // The healthy directory is not created anywhere else; without this every
      // write below fails with ENOENT and is reported only as a failed download.
      mkdirSync(healthyDir, { recursive: true });

      console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
      const healthyStartIndex = countImagesInDir(healthyDir);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
        healthyStartIndex,
      );

      // Re-count actual files on disk
      healthyCp.count = countImagesInDir(healthyDir);
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;
      if (healthyCp.count >= TARGET_HEALTHY) {
        healthyCp.exhausted = true;
      }

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
        ` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
      console.log(` ✗ No healthy images found`);
    }

    saveProgress(progress);
  }

  // ── Summary ────────────────────────────────────────────────────────────────

  // Mark all phases complete
  progress.phase = 3;
  progress.phaseIndex = 0;
  saveProgress(progress);

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  let totalDownloaded = 0;
  let totalFailed = 0;
  for (const cp of Object.values(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
  }

  console.log("\n" + "=".repeat(60));
  console.log(" ✅ ALL PHASES COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${hrs}h ${mins % 60}m`);
  console.log(` Downloaded: ${totalDownloaded} images`);
  console.log(` Failed: ${totalFailed} images`);
  console.log(` Dataset: ${DATASET_DIR}/`);

  await closeDb();
  console.log("=".repeat(60));
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  // Callback is named `done` so it does not shadow path's `resolve` import.
  return new Promise<void>((done) => setTimeout(done, ms));
}

main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});