#!/usr/bin/env node /** * fill-training-dataset.ts * * Scans the existing dataset directory and downloads any missing images * to reach the target counts (200 per disease, 400 for healthy). * * Does NOT re-run prevalence queries — just fills gaps from image sources. * Each run scans the directory, reports deficits, then fills them. * Interrupt-safe: re-run to pick up where you left off. * * Parallelism strategy: * - Disease-level: 30 diseases processed concurrently * - Per disease: all 3 DDG queries run in parallel * - Per query: all search pages fetched in parallel * - Per disease: DDG, iNaturalist, and Wikimedia Commons all run concurrently * - A shared DDG token-bucket rate limiter prevents bans * * Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts */ import "dotenv/config"; import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs"; import { resolve, extname } from "path"; // Load .env.development for DB creds const envPath = resolve(__dirname, "../.env.development"); try { const env = readFileSync(envPath, "utf-8"); for (const line of env.split("\n")) { const trimmed = line.trim(); if (trimmed && !trimmed.startsWith("#")) { const eqIdx = trimmed.indexOf("="); if (eqIdx > 0) { const key = trimmed.slice(0, eqIdx).trim(); const val = trimmed.slice(eqIdx + 1).trim(); if (!process.env[key]) process.env[key] = val; } } } } catch {} import { getDb, closeDb } from "@/lib/db/index"; import { diseases } from "@/lib/db/schema"; // ─── Config ───────────────────────────────────────────────────────────────── const DATASET_DIR = resolve(__dirname, "../data/dataset"); const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json"); /** Target images per disease */ const TARGET_PER_DISEASE = 200; /** Target images for the "healthy" class */ const TARGET_HEALTHY = 400; /** * How many diseases to process in parallel. * Each disease is I/O-bound (HTTP requests), so high concurrency is safe. * The global DDG rate limiter prevents us from overwhelming DuckDuckGo. */ const DISEASE_CONCURRENCY = 50; /** * Max DDG requests per second (shared across all concurrent diseases). * DuckDuckGo is fairly tolerant, but we still want to be polite. * With DISEASE_CONCURRENCY=30, each disease fires 3 parallel queries with * parallel pages = 9 parallel DDG requests per disease at peak. * The rate limiter serializes this so we don't get banned. */ const DDG_RATE_LIMIT_RPS = 6; /** Max concurrent image downloads per disease */ const CONCURRENT_DOWNLOADS = 50; /** Minimum image size in bytes to accept */ const MIN_IMAGE_SIZE = 10_000; // 10KB /** Maximum image size in bytes */ const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB /** Allowed file extensions */ const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"]; /** User agent for requests */ const UA = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"; /** Healthy class directory name */ const HEALTHY_CLASS = "healthy"; /** How often (in diseases processed) to flush the seen-URLs cache to disk */ const SEEN_CACHE_FLUSH_INTERVAL = 20; /** Max DDG pages to fetch per query. * Each page returns ~50 image results, so 5 pages × 3 queries = ~750 raw URLs * before dedup. Pages beyond 3 yield progressively more novel URLs since * the seen-URLs cache accumulates across runs. */ const MAX_DDG_PAGES = 5; /** Healthy source queries limit */ const MAX_HEALTHY_QUERIES = 20; // ─── Types ────────────────────────────────────────────────────────────────── interface DuckDuckGoImageResult { image: string; title: string; url: string; thumbnail: string; height: number; width: number; } interface DiseaseInfo { id: string; name: string; plantId: string; have: number; needed: number; } interface CollectResult { urls: string[]; exhausted: boolean; } // ─── Token-Bucket Rate Limiter ────────────────────────────────────────────── class TokenBucket { private tokens: number; private lastRefill: number; private readonly capacity: number; private readonly refillInterval: number; // ms per token (e.g., 100ms for 10 rps) constructor(rps: number) { this.capacity = rps; this.tokens = rps; this.lastRefill = Date.now(); this.refillInterval = 1000 / rps; } /** Acquire one token, blocking until one is available. */ async acquire(): Promise { while (true) { this.refill(); if (this.tokens >= 1) { this.tokens -= 1; return; } // No tokens — wait for the next one to arrive, then retry await sleep(Math.ceil(this.refillInterval)); } } private refill(): void { const now = Date.now(); const elapsed = now - this.lastRefill; const newTokens = Math.floor(elapsed / this.refillInterval); if (newTokens > 0) { this.tokens = Math.min(this.capacity, this.tokens + newTokens); this.lastRefill = now - (elapsed % this.refillInterval); } } } // Global DDG rate limiter — all concurrent diseases share this const ddgLimiter = new TokenBucket(DDG_RATE_LIMIT_RPS); // ─── Helpers ──────────────────────────────────────────────────────────────── function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } /** Count actual image files in a directory (matching img_* pattern). */ function countImagesInDir(dir: string): number { if (!existsSync(dir)) return 0; try { const files = readdirSync(dir); return files.filter((f) => f.startsWith("img_")).length; } catch { return 0; } } // ─── Seen-URLs Cache ────────────────────────────────────────────────────── /** * Load the per-disease seen-URLs cache from disk. * This prevents re-fetching the same URLs across runs. */ function loadSeenUrlsCache(): Record { if (existsSync(SEEN_CACHE_FILE)) { try { return JSON.parse(readFileSync(SEEN_CACHE_FILE, "utf-8")); } catch {} } return {}; } /** * Save the seen-URLs cache to disk. */ function saveSeenUrlsCache(cache: Record): void { writeFileSync(SEEN_CACHE_FILE, JSON.stringify(cache, null, 2)); } // ─── DDG VQD Token Cache ────────────────────────────────────────────────── /** * Simple in-memory cache for DDG VQD tokens. * Tokens are per-query, but if we've fetched one for a similar query recently, * we can skip the initial HTML page fetch. */ const vqdCache = new Map(); function getCachedVqd(query: string): string | undefined { const entry = vqdCache.get(query); if (entry && entry.expiresAt > Date.now()) return entry.token; vqdCache.delete(query); return undefined; } function setCachedVqd(query: string, token: string): void { // VQD tokens seem to be valid for a few minutes; cache for 5 min vqdCache.set(query, { token, expiresAt: Date.now() + 5 * 60 * 1000 }); // Evict oldest entries if cache grows too large (unlikely but safe) if (vqdCache.size > 500) { const firstKey = vqdCache.keys().next().value; if (firstKey) vqdCache.delete(firstKey); } } // ─── DuckDuckGo API ───────────────────────────────────────────────────────── async function getVqdToken(query: string): Promise { const cached = getCachedVqd(query); if (cached) return cached; const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`; const res = await fetch(url, { headers: { "User-Agent": UA, Accept: "text/html" }, signal: AbortSignal.timeout(15_000), }); if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`); const html = await res.text(); const match = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/); if (!match) throw new Error(`Could not extract vqd token for "${query}"`); setCachedVqd(query, match[1]); return match[1]; } async function searchImagesDuckDuckGo( query: string, vqd: string, page: number, ): Promise { // Rate-limit before making the request await ddgLimiter.acquire(); const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent( query, )}&vqd=${vqd}&o=json&p=${page}&f=,,,`; const res = await fetch(url, { headers: { "User-Agent": UA, Accept: "application/json", Referer: `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`, }, signal: AbortSignal.timeout(15_000), }); if (!res.ok) { if (res.status === 429) { // Rate limited — wait and retry once await sleep(5_000); return searchImagesDuckDuckGo(query, vqd, page); } if (res.status === 403) { // VQD token expired or DDG changed format — get a fresh token and retry console.warn(` ⚠ DDG 403 on page ${page} — refreshing VQD token...`); try { const freshVqd = await getVqdToken(query); await ddgLimiter.acquire(); const retryUrl = url.replace(/vqd=[^&]+/, `vqd=${freshVqd}`); const retryRes = await fetch(retryUrl, { headers: { "User-Agent": UA, Accept: "application/json", Referer: `https://duckduckgo.com/?q=${encodeURIComponent( query, )}&t=h_&iax=images&ia=images`, }, signal: AbortSignal.timeout(15_000), }); if (retryRes.ok) { const freshData = (await retryRes.json()) as { results: DuckDuckGoImageResult[] }; return freshData.results ?? []; } } catch { // Fresh token also failed — give up on this page } return []; } console.warn(` ⚠ DDG returned ${res.status} on page ${page}`); return []; } const data = (await res.json()) as { results: DuckDuckGoImageResult[] }; return data.results ?? []; } /** * Collect images from DDG for a single query. * Fetches up to MAX_DDG_PAGES pages in PARALLEL (rate-limited via ddgLimiter). */ async function collectFromDdgQuery( query: string, target: number, seenUrls: Set, ): Promise { const results: string[] = []; let vqd: string; try { vqd = await getVqdToken(query); } catch (err) { console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`); return { urls: [], exhausted: true }; } // Fetch all pages in parallel const pageFetches: Promise[] = []; for (let page = 1; page <= MAX_DDG_PAGES; page++) { pageFetches.push(searchImagesDuckDuckGo(query, vqd, page)); } const pageResults = await Promise.allSettled(pageFetches); for (const settled of pageResults) { if (settled.status !== "fulfilled") continue; if (results.length >= target) break; for (const r of settled.value) { if (results.length >= target) break; const imgUrl = r.image || r.url; if (!imgUrl || typeof imgUrl !== "string") continue; if (seenUrls.has(imgUrl)) continue; try { new URL(imgUrl); } catch { continue; } seenUrls.add(imgUrl); results.push(imgUrl); } } return { urls: results.slice(0, target), exhausted: results.length < target }; } /** * Collect images from DDG across ALL queries for a disease. * Runs all queries in PARALLEL, then merges deduplicated results. */ async function collectImagesDuckDuckGo( queries: string[], target: number, seenUrls: Set, ): Promise<{ urls: string[]; exhausted: boolean }> { // Run all queries in parallel const queryResults = await Promise.allSettled( queries.map((q) => collectFromDdgQuery(q, target, seenUrls)), ); // Merge results — seenUrls already deduplicates across queries const merged: string[] = []; for (const settled of queryResults) { if (settled.status === "fulfilled") { merged.push(...settled.value.urls); if (merged.length >= target) break; } } return { urls: merged.slice(0, target), exhausted: merged.length < target }; } // ─── iNaturalist API ─────────────────────────────────────────────────────── async function searchImagesInaturalist( query: string, target: number, seenUrls: Set, ): Promise { const results: string[] = []; const perPage = Math.min(target, 200); const apiUrl = `https://api.inaturalist.org/v1/observations` + `?q=${encodeURIComponent(query)}` + `&photos_only=true` + `&quality_grade=research` + `&per_page=${perPage}` + `&order_by=observed_on&order=desc`; try { const res = await fetch(apiUrl, { headers: { "User-Agent": UA, Accept: "application/json" }, signal: AbortSignal.timeout(15_000), }); if (!res.ok) return { urls: [], exhausted: false }; const data = (await res.json()) as { results: Array<{ photos: Array<{ url: string }> }>; }; for (const obs of data.results ?? []) { if (results.length >= target) break; for (const photo of obs.photos ?? []) { if (results.length >= target) break; const url = photo.url; if (!url || seenUrls.has(url)) continue; const fullUrl = url.replace("/medium.", "/original."); seenUrls.add(fullUrl); results.push(fullUrl); } } return { urls: results, exhausted: results.length < target }; } catch { return { urls: results, exhausted: false }; } } // ─── Wikimedia Commons API ───────────────────────────────────────────────── async function searchImagesCommons( query: string, target: number, seenUrls: Set, ): Promise { const results: string[] = []; let sroffset = 0; while (results.length < target) { const params = new URLSearchParams({ action: "query", list: "search", srsearch: query, srnamespace: "6", srlimit: "50", sroffset: String(sroffset), format: "json", }); const url = `https://commons.wikimedia.org/w/api.php?${params}`; try { const res = await fetch(url, { headers: { "User-Agent": UA }, signal: AbortSignal.timeout(10_000), }); if (!res.ok) break; const data = (await res.json()) as { query?: { search?: Array<{ title: string }> }; continue?: { sroffset?: number }; }; const hits = data.query?.search ?? []; if (hits.length === 0) break; for (const hit of hits) { if (results.length >= target) break; const filename = hit.title.replace(/^File:/, ""); const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent( filename, )}`; if (seenUrls.has(imgUrl)) continue; seenUrls.add(imgUrl); results.push(imgUrl); } sroffset = data.continue?.sroffset ?? sroffset + hits.length; } catch { break; } } return { urls: results, exhausted: results.length < target }; } // ─── Image Download ───────────────────────────────────────────────────────── async function downloadImage(url: string, destPath: string): Promise { try { const res = await fetch(url, { headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" }, signal: AbortSignal.timeout(8_000), }); if (!res.ok) return false; const contentType = res.headers.get("content-type") || ""; if (contentType.includes("text/html")) return false; const buffer = Buffer.from(await res.arrayBuffer()); if (buffer.length < MIN_IMAGE_SIZE) return false; if (buffer.length > MAX_IMAGE_SIZE) return false; let ext = extname(new URL(url).pathname).toLowerCase(); if (!ALLOWED_EXTENSIONS.includes(ext)) { if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg"; else if (contentType.includes("png")) ext = ".png"; else if (contentType.includes("webp")) ext = ".webp"; else ext = ".jpg"; } const filePath = destPath.replace(/\.\w+$/, ext); writeFileSync(filePath, buffer); return true; } catch { return false; } } async function downloadBatch( urls: string[], classDir: string, startIndex: number, ): Promise<{ downloaded: number; failed: number; lastIndex: number }> { let downloaded = 0; let failed = 0; for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) { const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS); const results = await Promise.all( chunk.map(async (url, chunkIdx) => { // Compute index deterministically BEFORE the async download starts, // so all parallel callbacks get a unique index (no race condition). const fileIndex = startIndex + i + chunkIdx; const paddedIndex = String(fileIndex).padStart(4, "0"); const destPath = resolve(classDir, `img_${paddedIndex}.jpg`); const success = await downloadImage(url, destPath); return { success, index: fileIndex }; }), ); for (const r of results) { if (r.success) downloaded++; else failed++; } } return { downloaded, failed, lastIndex: startIndex + urls.length }; } // ─── Query Building ───────────────────────────────────────────────────────── function buildSearchQueries(name: string, plant: string): string[] { return [`${name} ${plant} leaf disease`, `${plant} ${name} symptoms`, `${name} ${plant}`]; } function buildHealthyQueries(plant: string): string[] { const name = plant.replace(/-/g, " "); return [ `healthy ${name} leaf`, `${name} leaf closeup`, `healthy ${name} plant`, `${name} foliage`, ]; } // ─── Fill Logic ───────────────────────────────────────────────────────────── /** * Try to collect up to `needed` images for a disease by hitting all three * sources IN PARALLEL. Returns how many new images were actually downloaded. * * Sources (DDG with its 3 internal queries, iNat, Commons) all run concurrently. * As soon as any source completes, its URLs are downloaded immediately while * other sources are still searching (pipeline). */ async function fillClass( _diseaseId: string, queries: string[], needed: number, classDir: string, seenUrls: Set, ): Promise { if (needed <= 0) return 0; mkdirSync(classDir, { recursive: true }); const startCount = countImagesInDir(classDir); // ── Run all sources in parallel, pipelining downloads ────────────────── // Start downloading from each source as soon as it returns results, rather // than waiting for all sources to complete. DDG is (by far) the richest // source, so its results start saving to disk while iNat and Commons are // still searching. // // Each source gets a DEDICATED index range so there's no race condition // writing files. DDG gets [startCount, startCount+199], iNat gets // [startCount+200, startCount+399], Commons gets [startCount+400,...]. // The 4-digit filename supports up to 9999, well beyond our 200 target. let totalDownloaded = 0; let totalFailed = 0; let anySuccess = false; const collectAndDownload = async ( label: string, collector: () => Promise, indexOffset: number, ): Promise => { const result = await collector(); if (result.urls.length === 0) { console.log(` ${label}: 0 URLs found`); return; } console.log(` ${label}: ${result.urls.length} new URLs`); // Each source writes to its own non-overlapping range const { downloaded, failed } = await downloadBatch(result.urls, classDir, indexOffset); totalDownloaded += downloaded; totalFailed += failed; if (downloaded > 0) anySuccess = true; }; await Promise.allSettled([ collectAndDownload("DDG", () => collectImagesDuckDuckGo(queries, needed, seenUrls), startCount), collectAndDownload( "iNat", () => searchImagesInaturalist(queries[0], needed, seenUrls), startCount + TARGET_PER_DISEASE, ), collectAndDownload( "Commons", () => searchImagesCommons(queries[0], needed, seenUrls), startCount + 2 * TARGET_PER_DISEASE, ), ]); if (!anySuccess) { console.log(` ✗ No new images found from any source`); return 0; } const newTotal = countImagesInDir(classDir); const gained = newTotal - startCount; console.log( ` ✓ ${totalDownloaded}/${totalDownloaded + totalFailed} downloaded` + ` (${totalFailed} failed, ${gained} new files)`, ); return gained; } // ─── Directory Scanner ───────────────────────────────────────────────────── interface ScanResult { /** Disease id → how many images currently on disk */ diseaseCounts: Map; /** How many healthy images on disk */ healthyCount: number; } function scanDataset(): ScanResult { const diseaseCounts = new Map(); let healthyCount = 0; if (!existsSync(DATASET_DIR)) { return { diseaseCounts, healthyCount: 0 }; } const entries = readdirSync(DATASET_DIR, { withFileTypes: true }); for (const entry of entries) { if (!entry.isDirectory()) continue; if (entry.name.startsWith(".")) continue; if (entry.name === HEALTHY_CLASS) { healthyCount = countImagesInDir(resolve(DATASET_DIR, entry.name)); } else { const count = countImagesInDir(resolve(DATASET_DIR, entry.name)); if (count > 0) { diseaseCounts.set(entry.name, count); } } } return { diseaseCounts, healthyCount }; } // ─── CLI Flags ────────────────────────────────────────────────────────────── function parseFlags(): { reverse: boolean } { const args = process.argv.slice(2); return { reverse: args.includes("--reverse") || args.includes("-r"), }; } // ─── Main ─────────────────────────────────────────────────────────────────── async function main() { const flags = parseFlags(); console.log("=".repeat(60)); console.log("TRAINING DATASET FILL — Parallelized gap-filling download"); if (flags.reverse) console.log(" (reverse order — processing from lowest deficit first)"); console.log("=".repeat(60)); // Ensure dataset directory exists mkdirSync(DATASET_DIR, { recursive: true }); // ── Step 1: Scan what we already have ──────────────────────────────────── console.log("\nScanning existing dataset..."); const { diseaseCounts, healthyCount } = scanDataset(); console.log(` Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`); // ── Step 2: Load disease info from DB ──────────────────────────────────── console.log("\nLoading disease info from database..."); const db = getDb(); const allDiseases = await db .select({ id: diseases.id, plantId: diseases.plantId, name: diseases.name, }) .from(diseases); // Build a deduplicated map: disease id → first disease info found const diseaseInfo = new Map(); for (const d of allDiseases) { if (!diseaseInfo.has(d.id)) { diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId }); } } console.log(` Loaded ${diseaseInfo.size} unique diseases from DB`); // ── Step 3: Build deficit list ────────────────────────────────────────── const deficits: DiseaseInfo[] = []; for (const [id, info] of diseaseInfo) { const have = diseaseCounts.get(id) ?? 0; const needed = TARGET_PER_DISEASE - have; if (needed > 0) { deficits.push({ id, name: info.name, plantId: info.plantId, have, needed }); } } // Sort by deficit size (largest first) so we prioritize the neediest diseases deficits.sort((a, b) => b.needed - a.needed); // Reverse order if --reverse/-r flag is set (useful to try a different // direction when the front of the queue keeps hitting dead URLs) if (flags.reverse) deficits.reverse(); const healthyDeficit = TARGET_HEALTHY - healthyCount; console.log(`\n${"=".repeat(60)}`); console.log("DEFICIT REPORT"); console.log(`${"=".repeat(60)}`); console.log(` Diseases needing images: ${deficits.length}/${diseaseInfo.size}`); console.log(` Total images missing: ${deficits.reduce((s, d) => s + d.needed, 0)}`); console.log(` Healthy deficit: ${Math.max(0, healthyDeficit)}`); console.log(` Parallelism: ${DISEASE_CONCURRENCY} diseases at once`); console.log(` DDG rate limit: ${DDG_RATE_LIMIT_RPS} req/s (shared)`); console.log( ` Order: ${flags.reverse ? "reverse (--reverse)" : "normal (deficit-first)"}`, ); console.log(`${"=".repeat(60)}`); if (deficits.length === 0 && healthyDeficit <= 0) { console.log("\n ✓ Nothing to do — all targets met!\n"); await closeDb(); return; } // ── Step 4: Load seen-URLs cache ──────────────────────────────────────── const seenUrlsCache = loadSeenUrlsCache(); let totalDownloaded = 0; let totalFailed = 0; let diseasesProcessed = 0; const startTime = Date.now(); // ── Step 5: Fill disease deficits ─────────────────────────────────────── if (deficits.length > 0) { console.log("\n" + "─".repeat(60)); console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`); console.log("─".repeat(60)); // Process in parallel batches for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) { const batch = deficits.slice(i, i + DISEASE_CONCURRENCY); const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1; const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY); console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`); // Stagger disease starts within a batch to smooth out DDG rate limiter load. // Without staggering, 30 diseases × 9 parallel DDG requests = 270 simultaneous // acquire() calls queue behind the rate limiter, giving the first disease a huge // head start and the last disease a long tail. Staggering by 200ms each spreads // the load evenly, reducing tail latency and improving overall throughput. const STAGGER_MS = 200; const batchResults = await Promise.allSettled( batch.map((d, idx) => (async () => { if (idx > 0) await sleep(idx * STAGGER_MS); const classDir = resolve(DATASET_DIR, d.id); const queries = buildSearchQueries(d.name, d.plantId); // CRITICAL: Start with a FRESH empty set for within-run search dedup. // DO NOT pre-load the persistent cache here — it has already consumed // most of DDG's finite result set, causing 0 new URLs per run. // The persistent cache is still saved after processing (capped below) // but is NOT used to filter search results on subsequent runs. const seen = new Set(); console.log( ` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`, ); const gained = await fillClass(d.id, queries, d.needed, classDir, seen); // Update seen-URLs cache for this disease — merge with existing // and cap at 500 per disease to prevent unbounded cache growth. const existing = seenUrlsCache[d.id] ?? []; const merged = [...new Set([...existing, ...Array.from(seen)])]; seenUrlsCache[d.id] = merged.slice(-500); return gained; })(), ), ); // Aggregate batch results for (const result of batchResults) { if (result.status === "fulfilled") { totalDownloaded += result.value; } else { console.error(` ✗ Disease failed: ${result.reason}`); } } diseasesProcessed += batch.length; // Flush seen-URLs cache to disk periodically (not after every disease) if ( diseasesProcessed % SEEN_CACHE_FLUSH_INTERVAL < batch.length || i + batch.length >= deficits.length ) { saveSeenUrlsCache(seenUrlsCache); } const elapsed = Math.round((Date.now() - startTime) / 1000); const rate = diseasesProcessed / Math.max(1, elapsed); const remaining = deficits.length - diseasesProcessed; const eta = remaining / Math.max(0.01, rate); console.log( ` [Batch ${batchNum}/${totalBatches}] checkpoint — ` + `${totalDownloaded} downloaded, ` + `${diseasesProcessed}/${deficits.length} diseases (${rate.toFixed(1)}/s, ` + `ETA: ${Math.round(eta)}s)`, ); } } // ── Step 6: Fill healthy deficit ──────────────────────────────────────── if (healthyDeficit > 0) { console.log("\n" + "─".repeat(60)); console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`); console.log("─".repeat(60)); const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS); mkdirSync(healthyDir, { recursive: true }); // Collect all unique plants from the disease info const allPlants = [...new Set(diseaseInfo.values())].map((d) => d.plantId); const allHealthyQueries: string[] = []; for (const plant of allPlants) { allHealthyQueries.push(...buildHealthyQueries(plant)); } const healthySeen = new Set(seenUrlsCache[HEALTHY_CLASS] ?? []); const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir); // Run all 3 sources in parallel for the healthy class too const [ddgUrls, inatUrls, commonsUrls] = await Promise.allSettled([ collectImagesDuckDuckGo( allHealthyQueries.slice(0, MAX_HEALTHY_QUERIES), healthyNeeded, healthySeen, ), searchImagesInaturalist(allHealthyQueries[0], healthyNeeded, healthySeen), searchImagesCommons(allHealthyQueries[0], healthyNeeded, healthySeen), ]); const allUrls: string[] = []; for (const settled of [ddgUrls, inatUrls, commonsUrls]) { if (settled.status === "fulfilled") { allUrls.push(...settled.value.urls); } } if (allUrls.length > 0) { console.log(`\n Downloading ${allUrls.length} healthy images...`); const startIdx = countImagesInDir(healthyDir); const { downloaded, failed } = await downloadBatch(allUrls, healthyDir, startIdx); const newTotal = countImagesInDir(healthyDir); const gained = newTotal - healthyCount; totalDownloaded += gained; totalFailed += failed; console.log( ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` + ` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`, ); } else { console.log(`\n ✗ No healthy images found`); } // Update seen-URLs cache seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen); saveSeenUrlsCache(seenUrlsCache); } // ── Summary ────────────────────────────────────────────────────────────── const elapsed = Math.round((Date.now() - startTime) / 1000); const mins = Math.floor(elapsed / 60); const hrs = Math.floor(mins / 60); // Final scan const finalScan = scanDataset(); const totalHave = [...finalScan.diseaseCounts.values()].reduce((s, c) => s + c, 0); const atTarget = [...finalScan.diseaseCounts.values()].filter( (c) => c >= TARGET_PER_DISEASE, ).length; console.log("\n" + "=".repeat(60)); console.log(" ✅ FILL COMPLETE"); console.log("=".repeat(60)); console.log(` Time: ${hrs}h ${mins % 60}m`); console.log(` Diseases at target: ${atTarget}/${diseaseInfo.size}`); console.log(` Total images: ${totalHave}`); console.log(` Healthy images: ${finalScan.healthyCount}/${TARGET_HEALTHY}`); console.log(` New downloads: ${totalDownloaded}`); console.log(` Dataset dir: ${DATASET_DIR}/`); await closeDb(); console.log("=".repeat(60)); } main().catch((err) => { console.error("\nFatal error:", `\n${err}`); process.exit(1); });