Files
plant-disease-id/scripts/fill-training-dataset.ts
2026-06-08 16:42:04 -04:00

928 lines
32 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* fill-training-dataset.ts
*
* Scans the existing dataset directory and downloads any missing images
* to reach the target counts (200 per disease, 400 for healthy).
*
* Does NOT re-run prevalence queries — just fills gaps from image sources.
* Each run scans the directory, reports deficits, then fills them.
* Interrupt-safe: re-run to pick up where you left off.
*
* Parallelism strategy:
* - Disease-level: DISEASE_CONCURRENCY (currently 20) diseases processed concurrently
* - Per disease: all 3 DDG queries run in parallel
* - Per query: all search pages fetched in parallel
* - Per disease: DDG, iNaturalist, and Wikimedia Commons all run concurrently
* - A shared DDG token-bucket rate limiter prevents bans
*
* Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts
*/
import "dotenv/config";
import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, extname } from "path";
// Best-effort loader for ../.env.development (DB credentials).
// Each non-comment KEY=VALUE line is copied into process.env unless the key
// already holds a non-empty value, so variables set by the shell or by
// dotenv take precedence over this file.
const envPath = resolve(__dirname, "../.env.development");
try {
  const rawEnv = readFileSync(envPath, "utf-8");
  for (const rawLine of rawEnv.split("\n")) {
    const entry = rawLine.trim();
    // Skip blank lines and comments.
    if (entry === "" || entry.startsWith("#")) continue;
    // A usable line needs a non-empty key before the first "=".
    const separator = entry.indexOf("=");
    if (separator <= 0) continue;
    const name = entry.slice(0, separator).trim();
    if (process.env[name]) continue;
    process.env[name] = entry.slice(separator + 1).trim();
  }
} catch {}
import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
// ─── Config ─────────────────────────────────────────────────────────────────
/** Root directory of the dataset, resolved relative to this script's directory. */
const DATASET_DIR = resolve(__dirname, "../data/dataset");
/** JSON file (inside DATASET_DIR) that persists already-seen image URLs between runs. */
const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json");
/** Target images per disease */
const TARGET_PER_DISEASE = 200;
/** Target images for the "healthy" class */
const TARGET_HEALTHY = 400;
/**
* How many diseases to process in parallel.
* Each disease is I/O-bound (HTTP requests), so high concurrency is safe.
* The global DDG rate limiter prevents us from overwhelming DuckDuckGo.
*/
const DISEASE_CONCURRENCY = 20;
/**
* Max DDG requests per second (shared across all concurrent diseases).
* DuckDuckGo is fairly tolerant, but we still want to be polite.
* Each disease fires 3 parallel queries, each with MAX_DDG_PAGES parallel
* pages = 9 parallel DDG requests per disease at peak, and up to
* DISEASE_CONCURRENCY diseases run at once.
* The rate limiter serializes this so we don't get banned.
*/
const DDG_RATE_LIMIT_RPS = 2;
/** Max concurrent image downloads per disease */
const CONCURRENT_DOWNLOADS = 2;
/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB
/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB
/** Allowed file extensions */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
/** User agent for requests */
const UA =
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1";
/** Healthy class directory name */
const HEALTHY_CLASS = "healthy";
/** How often (in diseases processed) to flush the seen-URLs cache to disk */
const SEEN_CACHE_FLUSH_INTERVAL = 20;
/** Max DDG pages to fetch per query.
* Assuming each page returns ~100 image results (TODO confirm), 3 pages × 3
* queries = ~900 raw URLs before dedup — more than enough to find 200 unique,
* valid images. */
const MAX_DDG_PAGES = 3;
/** Healthy source queries limit: at most this many healthy-class queries are sent to DDG. */
const MAX_HEALTHY_QUERIES = 20;
// ─── Types ──────────────────────────────────────────────────────────────────
/**
* One entry of the `results` array returned by the DuckDuckGo image search
* request. Only `image` and `url` are read by this script; the remaining
* fields are declared but unused here.
*/
interface DuckDuckGoImageResult {
/** Preferred download URL for the image. */
image: string;
/** Presumably the image caption/title — not read by this script. */
title: string;
/** Used as the download URL when `image` is empty. */
url: string;
/** Presumably a thumbnail URL — not read by this script. */
thumbnail: string;
/** Presumably the image height in pixels — not read by this script. */
height: number;
/** Presumably the image width in pixels — not read by this script. */
width: number;
}
/** A disease class that is still short of TARGET_PER_DISEASE images. */
interface DiseaseInfo {
/** Disease id; also used as the class directory name under DATASET_DIR. */
id: string;
/** Disease name, used to build search queries. */
name: string;
/** Id of the affected plant, used as the plant term in search queries. */
plantId: string;
/** Number of images currently on disk for this disease. */
have: number;
/** Images still missing: TARGET_PER_DISEASE minus `have`. */
needed: number;
}
/** Result of asking one image source for candidate URLs. */
interface CollectResult {
/** New, previously unseen image URLs (at most the requested target). */
urls: string[];
/**
* True when the source finished with fewer URLs than requested.
* The iNaturalist collector reports false when its request failed.
*/
exhausted: boolean;
}
// ─── Token-Bucket Rate Limiter ──────────────────────────────────────────────
/**
 * Token-bucket rate limiter.
 *
 * The bucket starts full and gains one token every `1000 / rps` milliseconds,
 * up to its capacity. `acquire()` consumes one token, waiting when none is
 * available.
 */
class TokenBucket {
  private tokens: number;
  private lastRefill: number;
  private readonly capacity: number;
  private readonly refillInterval: number; // ms per token (e.g., 500ms for 2 rps)

  /**
   * @param rps Sustained rate in tokens (requests) per second. Must be a
   *   positive, finite number; fractional rates such as 0.5 are supported.
   * @throws RangeError when `rps` is zero, negative, NaN or infinite.
   */
  constructor(rps: number) {
    if (!Number.isFinite(rps) || rps <= 0) {
      throw new RangeError(`TokenBucket rate must be a positive finite number, got ${rps}`);
    }
    // The bucket must be able to hold at least one whole token; otherwise a
    // rate below 1 rps could never satisfy `tokens >= 1` and acquire() would
    // wait forever.
    this.capacity = Math.max(1, rps);
    this.tokens = this.capacity;
    this.lastRefill = Date.now();
    this.refillInterval = 1000 / rps;
  }

  /** Acquire one token, blocking until one is available. */
  async acquire(): Promise<void> {
    while (true) {
      this.refill();
      if (this.tokens >= 1) {
        this.tokens -= 1;
        return;
      }
      // No tokens — wait for the next one to arrive, then retry
      await sleep(Math.ceil(this.refillInterval));
    }
  }

  /** Credit the whole tokens earned since the last refill, capped at capacity. */
  private refill(): void {
    const now = Date.now();
    const elapsed = now - this.lastRefill;
    const newTokens = Math.floor(elapsed / this.refillInterval);
    if (newTokens > 0) {
      this.tokens = Math.min(this.capacity, this.tokens + newTokens);
      // Keep the leftover partial interval so fractional progress is not lost.
      this.lastRefill = now - (elapsed % this.refillInterval);
    }
  }
}
// Global DDG rate limiter — all concurrent diseases share this.
// searchImagesDuckDuckGo() awaits a token from it before every image-search
// request; the token-page fetch in getVqdToken() does not go through it.
const ddgLimiter = new TokenBucket(DDG_RATE_LIMIT_RPS);
// ─── Helpers ────────────────────────────────────────────────────────────────
/** Returns a promise that resolves (with no value) after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Counts the entries in `dir` whose name starts with "img_".
 * Returns 0 when the directory does not exist or cannot be read.
 */
function countImagesInDir(dir: string): number {
  if (!existsSync(dir)) {
    return 0;
  }
  let entries: string[];
  try {
    entries = readdirSync(dir);
  } catch {
    return 0;
  }
  let total = 0;
  for (const entry of entries) {
    if (entry.startsWith("img_")) total += 1;
  }
  return total;
}
// ─── Seen-URLs Cache ──────────────────────────────────────────────────────
/**
 * Load the per-class seen-URLs cache from SEEN_CACHE_FILE.
 * This prevents re-fetching the same URLs across runs.
 *
 * The file content is validated: only entries whose value is an array are
 * kept, and non-string items inside those arrays are dropped. A missing,
 * unparsable or wrongly shaped file yields an empty cache (with a warning
 * for the latter two) instead of crashing later on a bad value.
 *
 * @returns Map of class id → URLs already handed out for that class.
 */
function loadSeenUrlsCache(): Record<string, string[]> {
  if (!existsSync(SEEN_CACHE_FILE)) return {};

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(SEEN_CACHE_FILE, "utf-8"));
  } catch (err) {
    // e.g. a file truncated by an interrupted write
    console.warn(
      ` ⚠ Ignoring unreadable seen-URLs cache: ${err instanceof Error ? err.message : "unknown"}`,
    );
    return {};
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    console.warn(" ⚠ Ignoring seen-URLs cache with unexpected shape");
    return {};
  }

  const cache: Record<string, string[]> = {};
  for (const [classId, urls] of Object.entries(parsed)) {
    if (!Array.isArray(urls)) continue;
    cache[classId] = urls.filter((u): u is string => typeof u === "string");
  }
  return cache;
}
/**
 * Persist the seen-URLs cache to SEEN_CACHE_FILE as pretty-printed
 * (2-space indented) JSON, replacing any previous content.
 */
function saveSeenUrlsCache(cache: Record<string, string[]>): void {
  const serialized = JSON.stringify(cache, null, 2);
  writeFileSync(SEEN_CACHE_FILE, serialized);
}
// ─── DDG VQD Token Cache ──────────────────────────────────────────────────
/**
 * Simple in-memory cache for DDG VQD tokens.
 * Tokens are cached per exact query string, so repeating the same query
 * within the TTL skips the initial HTML page fetch.
 */
/** How long a cached token is considered valid (VQD tokens seem to last a few minutes). */
const VQD_TTL_MS = 5 * 60 * 1000;
/** Upper bound on cached tokens; the least recently stored entry is evicted first. */
const VQD_CACHE_MAX_ENTRIES = 500;
const vqdCache = new Map<string, { token: string; expiresAt: number }>();

/**
 * Look up a cached token for `query`.
 * @returns The token, or undefined when none is cached or it has expired
 *   (an expired entry is removed).
 */
function getCachedVqd(query: string): string | undefined {
  const entry = vqdCache.get(query);
  if (entry === undefined) return undefined;
  if (entry.expiresAt > Date.now()) return entry.token;
  vqdCache.delete(query);
  return undefined;
}

/** Store `token` for `query` with a fresh TTL, evicting the oldest entries past the size cap. */
function setCachedVqd(query: string, token: string): void {
  // Map iteration follows insertion order; re-inserting moves a refreshed
  // query to the back so the first key really is the oldest one.
  vqdCache.delete(query);
  vqdCache.set(query, { token, expiresAt: Date.now() + VQD_TTL_MS });
  while (vqdCache.size > VQD_CACHE_MAX_ENTRIES) {
    const oldest = vqdCache.keys().next();
    if (oldest.done) break;
    // Compare via the iterator result, not truthiness: "" is a valid key.
    vqdCache.delete(oldest.value);
  }
}
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
/**
 * Obtain the `vqd` token DuckDuckGo expects on image-search requests.
 *
 * A cached token for the same query is reused; otherwise the HTML search
 * page is fetched, the token is extracted from it and then cached.
 *
 * @throws Error when the page request is not OK or no token is found in it.
 */
async function getVqdToken(query: string): Promise<string> {
  const remembered = getCachedVqd(query);
  if (remembered) {
    return remembered;
  }

  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const response = await fetch(searchPage, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  const body = await response.text();
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token for "${query}"`);
  }

  const token = found[1];
  setCachedVqd(query, token);
  return token;
}
/**
 * Fetch one page of DuckDuckGo image results for `query`.
 *
 * Waits for a token from the shared rate limiter before each request.
 * A 429 response is retried after a 5 second pause, at most `retriesLeft`
 * times; any other non-OK response (or an exhausted retry budget) yields an
 * empty list instead of throwing.
 *
 * NOTE(review): confirm that `p` is the page index of the i.js endpoint; if
 * it is not, every page requested for a query returns the same results.
 *
 * @param query       Search text.
 * @param vqd         Token obtained from getVqdToken() for this query.
 * @param page        Page number passed as the `p` query parameter.
 * @param retriesLeft How many more times a 429 may be retried (default 1).
 * @returns The `results` array of the response, or [] when unavailable.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
  retriesLeft = 1,
): Promise<DuckDuckGoImageResult[]> {
  // Rate-limit before making the request
  await ddgLimiter.acquire();
  const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
    query,
  )}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
  const res = await fetch(url, {
    headers: {
      "User-Agent": UA,
      Accept: "application/json",
      Referer: `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`,
    },
    signal: AbortSignal.timeout(15_000),
  });

  if (res.status === 429 && retriesLeft > 0) {
    // Rate limited — back off, then retry with a smaller budget so a
    // persistent 429 cannot recurse forever.
    await sleep(5_000);
    return searchImagesDuckDuckGo(query, vqd, page, retriesLeft - 1);
  }
  // Don't throw for 403 / transient errors — just return empty
  if (!res.ok) return [];

  const data = (await res.json()) as { results?: DuckDuckGoImageResult[] } | null;
  return Array.isArray(data?.results) ? data.results : [];
}
/**
 * Gather up to `target` new image URLs from DuckDuckGo for one query.
 *
 * Requests pages 1..MAX_DDG_PAGES at once (each request is individually
 * rate-limited), then walks the successful pages in page order. A result is
 * accepted when it has a parsable URL that is not yet in `seenUrls`; accepted
 * URLs are added to `seenUrls`.
 *
 * @returns The accepted URLs; `exhausted` is true when fewer than `target`
 *   were found or the vqd token could not be obtained.
 */
async function collectFromDdgQuery(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  let token: string;
  try {
    token = await getVqdToken(query);
  } catch (err) {
    const reason = err instanceof Error ? err.message : "unknown";
    console.warn(` ⚠ DDG token failed: ${reason}`);
    return { urls: [], exhausted: true };
  }

  // Kick off every page request before awaiting any of them.
  const requests = Array.from({ length: MAX_DDG_PAGES }, (_, offset) =>
    searchImagesDuckDuckGo(query, token, offset + 1),
  );
  const pages = await Promise.allSettled(requests);

  const parses = (candidate: string): boolean => {
    try {
      new URL(candidate);
      return true;
    } catch {
      return false;
    }
  };

  const accepted: string[] = [];
  collecting: for (const page of pages) {
    if (page.status !== "fulfilled") continue;
    if (accepted.length >= target) break;
    for (const hit of page.value) {
      if (accepted.length >= target) break collecting;
      const candidate = hit.image || hit.url;
      if (!candidate || typeof candidate !== "string") continue;
      if (seenUrls.has(candidate) || !parses(candidate)) continue;
      seenUrls.add(candidate);
      accepted.push(candidate);
    }
  }

  return { urls: accepted.slice(0, target), exhausted: accepted.length < target };
}
/**
 * Collect images from DDG across ALL queries for a disease.
 * Runs all queries in PARALLEL, then merges the results in query order.
 *
 * Every query may contribute up to `target` URLs and marks each of them in
 * `seenUrls`, so together they can produce more than `target`. Only the first
 * `target` URLs are returned; the surplus is removed from `seenUrls` again so
 * that URLs which were never handed out for download stay available to later
 * calls and runs.
 *
 * @param queries  Search queries to run.
 * @param target   Maximum number of URLs to return.
 * @param seenUrls URLs already handed out; extended with the returned URLs.
 * @returns The merged URLs; `exhausted` is true when fewer than `target`
 *   were found across all queries.
 */
async function collectImagesDuckDuckGo(
  queries: string[],
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  // Run all queries in parallel
  const queryResults = await Promise.allSettled(
    queries.map((q) => collectFromDdgQuery(q, target, seenUrls)),
  );

  // Merge results — seenUrls already deduplicates across queries
  const merged: string[] = [];
  for (const settled of queryResults) {
    if (settled.status === "fulfilled") {
      merged.push(...settled.value.urls);
    }
  }

  const kept = merged.slice(0, Math.max(0, target));
  // Un-mark the surplus: it was recorded as seen but will not be downloaded.
  for (const surplus of merged.slice(kept.length)) {
    seenUrls.delete(surplus);
  }
  return { urls: kept, exhausted: merged.length < target };
}
// ─── iNaturalist API ───────────────────────────────────────────────────────
/**
 * Collect up to `target` new photo URLs from research-grade iNaturalist
 * observations matching `query` (a single request of at most 200 results,
 * newest observations first).
 *
 * Photo URLs carry a size segment in the file name. It is rewritten to
 * "original" so the full-size image is downloaded, and the de-duplication
 * check uses the rewritten URL — the same form that is stored in `seenUrls`.
 * NOTE(review): the size names (square/thumb/small/medium/large) and the
 * availability of "original" should be confirmed against the iNaturalist
 * API documentation.
 *
 * @returns The new URLs. `exhausted` is true when fewer than `target` were
 *   found, and false when the request failed.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const sizeSegment = /\/(?:square|thumb|small|medium|large)\./;
  const results: string[] = [];
  const perPage = Math.min(target, 200);
  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;
  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };
    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };
    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;
        // Normalize first so the seen-check and the stored value agree.
        const fullUrl = photo.url.replace(sizeSegment, "/original.");
        if (seenUrls.has(fullUrl)) continue;
        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }
    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network/parse failure: keep what was gathered, do not claim exhaustion.
    return { urls: results, exhausted: false };
  }
}
// ─── Wikimedia Commons API ─────────────────────────────────────────────────
/**
 * Collect up to `target` new file URLs from a Wikimedia Commons search
 * (namespace 6, 50 hits per request), paging until enough URLs are gathered,
 * a page comes back empty, or a request fails.
 *
 * Each hit title is turned into a Special:FilePath URL; URLs already in
 * `seenUrls` are skipped and new ones are added to it.
 *
 * @returns The new URLs; `exhausted` is true when fewer than `target` were found.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<CollectResult> {
  const found: string[] = [];
  let offset = 0;

  try {
    while (found.length < target) {
      const search = new URLSearchParams({
        action: "query",
        list: "search",
        srsearch: query,
        srnamespace: "6",
        srlimit: "50",
        sroffset: String(offset),
        format: "json",
      });
      const response = await fetch(`https://commons.wikimedia.org/w/api.php?${search.toString()}`, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!response.ok) break;

      const payload = (await response.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };
      const hits = payload.query?.search ?? [];
      if (hits.length === 0) break;

      for (const { title } of hits) {
        if (found.length >= target) break;
        const bareName = title.replace(/^File:/, "");
        const fileUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          bareName,
        )}`;
        if (!seenUrls.has(fileUrl)) {
          seenUrls.add(fileUrl);
          found.push(fileUrl);
        }
      }

      // Prefer the server-provided continuation offset; otherwise step past this page.
      offset = payload.continue?.sroffset ?? offset + hits.length;
    }
  } catch {
    // Any request/parse error ends paging; keep what was gathered so far.
  }

  return { urls: found, exhausted: found.length < target };
}
// ─── Image Download ─────────────────────────────────────────────────────────
/**
 * Download `url` and store it next to `destPath`.
 *
 * The response is rejected when it is not OK, is served as text/html, or its
 * size falls outside MIN_IMAGE_SIZE..MAX_IMAGE_SIZE. The saved file keeps the
 * base name of `destPath` but its extension is replaced: the URL's extension
 * when it is in ALLOWED_EXTENSIONS, otherwise one derived from the content
 * type (falling back to ".jpg").
 *
 * @returns true when a file was written; false on any rejection or error.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  // Content-type fragments checked in order when the URL has no usable extension.
  const extensionByMime: Array<[string, string]> = [
    ["jpeg", ".jpg"],
    ["jpg", ".jpg"],
    ["png", ".png"],
    ["webp", ".webp"],
  ];

  try {
    const response = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(8_000),
    });
    if (!response.ok) return false;

    const mime = response.headers.get("content-type") || "";
    if (mime.includes("text/html")) return false;

    const bytes = Buffer.from(await response.arrayBuffer());
    const sizeAccepted = bytes.length >= MIN_IMAGE_SIZE && bytes.length <= MAX_IMAGE_SIZE;
    if (!sizeAccepted) return false;

    let extension = extname(new URL(url).pathname).toLowerCase();
    if (!ALLOWED_EXTENSIONS.includes(extension)) {
      const match = extensionByMime.find(([fragment]) => mime.includes(fragment));
      extension = match ? match[1] : ".jpg";
    }

    writeFileSync(destPath.replace(/\.\w+$/, extension), bytes);
    return true;
  } catch {
    return false;
  }
}
/**
 * Download `urls` into `classDir`, CONCURRENT_DOWNLOADS at a time, naming the
 * files img_<index> with a zero-padded (minimum 4 digit) index.
 *
 * Every URL gets its own index, claimed synchronously before its download
 * starts, so downloads running in the same chunk never share a file name.
 * Indices already used by an img_<n>.* file present in `classDir` when the
 * batch starts are skipped, so existing images are not overwritten. A failed
 * download still consumes its index.
 *
 * @param urls       Image URLs to download.
 * @param classDir   Target directory.
 * @param startIndex First index to try.
 * @returns Counts of successful/failed downloads and the next unused index.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
  let index = startIndex;

  // Indices taken by files already on disk (any extension).
  const occupied = new Set<number>();
  try {
    for (const entry of readdirSync(classDir)) {
      const numbered = /^img_(\d+)\./.exec(entry);
      if (numbered) occupied.add(Number(numbered[1]));
    }
  } catch {
    // Directory missing or unreadable — there is nothing to protect.
  }

  /** Reserve the next free index; runs synchronously, so claims never interleave. */
  const claimIndex = (): number => {
    while (occupied.has(index)) index++;
    return index++;
  };

  // Guard against a non-positive chunk size, which would never advance the loop.
  const chunkSize = Math.max(1, CONCURRENT_DOWNLOADS);
  for (let i = 0; i < urls.length; i += chunkSize) {
    const chunk = urls.slice(i, i + chunkSize);
    const outcomes = await Promise.all(
      chunk.map((url) => {
        const paddedIndex = String(claimIndex()).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        return downloadImage(url, destPath);
      }),
    );
    for (const success of outcomes) {
      if (success) downloaded++;
      else failed++;
    }
  }
  return { downloaded, failed, lastIndex: index };
}
// ─── Query Building ─────────────────────────────────────────────────────────
/**
 * Build the three search queries used for a disease class.
 * `name` and `plant` are inserted verbatim (no normalization is applied).
 */
function buildSearchQueries(name: string, plant: string): string[] {
  const diseaseThenPlant = `${name} ${plant}`;
  const leafDiseaseQuery = `${diseaseThenPlant} leaf disease`;
  const symptomsQuery = `${plant} ${name} symptoms`;
  return [leafDiseaseQuery, symptomsQuery, diseaseThenPlant];
}
/**
 * Build the four "healthy plant" search queries for a plant id.
 * Hyphens in the id are replaced by spaces before it is used as search text.
 */
function buildHealthyQueries(plant: string): string[] {
  const label = plant.split("-").join(" ");
  const queries: string[] = [];
  queries.push(`healthy ${label} leaf`);
  queries.push(`${label} leaf closeup`);
  queries.push(`healthy ${label} plant`);
  queries.push(`${label} foliage`);
  return queries;
}
// ─── Fill Logic ─────────────────────────────────────────────────────────────
/**
 * Try to add up to `needed` images to `classDir` by querying DuckDuckGo,
 * iNaturalist and Wikimedia Commons concurrently.
 *
 * Each source downloads its URLs as soon as its own search finishes, without
 * waiting for the other sources. DDG receives all `queries`; iNaturalist and
 * Commons only the first one. File indices start at the current image count
 * for DDG, and at that count plus one/two times TARGET_PER_DISEASE for
 * iNaturalist/Commons, so the three sources number their files in separate
 * ranges. A source that throws is ignored.
 *
 * @param _diseaseId Unused.
 * @param queries    Search queries for this class.
 * @param needed     Number of images still missing; nothing happens when <= 0.
 * @param classDir   Directory of the class (created if missing).
 * @param seenUrls   URLs already handed out; extended by the collectors.
 * @returns Growth of the image count in `classDir`, or 0 when no download succeeded.
 */
async function fillClass(
  _diseaseId: string,
  queries: string[],
  needed: number,
  classDir: string,
  seenUrls: Set<string>,
): Promise<number> {
  if (needed <= 0) {
    return 0;
  }
  mkdirSync(classDir, { recursive: true });
  const baseline = countImagesInDir(classDir);

  // Totals accumulated across the three sources.
  const tally = { downloaded: 0, failed: 0 };

  const sources: Array<{
    label: string;
    firstIndex: number;
    collect: () => Promise<CollectResult>;
  }> = [
    {
      label: "DDG",
      firstIndex: baseline,
      collect: () => collectImagesDuckDuckGo(queries, needed, seenUrls),
    },
    {
      label: "iNat",
      firstIndex: baseline + TARGET_PER_DISEASE,
      collect: () => searchImagesInaturalist(queries[0], needed, seenUrls),
    },
    {
      label: "Commons",
      firstIndex: baseline + 2 * TARGET_PER_DISEASE,
      collect: () => searchImagesCommons(queries[0], needed, seenUrls),
    },
  ];

  // Search and download per source; a slow source never delays another one.
  await Promise.allSettled(
    sources.map(async ({ label, firstIndex, collect }) => {
      const { urls } = await collect();
      if (urls.length === 0) return;
      console.log(` ${label}: ${urls.length} new URLs`);
      const batch = await downloadBatch(urls, classDir, firstIndex);
      tally.downloaded += batch.downloaded;
      tally.failed += batch.failed;
    }),
  );

  if (tally.downloaded === 0) {
    console.log(` ✗ No new images found from any source`);
    return 0;
  }

  const gained = countImagesInDir(classDir) - baseline;
  const attempted = tally.downloaded + tally.failed;
  console.log(
    `${tally.downloaded}/${attempted} downloaded (${tally.failed} failed, ${gained} new files)`,
  );
  return gained;
}
// ─── Directory Scanner ─────────────────────────────────────────────────────
/** Image counts found on disk by scanDataset(). */
interface ScanResult {
/** Disease id (directory name) → how many images currently on disk; directories without images are omitted */
diseaseCounts: Map<string, number>;
/** How many healthy images on disk */
healthyCount: number;
}
/**
 * Count the images in every class directory directly under DATASET_DIR.
 *
 * Non-directories and dot-directories are ignored. The HEALTHY_CLASS
 * directory feeds `healthyCount`; every other directory that holds at least
 * one image gets an entry in `diseaseCounts`. A missing DATASET_DIR yields an
 * empty result.
 */
function scanDataset(): ScanResult {
  const diseaseCounts = new Map<string, number>();
  let healthyCount = 0;

  if (existsSync(DATASET_DIR)) {
    const classDirs = readdirSync(DATASET_DIR, { withFileTypes: true }).filter(
      (entry) => entry.isDirectory() && !entry.name.startsWith("."),
    );
    for (const { name } of classDirs) {
      const imageCount = countImagesInDir(resolve(DATASET_DIR, name));
      if (name === HEALTHY_CLASS) {
        healthyCount = imageCount;
      } else if (imageCount > 0) {
        diseaseCounts.set(name, imageCount);
      }
    }
  }

  return { diseaseCounts, healthyCount };
}
// ─── CLI Flags ──────────────────────────────────────────────────────────────
/**
 * Read the command-line flags of this script.
 * `reverse` is true when "--reverse" or "-r" appears among the arguments.
 */
function parseFlags(): { reverse: boolean } {
  const cliArgs = process.argv.slice(2);
  const reverse = cliArgs.some((arg) => arg === "--reverse" || arg === "-r");
  return { reverse };
}
// ─── Main ───────────────────────────────────────────────────────────────────
/**
 * Script entry point.
 *
 * 1. Scans DATASET_DIR for the images already on disk.
 * 2. Loads all diseases from the database.
 * 3. Builds the list of diseases below TARGET_PER_DISEASE (largest deficit
 *    first, or smallest first with --reverse / -r).
 * 4. Loads the persisted seen-URLs cache.
 * 5. Fills disease classes in batches of DISEASE_CONCURRENCY, flushing the
 *    seen-URLs cache periodically.
 * 6. Fills the healthy class up to TARGET_HEALTHY.
 * Finally prints a summary and closes the database connection.
 */
async function main() {
  const flags = parseFlags();
  console.log("=".repeat(60));
  console.log("TRAINING DATASET FILL — Parallelized gap-filling download");
  if (flags.reverse) console.log(" (reverse order — processing from lowest deficit first)");
  console.log("=".repeat(60));
  // Ensure dataset directory exists
  mkdirSync(DATASET_DIR, { recursive: true });

  // ── Step 1: Scan what we already have ────────────────────────────────────
  console.log("\nScanning existing dataset...");
  const { diseaseCounts, healthyCount } = scanDataset();
  console.log(` Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`);

  // ── Step 2: Load disease info from DB ────────────────────────────────────
  console.log("\nLoading disease info from database...");
  const db = getDb();
  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
    })
    .from(diseases);
  // Build a deduplicated map: disease id → first disease info found
  const diseaseInfo = new Map<string, { name: string; plantId: string }>();
  for (const d of allDiseases) {
    if (!diseaseInfo.has(d.id)) {
      diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId });
    }
  }
  console.log(` Loaded ${diseaseInfo.size} unique diseases from DB`);

  // ── Step 3: Build deficit list ──────────────────────────────────────────
  const deficits: DiseaseInfo[] = [];
  for (const [id, info] of diseaseInfo) {
    const have = diseaseCounts.get(id) ?? 0;
    const needed = TARGET_PER_DISEASE - have;
    if (needed > 0) {
      deficits.push({ id, name: info.name, plantId: info.plantId, have, needed });
    }
  }
  // Sort by deficit size (largest first) so we prioritize the neediest diseases
  deficits.sort((a, b) => b.needed - a.needed);
  // Reverse order if --reverse/-r flag is set (useful to try a different
  // direction when the front of the queue keeps hitting dead URLs)
  if (flags.reverse) deficits.reverse();
  const healthyDeficit = TARGET_HEALTHY - healthyCount;
  console.log(`\n${"=".repeat(60)}`);
  console.log("DEFICIT REPORT");
  console.log(`${"=".repeat(60)}`);
  console.log(` Diseases needing images: ${deficits.length}/${diseaseInfo.size}`);
  console.log(` Total images missing: ${deficits.reduce((s, d) => s + d.needed, 0)}`);
  console.log(` Healthy deficit: ${Math.max(0, healthyDeficit)}`);
  console.log(` Parallelism: ${DISEASE_CONCURRENCY} diseases at once`);
  console.log(` DDG rate limit: ${DDG_RATE_LIMIT_RPS} req/s (shared)`);
  console.log(
    ` Order: ${flags.reverse ? "reverse (--reverse)" : "normal (deficit-first)"}`,
  );
  console.log(`${"=".repeat(60)}`);
  if (deficits.length === 0 && healthyDeficit <= 0) {
    console.log("\n ✓ Nothing to do — all targets met!\n");
    await closeDb();
    return;
  }

  // ── Step 4: Load seen-URLs cache ────────────────────────────────────────
  const seenUrlsCache = loadSeenUrlsCache();
  let totalDownloaded = 0;
  let diseasesProcessed = 0;
  const startTime = Date.now();

  // ── Step 5: Fill disease deficits ───────────────────────────────────────
  if (deficits.length > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`);
    console.log("─".repeat(60));
    // Process in parallel batches
    for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) {
      const batch = deficits.slice(i, i + DISEASE_CONCURRENCY);
      const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
      const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY);
      console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`);
      // Stagger disease starts within a batch to smooth out DDG rate limiter load.
      // Without staggering, every disease of the batch (DISEASE_CONCURRENCY × 9
      // parallel DDG requests) would queue behind the rate limiter at once, giving
      // the first disease a huge head start and the last disease a long tail.
      // Staggering by 200ms each spreads the load more evenly.
      const STAGGER_MS = 200;
      const batchResults = await Promise.allSettled(
        batch.map((d, idx) =>
          (async () => {
            if (idx > 0) await sleep(idx * STAGGER_MS);
            const classDir = resolve(DATASET_DIR, d.id);
            const queries = buildSearchQueries(d.name, d.plantId);
            const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
            console.log(
              ` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
            );
            const gained = await fillClass(d.id, queries, d.needed, classDir, seen);
            // Update seen-URLs cache for this disease
            seenUrlsCache[d.id] = Array.from(seen);
            return gained;
          })(),
        ),
      );
      // Aggregate batch results
      for (const result of batchResults) {
        if (result.status === "fulfilled") {
          totalDownloaded += result.value;
        } else {
          console.error(` ✗ Disease failed: ${result.reason}`);
        }
      }
      diseasesProcessed += batch.length;
      // Flush seen-URLs cache to disk periodically (not after every disease):
      // whenever this batch crossed a multiple of SEEN_CACHE_FLUSH_INTERVAL,
      // and always after the final batch.
      if (
        diseasesProcessed % SEEN_CACHE_FLUSH_INTERVAL < batch.length ||
        i + batch.length >= deficits.length
      ) {
        saveSeenUrlsCache(seenUrlsCache);
      }
      const elapsed = Math.round((Date.now() - startTime) / 1000);
      const rate = diseasesProcessed / Math.max(1, elapsed);
      const remaining = deficits.length - diseasesProcessed;
      const eta = remaining / Math.max(0.01, rate);
      console.log(
        ` [Batch ${batchNum}/${totalBatches}] checkpoint — ` +
          `${totalDownloaded} downloaded, ` +
          `${diseasesProcessed}/${deficits.length} diseases (${rate.toFixed(1)}/s, ` +
          `ETA: ${Math.round(eta)}s)`,
      );
    }
  }

  // ── Step 6: Fill healthy deficit ────────────────────────────────────────
  if (healthyDeficit > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`);
    console.log("─".repeat(60));
    const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
    mkdirSync(healthyDir, { recursive: true });
    // Collect all unique plants from the disease info. The Set must hold the
    // plantId strings: a Set of the info objects would compare by reference
    // and keep one entry per disease, repeating queries for the same plant.
    const allPlants = [...new Set([...diseaseInfo.values()].map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }
    const healthySeen = new Set<string>(seenUrlsCache[HEALTHY_CLASS] ?? []);
    const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir);
    const primaryHealthyQuery = allHealthyQueries[0];
    const allUrls: string[] = [];
    if (primaryHealthyQuery === undefined) {
      // No plants in the database: there is nothing to search for.
      console.log(`\n ✗ No plants known — cannot build healthy-class queries`);
    } else {
      // Run all 3 sources in parallel for the healthy class too
      const settledSources = await Promise.allSettled([
        collectImagesDuckDuckGo(
          allHealthyQueries.slice(0, MAX_HEALTHY_QUERIES),
          healthyNeeded,
          healthySeen,
        ),
        searchImagesInaturalist(primaryHealthyQuery, healthyNeeded, healthySeen),
        searchImagesCommons(primaryHealthyQuery, healthyNeeded, healthySeen),
      ]);
      for (const settled of settledSources) {
        if (settled.status === "fulfilled") {
          allUrls.push(...settled.value.urls);
        }
      }
    }
    if (allUrls.length > 0) {
      console.log(`\n Downloading ${allUrls.length} healthy images...`);
      const startIdx = countImagesInDir(healthyDir);
      const { downloaded } = await downloadBatch(allUrls, healthyDir, startIdx);
      const newTotal = countImagesInDir(healthyDir);
      const gained = newTotal - healthyCount;
      totalDownloaded += gained;
      console.log(
        ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` +
          ` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`,
      );
    } else {
      console.log(`\n ✗ No healthy images found`);
    }
    // Update seen-URLs cache
    seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen);
    saveSeenUrlsCache(seenUrlsCache);
  }

  // ── Summary ──────────────────────────────────────────────────────────────
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);
  // Final scan
  const finalScan = scanDataset();
  const totalHave = [...finalScan.diseaseCounts.values()].reduce((s, c) => s + c, 0);
  const atTarget = [...finalScan.diseaseCounts.values()].filter(
    (c) => c >= TARGET_PER_DISEASE,
  ).length;
  console.log("\n" + "=".repeat(60));
  console.log(" ✅ FILL COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${hrs}h ${mins % 60}m`);
  console.log(` Diseases at target: ${atTarget}/${diseaseInfo.size}`);
  console.log(` Total images: ${totalHave}`);
  console.log(` Healthy images: ${finalScan.healthyCount}/${TARGET_HEALTHY}`);
  console.log(` New downloads: ${totalDownloaded}`);
  console.log(` Dataset dir: ${DATASET_DIR}/`);
  await closeDb();
  console.log("=".repeat(60));
}
// Run the script; any error escaping main() is reported and the process exits with code 1.
main().catch((fatal) => {
  const details = `\n${fatal}`;
  console.error("\nFatal error:", details);
  process.exit(1);
});