#!/usr/bin/env node
/**
 * scrape-training-dataset.ts
 *
 * Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
 *
 * Targets:
 *   - 200 images per disease class (93 diseases)
 *   - 400 images for the "healthy" class
 *   - Full resolution images stored in data/dataset/{class_id}/
 *
 * DuckDuckGo approach (no API key needed):
 *   1. Fetch the main search page to extract a vqd (query) token
 *   2. Use the vqd token to paginate through image results
 *   3. Download each image to the dataset directory
 *
 * Usage:
 *   cd apps/web && npx tsx scripts/scrape-training-dataset.ts
 *
 * Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
 */

import "dotenv/config";
import {
  readFileSync,
  writeFileSync,
  existsSync,
  mkdirSync,
  readdirSync,
  statSync,
  renameSync,
} from "fs";
import { resolve, extname, join } from "path";

// ─── Config ─────────────────────────────────────────────────────────────────

const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");
const DATASET_DIR = resolve(__dirname, "../data/dataset");
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");

/** Target images per disease class */
const TARGET_PER_DISEASE = 200;

/** Target images for the "healthy" class (2× normal) */
const TARGET_HEALTHY = 400;

/** Delay between DuckDuckGo search API calls (ms) */
const SEARCH_DELAY = 1500;

/** Delay between image downloads (ms) */
const DOWNLOAD_DELAY = 300;

/** Max concurrent downloads */
const CONCURRENT_DOWNLOADS = 5;

/** Minimum image size in bytes to accept (reject tiny placeholders) */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

/** How many times a rate-limited (429) search request is retried before giving up */
const MAX_RATE_LIMIT_RETRIES = 5;

/** Wait between rate-limited retries (ms) */
const RATE_LIMIT_BACKOFF = 10_000;

/** Consecutive result pages without a single new URL before a query is considered exhausted */
const MAX_UNPRODUCTIVE_PAGES = 3;

/**
 * Accepted image content types, mapped to the extension the file is saved with.
 * The extension is taken from the served content type (not from the URL) so the
 * file name always matches the bytes on disk.
 */
const EXTENSION_BY_CONTENT_TYPE = new Map<string, string>([
  ["image/jpeg", ".jpg"],
  ["image/jpg", ".jpg"],
  ["image/png", ".png"],
  ["image/webp", ".webp"],
  ["image/gif", ".gif"],
]);

/** User agent for requests */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

// ─── Types ──────────────────────────────────────────────────────────────────

interface DiseaseSeed {
  id: string;
  plantId: string;
  name: string;
  [key: string]: unknown;
}

interface PlantSeed {
  id: string;
  commonName: string;
  scientificName: string;
  [key: string]: unknown;
}

interface DuckDuckGoImageResult {
  image: string;
  title: string;
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}

interface ClassProgress {
  count: number;
  downloaded: number;
  failed: number;
  skipped: number;
  /** URLs we've already seen (to avoid duplicates) */
  seenUrls: string[];
  /** Whether we've exhausted search results */
  exhausted: boolean;
}

interface Progress {
  lastUpdated: string;
  classes: Record<string, ClassProgress>;
}

/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";

// ─── DuckDuckGo API ─────────────────────────────────────────────────────────

/**
 * Extract the vqd token from DuckDuckGo's search page.
 * Required for paginating image results.
 *
 * @throws Error when the page cannot be fetched or contains no token.
 */
async function getVqdToken(query: string): Promise<string> {
  const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const res = await fetch(url, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });

  if (!res.ok) {
    throw new Error(`Failed to get vqd token: ${res.status}`);
  }

  const html = await res.text();

  // Extract vqd token from the HTML
  // Format: vqd='<token>' or vqd="<token>"
  const token = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/)?.[1];
  if (!token) {
    throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
  }

  return token;
}

/**
 * Fetch a page of DuckDuckGo image results.
 *
 * A 429 response is retried after a fixed back-off, at most
 * MAX_RATE_LIMIT_RETRIES times. A 403 is treated as an expired token and yields
 * an empty page.
 *
 * NOTE(review): confirm that `p` is the pagination parameter of `i.js`. If the
 * endpoint ignores it, every request returns the same page and the
 * unproductive-page guard in collectImages ends the query after the first page.
 *
 * @throws Error on any other non-OK status, or when rate limiting persists.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  const encodedQuery = encodeURIComponent(query);
  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${vqd}&o=json&p=${page}&f=,,,`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`,
      },
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
      return data.results ?? [];
    }

    // Release the connection; the error body is not needed.
    await discardBody(res);

    if (res.status === 429 && attempt < MAX_RATE_LIMIT_RETRIES) {
      console.warn(" ⚠ Rate limited (429). Waiting 10s...");
      await sleep(RATE_LIMIT_BACKOFF);
      continue; // Retry
    }
    if (res.status === 403) {
      console.warn(" ⚠ Forbidden (403). Token may have expired.");
      return []; // Token expired — no more pages
    }
    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }
}

/** Returns true when `candidate` parses as an absolute http(s) URL. */
function isHttpUrl(candidate: string): boolean {
  try {
    const { protocol } = new URL(candidate);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
}

/**
 * Search DuckDuckGo images, automatically paginating to collect up to `target` results.
 *
 * @param query    Search phrase.
 * @param target   Maximum number of new URLs to return.
 * @param seenUrls URLs already collected for this class; new URLs are added to it.
 * @returns Unique image URLs not previously in `seenUrls`, and `exhausted`, which is
 *   true only when MAX_UNPRODUCTIVE_PAGES consecutive pages produced no new URL.
 *   Token or search failures end the query without marking it exhausted, so a
 *   transient network problem does not permanently disable a class.
 */
async function collectImages(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const urls: string[] = [];
  if (target <= 0) return { urls, exhausted: false };

  // Get vqd token
  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls, exhausted: false };
  }

  let page = 1;
  let unproductivePages = 0;
  let exhausted = false;

  while (urls.length < target) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(` ⚠ Search error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    let added = 0;
    for (const result of pageResults) {
      if (urls.length >= target) break;

      const imgUrl = result.image || result.url;

      // Malformed entries are skipped instead of aborting the whole run.
      if (!isHttpUrl(imgUrl) || seenUrls.has(imgUrl)) continue;

      seenUrls.add(imgUrl);
      urls.push(imgUrl);
      added++;
    }

    // Empty pages and pages holding only already-seen URLs both count as
    // unproductive; without this the loop could spin forever on repeats.
    if (added === 0) {
      unproductivePages++;
      if (unproductivePages >= MAX_UNPRODUCTIVE_PAGES) {
        exhausted = true;
        break;
      }
    } else {
      unproductivePages = 0;
    }

    page++;
  }

  return { urls, exhausted };
}

// ─── Image Download ─────────────────────────────────────────────────────────

/** Cancel an unread response body so the underlying connection is released (best effort). */
async function discardBody(res: Response): Promise<void> {
  try {
    await res.body?.cancel();
  } catch {
    // Nothing useful to do if cancelling fails.
  }
}

/**
 * Inspect the response status and headers and return the extension to save the
 * image with, or `undefined` when the response must be rejected (non-OK status,
 * unsupported content type, or a declared length outside the size limits).
 */
function acceptableExtension(res: Response): string | undefined {
  if (!res.ok) return undefined;

  const mimeType = (res.headers.get("content-type") ?? "").split(";")[0]?.trim().toLowerCase() ?? "";
  const ext = EXTENSION_BY_CONTENT_TYPE.get(mimeType);
  if (ext === undefined) return undefined;

  // Content-Length is optional; only enforce the limits when it is present.
  const declaredLength = Number.parseInt(res.headers.get("content-length") ?? "", 10);
  if (Number.isFinite(declaredLength) && declaredLength > 0) {
    if (declaredLength < MIN_IMAGE_SIZE || declaredLength > MAX_IMAGE_SIZE) return undefined;
  }

  return ext;
}

/**
 * Download a single image from a URL to the target path.
 *
 * The extension of `destPath` is replaced by the one matching the served content
 * type, so the written file may be named differently from `destPath`.
 *
 * @returns true if the image was written, false otherwise (never throws).
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  let buffer: Buffer;
  let ext: string;

  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
      signal: AbortSignal.timeout(15_000),
    });

    const accepted = acceptableExtension(res);
    if (accepted === undefined) {
      await discardBody(res);
      return false;
    }
    ext = accepted;

    buffer = Buffer.from(await res.arrayBuffer());
  } catch {
    // Network errors and timeouts are expected while scraping; count as a failed download.
    return false;
  }

  // Double-check actual buffer size
  if (buffer.length < MIN_IMAGE_SIZE || buffer.length > MAX_IMAGE_SIZE) return false;

  const filePath = destPath.slice(0, destPath.length - extname(destPath).length) + ext;
  try {
    writeFileSync(filePath, buffer);
    return true;
  } catch (err) {
    // Disk problems are not routine — surface them instead of hiding them as a failed download.
    console.warn(` ⚠ Could not write ${filePath}: ${err instanceof Error ? err.message : "unknown"}`);
    return false;
  }
}

/**
 * Return the first image index that is safe to use in `classDir`: at least
 * `minimum`, and greater than the index of every existing `img_NNNN.*` file.
 */
function nextFreeIndex(classDir: string, minimum: number): number {
  let next = minimum;
  for (const name of readdirSync(classDir)) {
    const digits = /^img_(\d+)\./.exec(name)?.[1];
    if (digits !== undefined) {
      next = Math.max(next, Number.parseInt(digits, 10) + 1);
    }
  }
  return next;
}

/**
 * Download multiple images concurrently, respecting a per-download delay.
 *
 * Every URL gets its own file index, assigned before the download starts, so
 * concurrent downloads never share a file name. Numbering starts after any
 * image already present in `classDir`, so resumed runs do not overwrite files.
 *
 * @param urls       Image URLs to fetch.
 * @param classDir   Destination directory (created if missing).
 * @param startIndex Lowest file index to use.
 * @returns Success/failure counts and `lastIndex`, the index following the last one assigned.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  mkdirSync(classDir, { recursive: true });
  const firstIndex = nextFreeIndex(classDir, startIndex);

  let downloaded = 0;
  let failed = 0;

  // Process in chunks to control concurrency
  for (let offset = 0; offset < urls.length; offset += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(offset, offset + CONCURRENT_DOWNLOADS);

    const outcomes = await Promise.all(
      chunk.map(async (url, position) => {
        const paddedIndex = String(firstIndex + offset + position).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
        await sleep(DOWNLOAD_DELAY);
        return success;
      }),
    );

    for (const success of outcomes) {
      if (success) downloaded++;
      else failed++;
    }
  }

  return { downloaded, failed, lastIndex: firstIndex + urls.length };
}

// ─── Progress Tracking ──────────────────────────────────────────────────────

/** Load the progress file, or start with empty progress when none exists. */
function loadProgress(): Progress {
  if (!existsSync(PROGRESS_FILE)) {
    return { lastUpdated: new Date().toISOString(), classes: {} };
  }
  const parsed = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
  return {
    lastUpdated: parsed.lastUpdated ?? new Date().toISOString(),
    classes: parsed.classes ?? {},
  };
}

/**
 * Persist progress. The data is written to a temporary file and renamed into
 * place so an interrupt during the write cannot leave a truncated progress file.
 */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();
  const tmpFile = `${PROGRESS_FILE}.tmp`;
  writeFileSync(tmpFile, JSON.stringify(progress, null, 2));
  renameSync(tmpFile, PROGRESS_FILE);
}

/** Return the progress entry for `classId`, creating an empty one on first use. */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  const existing = progress.classes[classId];
  if (existing) return existing;

  const created: ClassProgress = {
    count: 0,
    downloaded: 0,
    failed: 0,
    skipped: 0,
    seenUrls: [],
    exhausted: false,
  };
  progress.classes[classId] = created;
  return created;
}

// ─── Search Query Building ──────────────────────────────────────────────────

/** Build the search phrases for a disease; falls back to the plant id when the plant is unknown. */
function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
  const name = disease.name;
  const plantName = plant?.commonName || disease.plantId;

  return [
    `${name} ${plantName} leaf disease`,
    `${plantName} ${name} symptoms`,
    `${name} plant disease`,
    `${plantName} diseased leaf`,
  ];
}

/** Build the search phrases used to find healthy specimens of a plant. */
function buildHealthyQueries(plant: PlantSeed): string[] {
  return [
    `healthy ${plant.commonName} leaf`,
    `${plant.commonName} leaf closeup`,
    `healthy ${plant.commonName} plant`,
    `${plant.commonName} foliage`,
  ];
}

// ─── Dataset Collection ─────────────────────────────────────────────────────

/**
 * Collect and download images for one class until `target` images exist.
 *
 * Only the images still missing (`target` minus the recorded count) are
 * requested. The class is marked exhausted only when every query was tried,
 * each one ran out of new results, and the target was still not reached.
 * Progress is saved after the class is processed.
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
): Promise<void> {
  const cp = getClassProgress(progress, classId);

  if (cp.count >= target) {
    console.log(` ✓ Already have ${cp.count}/${target} images`);
    return;
  }

  if (cp.exhausted) {
    console.log(` ✓ Already exhausted search results (${cp.count}/${target} images)`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  const seenUrls = new Set<string>(cp.seenUrls);
  const needed = target - cp.count;
  const pendingUrls: string[] = [];
  let exhaustedQueries = 0;

  // Search with each query until we hit the target
  for (const query of queries) {
    if (pendingUrls.length >= needed) break;

    console.log(` Searching: "${query}"...`);
    const result = await collectImages(query, needed - pendingUrls.length, seenUrls);
    pendingUrls.push(...result.urls);
    if (result.exhausted) exhaustedQueries++;
  }

  cp.seenUrls = Array.from(seenUrls);
  const exhausted = pendingUrls.length < needed && exhaustedQueries === queries.length;

  if (pendingUrls.length === 0) {
    cp.exhausted = exhausted;
    saveProgress(progress);
    console.log(` ✗ No images found for "${classId}"`);
    return;
  }

  console.log(` Found ${pendingUrls.length} unique image URLs. Downloading...`);

  // Download the images
  const { downloaded, failed } = await downloadBatch(pendingUrls, classDir, cp.count);

  cp.count += downloaded;
  cp.downloaded += downloaded;
  cp.failed += failed;
  cp.exhausted = exhausted;

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}

// ─── Main ───────────────────────────────────────────────────────────────────

/** Entry point: collect every disease class, then the healthy class, then print a summary. */
async function main(): Promise<void> {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR");
  console.log("=".repeat(60));

  // Load knowledge base
  const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
  const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];
  const plantMap = new Map<string, PlantSeed>();
  for (const p of plants) {
    plantMap.set(p.id, p);
  }

  console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
  console.log(
    `Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
  );
  console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
  console.log(`Output: ${DATASET_DIR}/`);
  console.log("");

  // Load progress
  mkdirSync(DATASET_DIR, { recursive: true });
  const progress = loadProgress();

  const startTime = Date.now();

  // ── Phase 1: Disease classes ──────────────────────────────────────────────
  console.log("─".repeat(60));
  console.log("PHASE 1: Disease Images");
  console.log("─".repeat(60));

  for (const [i, disease] of diseases.entries()) {
    const plant = plantMap.get(disease.plantId) ?? null;
    const classDir = resolve(DATASET_DIR, disease.id);
    const queries = buildSearchQueries(disease, plant);

    const pct = Math.round((i / diseases.length) * 100);
    console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);

    await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
  }

  // ── Phase 2: Healthy class ────────────────────────────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log("PHASE 2: Healthy Plant Images");
  console.log("─".repeat(60));

  // The healthy class goes through the same path as disease classes, so its
  // directory is created and its progress is tracked identically.
  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const allHealthyQueries = plants.flatMap((plant) => buildHealthyQueries(plant));
  await collectClassImages(HEALTHY_CLASS, allHealthyQueries, TARGET_HEALTHY, progress, healthyDir);

  // ── Summary ────────────────────────────────────────────────────────────────
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const secs = elapsed % 60;

  let totalDownloaded = 0;
  let totalFailed = 0;
  let totalTarget = 0;

  for (const [classId, cp] of Object.entries(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
    totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
  }

  const totalSize = await getDatasetSize();
  const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);

  console.log("\n" + "=".repeat(60));
  console.log("COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${mins}m ${secs}s`);
  console.log(` Downloaded: ${totalDownloaded} images`);
  console.log(` Failed: ${totalFailed} images`);
  console.log(` Target: ${totalTarget} images`);
  console.log(` Dataset size: ${sizeGb} GB`);
  console.log(` Dataset location: ${DATASET_DIR}/`);
  console.log("");
  console.log("Next steps:");
  console.log(" 1. Run the fine-tuning script to train on this dataset");
  console.log(" 2. The fine-tuning script will resize to 160×160 and augment");
  console.log("=".repeat(60));
}

/**
 * Calculate total size of the dataset directory.
 * Only non-hidden sub-directories of DATASET_DIR are counted.
 */
async function getDatasetSize(): Promise<number> {
  if (!existsSync(DATASET_DIR)) return 0;

  let total = 0;
  for (const entry of readdirSync(DATASET_DIR, { withFileTypes: true })) {
    if (entry.name.startsWith(".") || !entry.isDirectory()) continue;
    total += dirSize(resolve(DATASET_DIR, entry.name));
  }
  return total;
}

/** Recursively sum the sizes of all files under `dirPath`; unreadable directories count as 0. */
function dirSize(dirPath: string): number {
  let total = 0;
  try {
    for (const entry of readdirSync(dirPath, { withFileTypes: true })) {
      const fullPath = join(dirPath, entry.name);
      if (entry.isFile()) {
        total += statSync(fullPath).size;
      } else if (entry.isDirectory()) {
        total += dirSize(fullPath);
      }
    }
  } catch {
    // skip errors
  }
  return total;
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((done) => setTimeout(done, ms));
}

main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});