This commit is contained in:
2026-06-06 17:02:45 -04:00
parent 47609e5e42
commit db4c656730
22 changed files with 6195 additions and 326 deletions

View File

@@ -2,59 +2,113 @@
/**
* scrape-training-dataset.ts
*
* Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
* Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
*
* Targets:
* - 200 images per disease class (93 diseases)
* - 400 images for the "healthy" class
* - Full resolution images stored in data/dataset/{class_id}/
* Targets (tiered by plant type):
* - Core plants (houseplants + common garden): 100 images per disease
* - Full set (all 11,498 DB diseases): 10 images per disease
* - Healthy: 400 images
*
* DuckDuckGo approach (no API key needed):
* 1. Fetch the main search page to extract a vqd (query) token
* 2. Use the vqd token to paginate through image results
* 3. Download each image to the dataset directory
* Sources (all free, no API keys):
* 1. DB image_url — existing images already found
* 2. DuckDuckGo — general web image search
* 3. iNaturalist — real-world plant observation photos
* 4. Wikimedia Commons — curated scientific/educational images
*
* Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
*
* Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
* Progress: data/dataset/.progress.json — interrupt and resume safely.
*/
import "dotenv/config";
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
import { resolve, extname, join } from "path";
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, extname } from "path";
// Load .env.development for DB creds.
// Best-effort: a missing file is fine (variables may already be set by the
// environment or by dotenv), but any other read failure is reported so a
// permissions or encoding problem does not silently leave the DB unconfigured.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    // Skip blank lines and comments.
    if (!trimmed || trimmed.startsWith("#")) continue;

    const eqIdx = trimmed.indexOf("=");
    // Require a non-empty key before the first "=".
    if (eqIdx <= 0) continue;

    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();

    // KEY="value" / KEY='value': drop the surrounding quotes so the value
    // matches what dotenv-style loaders would produce.
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    // Never override a value that is already present in the environment.
    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  const code =
    typeof err === "object" && err !== null && "code" in err
      ? (err as { code?: unknown }).code
      : undefined;
  if (code !== "ENOENT") {
    console.warn(
      `Could not load ${envPath}: ${err instanceof Error ? err.message : "unknown error"}`,
    );
  }
}
import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
// ─── Config ─────────────────────────────────────────────────────────────────
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");
const DATASET_DIR = resolve(__dirname, "../data/dataset");
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");
/** Target images per disease class */
const TARGET_PER_DISEASE = 200;
/** Target images per disease for CORE plants */
const TARGET_CORE = 100;
/** Target images for the "healthy" class (2× normal) */
/** Target images per disease for the FULL set */
const TARGET_FULL = 10;
/** Target images for the "healthy" class */
const TARGET_HEALTHY = 400;
/**
 * Plant ids that receive the higher per-disease image target.
 *
 * main() splits the diseases loaded from the DB by testing `plantId` against
 * this set: members are collected with TARGET_CORE, everything else with
 * TARGET_FULL. Entries must therefore match the `plantId` values exactly
 * (lower-case, hyphen-separated).
 */
const CORE_PLANTS = new Set([
// Houseplants
"monstera",
"pothos",
"snake-plant",
"peace-lily",
"orchid",
"succulent",
"fiddle-leaf-fig",
"aloe-vera",
"cactus",
"fern",
// Garden plants
"tomato",
"basil",
"rose",
"pepper",
"strawberry",
"cucumber",
"squash",
"lettuce",
"spinach",
"cabbage",
"lavender",
"mint",
"jasmine",
"sunflower",
"daisy",
"zucchini",
"bean",
"eggplant",
"chili",
// General disease patterns
// NOTE(review): presumably the plantId used for plant-agnostic diseases — confirm against the DB.
"general",
]);
/** Delay between DuckDuckGo search API calls (ms) */
const SEARCH_DELAY = 1500;
/** Delay between image downloads (ms) */
const DOWNLOAD_DELAY = 300;
const DOWNLOAD_DELAY = 100;
/** Max concurrent downloads */
const CONCURRENT_DOWNLOADS = 5;
const CONCURRENT_DOWNLOADS = 10;
/** Minimum image size in bytes to accept (reject tiny placeholders) */
/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB
/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB
/** Allowed image content types */
const ALLOWED_CONTENT_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/webp", "image/gif"];
/** Allowed file extensions */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
@@ -62,22 +116,16 @@ const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
const UA =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";
// ─── Types ──────────────────────────────────────────────────────────────────
interface DiseaseSeed {
interface DbDisease {
id: string;
plantId: string;
name: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[key: string]: any;
}
interface PlantSeed {
id: string;
commonName: string;
scientificName: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[key: string]: any;
imageUrl: string | null;
}
interface DuckDuckGoImageResult {
@@ -93,10 +141,7 @@ interface ClassProgress {
count: number;
downloaded: number;
failed: number;
skipped: number;
/** URLs we've already seen (to avoid duplicates) */
seenUrls: string[];
/** Whether we've exhausted search results */
exhausted: boolean;
}
@@ -105,15 +150,27 @@ interface Progress {
classes: Record<string, ClassProgress>;
}
/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";
// ─── DB Loading ──────────────────────────────────────────────────────────────
/**
 * Read every disease row from the database, sorted by id.
 *
 * Only the columns the scraper uses are selected: `id`, `plantId`, `name`
 * and the already-known `imageUrl` (which may be null).
 *
 * @returns All disease rows in ascending id order.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const columns = {
    id: diseases.id,
    plantId: diseases.plantId,
    name: diseases.name,
    imageUrl: diseases.imageUrl,
  };
  return await getDb().select(columns).from(diseases).orderBy(diseases.id);
}
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
/**
* Extract the vqd token from DuckDuckGo's search page.
* Required for paginating image results.
*/
async function getVqdToken(query: string): Promise<string> {
const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
@@ -122,25 +179,15 @@ async function getVqdToken(query: string): Promise<string> {
signal: AbortSignal.timeout(15_000),
});
if (!res.ok) {
throw new Error(`Failed to get vqd token: ${res.status}`);
}
if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);
const html = await res.text();
// Extract vqd token from the HTML
// Format: vqd='<token>' or vqd="<token>"
const match = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/);
if (!match) {
throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
}
if (!match) throw new Error(`Could not extract vqd token for "${query}"`);
return match[1];
}
/**
* Fetch a page of DuckDuckGo image results.
*/
async function searchImagesDuckDuckGo(
query: string,
vqd: string,
@@ -161,12 +208,9 @@ async function searchImagesDuckDuckGo(
if (res.status === 429) {
console.warn(" ⚠ Rate limited (429). Waiting 10s...");
await sleep(10_000);
return searchImagesDuckDuckGo(query, vqd, page); // Retry
}
if (res.status === 403) {
console.warn(" ⚠ Forbidden (403). Token may have expired.");
return []; // Token expired — no more pages
return searchImagesDuckDuckGo(query, vqd, page);
}
if (res.status === 403) return [];
throw new Error(`DuckDuckGo search failed: ${res.status}`);
}
@@ -174,11 +218,7 @@ async function searchImagesDuckDuckGo(
return data.results ?? [];
}
/**
* Search DuckDuckGo images, automatically paginating to collect up to `target` results.
* Returns unique image URLs.
*/
async function collectImages(
async function collectImagesDuckDuckGo(
query: string,
target: number,
seenUrls: Set<string>,
@@ -188,27 +228,29 @@ async function collectImages(
let exhausted = false;
let consecutiveEmpty = 0;
// Get vqd token
let vqd: string;
try {
vqd = await getVqdToken(query);
} catch (err) {
console.warn(`Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
console.warn(`DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
return { urls: [], exhausted: true };
}
while (results.length < target) {
const MAX_PAGES = 5;
let lowNoveltyCount = 0;
while (results.length < target && page <= MAX_PAGES) {
await sleep(SEARCH_DELAY);
let pageResults: DuckDuckGoImageResult[];
try {
pageResults = await searchImagesDuckDuckGo(query, vqd, page);
} catch (err) {
console.warn(`Search error: ${err instanceof Error ? err.message : "unknown"}`);
console.warn(`DDG error: ${err instanceof Error ? err.message : "unknown"}`);
break;
}
if (pageResults.length === 0) {
if (!pageResults || pageResults.length === 0) {
consecutiveEmpty++;
if (consecutiveEmpty >= 3) {
exhausted = true;
@@ -223,78 +265,160 @@ async function collectImages(
for (const r of pageResults) {
if (results.length >= target) break;
const imgUrl = r.image || r.url;
// Skip if we've already seen this URL
if (!imgUrl || typeof imgUrl !== "string") continue;
if (seenUrls.has(imgUrl)) continue;
// Validate URL looks like an image
const ext = extname(new URL(imgUrl).pathname).toLowerCase();
if (!ALLOWED_EXTENSIONS.includes(ext) && !ext) {
// No extension - still try, could be a CDN URL
try {
new URL(imgUrl);
} catch {
continue;
}
seenUrls.add(imgUrl);
results.push(imgUrl);
newCount++;
}
if (newCount === 0 && pageResults.every((r) => seenUrls.has(r.image || r.url))) {
// All results on this page were already seen
page++;
continue;
const newRatio = newCount / pageResults.length;
if (newRatio < 0.05) {
lowNoveltyCount++;
if (lowNoveltyCount >= 2) break;
} else {
lowNoveltyCount = 0;
}
if (results.length < target) {
page++;
}
if (results.length < target) page++;
}
return { urls: results.slice(0, target), exhausted };
}
// ─── iNaturalist API ─────────────────────────────────────────────────────────
/**
 * Search iNaturalist research-grade observations for photos matching `query`.
 *
 * Issues a single request (up to 200 observations) and collects photo URLs
 * rewritten to their full-resolution ("original") variant.
 *
 * @param query    Free-text search string.
 * @param target   Maximum number of new URLs to return.
 * @param seenUrls URLs already collected; every returned URL is added to it.
 * @returns `urls` — new full-resolution URLs (at most `target`);
 *          `exhausted` — true when the request succeeded but yielded fewer
 *          than `target` new URLs. Failures report `exhausted: false` so the
 *          source is retried on a later run.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];

  // Nothing requested: skip the network round-trip (and avoid per_page <= 0).
  if (target <= 0) return { urls: results, exhausted: false };

  // The API's photo `url` points at a downsized variant (typically
  // ".../square.jpg"). Match whichever size name is in the final path segment
  // so it can be swapped for "original"; thumbnails would otherwise be
  // rejected later by the minimum-size check.
  const sizeVariant = /\/(?:square|thumb|small|medium|large)(\.[A-Za-z0-9]+)(\?.*)?$/;

  const perPage = Math.min(target, 200);
  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results: Array<{ photos: Array<{ url: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        const url = photo.url;
        if (!url) continue;

        const fullUrl = url.replace(sizeVariant, "/original$1$2");
        // Deduplicate on the URL that is actually stored and downloaded;
        // checking the raw URL would never match an earlier entry.
        if (seenUrls.has(fullUrl)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }
    return { urls: results, exhausted: results.length < target };
  } catch (err: unknown) {
    console.warn(`iNat error: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: results, exhausted: false };
  }
}
// ─── Wikimedia Commons API ──────────────────────────────────────────────────
/**
 * Search the Wikimedia Commons File namespace for images matching `query`.
 *
 * Pages through `list=search` results 50 at a time and turns each raster
 * image title into a `Special:FilePath` URL.
 *
 * @param query    Free-text search string.
 * @param target   Maximum number of new URLs to return.
 * @param seenUrls URLs already collected; every returned URL is added to it.
 * @returns `urls` — new URLs (at most `target`); `exhausted` — true when
 *          fewer than `target` new URLs were found.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];

  // The File namespace also holds PDFs, SVGs, audio and video. Keep only the
  // raster formats the dataset accepts so non-images are never queued.
  const rasterFile = /\.(?:jpe?g|png|webp)$/i;

  // Upper bound on API requests per query so a long run of filtered or
  // already-seen hits cannot page indefinitely.
  const MAX_PAGES = 20;

  let sroffset = 0;

  for (let page = 0; page < MAX_PAGES && results.length < target; page++) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
      origin: "*", // server-side API call
    });
    const url = `https://commons.wikimedia.org/w/api.php?${params.toString()}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
        if (!rasterFile.test(filename)) continue;

        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // No continuation offset means this was the last page; stop instead of
      // issuing one more request just to receive an empty result.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined) break;
      sroffset = nextOffset;
    } catch (err: unknown) {
      console.warn(`Commons error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}
// ─── Image Download ─────────────────────────────────────────────────────────
/**
* Download a single image from a URL to the target path.
* Returns true if successful, false otherwise.
*/
async function downloadImage(url: string, destPath: string): Promise<boolean> {
try {
const res = await fetch(url, {
headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
signal: AbortSignal.timeout(15_000),
});
if (!res.ok) return false;
const contentType = res.headers.get("content-type") || "";
const contentLength = parseInt(res.headers.get("content-length") || "0", 10);
// Validate content type
if (!ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t))) {
return false;
}
// Validate size
if (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) return false;
if (contentLength > MAX_IMAGE_SIZE) return false;
if (contentType.includes("text/html")) return false;
const buffer = Buffer.from(await res.arrayBuffer());
// Double-check actual buffer size
if (buffer.length < MIN_IMAGE_SIZE) return false;
if (buffer.length > MAX_IMAGE_SIZE) return false;
// Determine correct extension from content type or URL
let ext = extname(new URL(url).pathname).toLowerCase();
if (!ALLOWED_EXTENSIONS.includes(ext)) {
// Map from content type
if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
else if (contentType.includes("png")) ext = ".png";
else if (contentType.includes("webp")) ext = ".webp";
else ext = ".jpg"; // Default
else ext = ".jpg";
}
const filePath = destPath.replace(/\.\w+$/, ext);
@@ -305,9 +429,6 @@ async function downloadImage(url: string, destPath: string): Promise<boolean> {
}
}
/**
* Download multiple images concurrently, respecting a per-download delay.
*/
async function downloadBatch(
urls: string[],
classDir: string,
@@ -317,7 +438,6 @@ async function downloadBatch(
let failed = 0;
let index = startIndex;
// Process in chunks to control concurrency
for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
@@ -325,16 +445,23 @@ async function downloadBatch(
chunk.map(async (url) => {
const paddedIndex = String(index).padStart(4, "0");
const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
const success = await downloadImage(url, destPath);
await sleep(DOWNLOAD_DELAY);
return { success, index: index++ };
return { success, index: index++, url: url.substring(0, 50) };
}),
);
for (const r of results) {
if (r.success) downloaded++;
else failed++;
else {
failed++;
if (failed % 20 === 1) console.log(` ⚠ Failed: ${r.url}...`);
}
}
const total = downloaded + failed;
if (total % 30 === 0 || total === urls.length) {
console.log(` Progress: ${downloaded}/${urls.length} (${failed} failed)`);
}
}
@@ -361,7 +488,6 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
count: 0,
downloaded: 0,
failed: 0,
skipped: 0,
seenUrls: [],
exhausted: false,
};
@@ -369,26 +495,22 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
return progress.classes[classId];
}
// ─── Search Query Building ──────────────────────────────────────────────────
// ─── Query Building ─────────────────────────────────────────────────────────
function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
const name = disease.name;
const plantName = plant?.commonName || disease.plantId;
return [
`${name} ${plantName} leaf disease`,
`${plantName} ${name} symptoms`,
`${name} plant disease`,
`${plantName} diseased leaf`,
];
function buildSearchQueries(disease: DbDisease): string[] {
const name = disease.name || disease.id.replace(/-/g, " ");
const plant = disease.plantId.replace(/-/g, " ");
// Every query keeps the disease NAME to avoid noisy labels
return [`${name} ${plant} leaf disease`, `${plant} ${name} symptoms`, `${name} ${plant}`];
}
function buildHealthyQueries(plant: PlantSeed): string[] {
function buildHealthyQueries(plant: string): string[] {
const name = plant.replace(/-/g, " ");
return [
`healthy ${plant.commonName} leaf`,
`${plant.commonName} leaf closeup`,
`healthy ${plant.commonName} plant`,
`${plant.commonName} foliage`,
`healthy ${name} leaf`,
`${name} leaf closeup`,
`healthy ${name} plant`,
`${name} foliage`,
];
}
@@ -400,64 +522,97 @@ async function collectClassImages(
target: number,
progress: Progress,
classDir: string,
existingUrls: string[] = [],
fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
): Promise<void> {
const cp = getClassProgress(progress, classId);
const seenUrls = new Set(cp.seenUrls);
if (cp.count >= target) {
console.log(` ✓ Already have ${cp.count}/${target} images`);
console.log(` ✓ Already have ${cp.count}/${target}`);
return;
}
if (cp.exhausted) {
console.log(`Already exhausted search results (${cp.count}/${target} images)`);
console.log(`Exhausted (${cp.count}/${target})`);
return;
}
mkdirSync(classDir, { recursive: true });
const totalUrls: string[] = [];
const allUrls: string[] = [];
let exhausted = false;
// Search with each query until we hit the target
for (const query of queries) {
if (totalUrls.length >= target) break;
console.log(` Searching: "${query}"...`);
const result = await collectImages(query, target - totalUrls.length, seenUrls);
totalUrls.push(...result.urls);
cp.seenUrls = Array.from(seenUrls);
if (result.exhausted) {
exhausted = true;
// ── Source 0: Existing DB URLs ──────────────────────────────────────────
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
if (freshDbUrls.length > 0) {
console.log(` DB: ${freshDbUrls.length} existing URLs`);
for (const url of freshDbUrls) {
if (allUrls.length >= target) break;
seenUrls.add(url);
allUrls.push(url);
}
if (totalUrls.length >= target) break;
}
if (totalUrls.length === 0) {
// ── Source 1: DuckDuckGo ──────────────────────────────────────────────
// Skip DDG in fast mode (full set — DDG is slowest source)
if (!fastMode && allUrls.length < target) {
for (const query of queries) {
if (allUrls.length >= target) break;
process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
console.log(`${result.urls.length} new`);
if (allUrls.length >= target) break;
}
}
// ── Source 2: iNaturalist ──────────────────────────────────────────────
if (allUrls.length < target) {
const primaryQuery = queries[0];
console.log(` iNat: Searching...`);
const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
console.log(` iNat: ${result.urls.length} images`);
}
// ── Source 3: Wikimedia Commons ────────────────────────────────────────
if (allUrls.length < target) {
const primaryQuery = queries[0];
console.log(` Commons: Searching...`);
const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
console.log(` Commons: ${result.urls.length} images`);
}
if (allUrls.length === 0) {
cp.exhausted = exhausted;
saveProgress(progress);
console.log(` ✗ No images found for "${classId}"`);
console.log(` ✗ No images found`);
return;
}
console.log(` Found ${totalUrls.length} unique image URLs. Downloading...`);
// Save progress with seen URLs BEFORE downloading
cp.seenUrls = Array.from(seenUrls);
cp.exhausted = exhausted;
saveProgress(progress);
// Download the images
const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);
console.log(` Downloading ${allUrls.length} images...`);
const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);
cp.count += downloaded;
cp.downloaded += downloaded;
cp.failed += failed;
cp.exhausted = exhausted;
saveProgress(progress);
const pct = Math.round((cp.count / target) * 100);
console.log(
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
);
}
@@ -465,25 +620,18 @@ async function collectClassImages(
async function main() {
console.log("=".repeat(60));
console.log("PLANT DISEASE DATASET COLLECTOR");
console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
console.log("=".repeat(60));
// Load knowledge base
const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];
// Load diseases from DB
console.log("\nLoading diseases from database...");
const dbDiseases = await loadDiseasesFromDb();
console.log(` ${dbDiseases.length} diseases loaded`);
const plantMap = new Map<string, PlantSeed>();
for (const p of plants) {
plantMap.set(p.id, p);
}
console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
console.log(
`Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
);
console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
console.log(`Output: ${DATASET_DIR}/`);
console.log("");
const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
console.log(` Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
console.log(` Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);
// Load progress
mkdirSync(DATASET_DIR, { recursive: true });
@@ -491,28 +639,46 @@ async function main() {
const startTime = Date.now();
// ── Phase 1: Disease classes ──────────────────────────────────────────────
console.log("─".repeat(60));
console.log("PHASE 1: Disease Images");
console.log("─".repeat(60));
for (let i = 0; i < diseases.length; i++) {
const disease = diseases[i];
const plant = plantMap.get(disease.plantId) ?? null;
const classDir = resolve(DATASET_DIR, disease.id);
const queries = buildSearchQueries(disease, plant);
const pct = Math.round((i / diseases.length) * 100);
console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);
await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
}
// ── Phase 2: Healthy class ────────────────────────────────────────────────
// ── Phase 1: Core set ──────────────────────────────────────────────────
console.log("\n" + "─".repeat(60));
console.log("PHASE 2: Healthy Plant Images");
console.log("PHASE 1: Core Diseases (100 images each)");
console.log("─".repeat(60));
for (let i = 0; i < coreDiseases.length; i++) {
const d = coreDiseases[i];
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d);
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
const pct = Math.round((i / coreDiseases.length) * 100);
console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
}
// ── Phase 2: Full set ──────────────────────────────────────────────────
console.log("\n" + "─".repeat(60));
console.log("PHASE 2: Full Disease Set (10 images each)");
console.log("─".repeat(60));
for (let i = 0; i < fullDiseases.length; i++) {
const d = fullDiseases[i];
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d);
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
const pct = Math.round((i / fullDiseases.length) * 100);
console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
}
// ── Phase 3: Healthy class ──────────────────────────────────────────────
console.log("\n" + "─".repeat(60));
console.log("PHASE 3: Healthy Plant Images");
console.log("─".repeat(60));
const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
@@ -520,39 +686,50 @@ async function main() {
const healthySeen = new Set(healthyCp.seenUrls);
if (healthyCp.count >= TARGET_HEALTHY) {
console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY} healthy images`);
console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
} else {
// Build a pool of healthy plant queries
// Collect all unique plants
const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
const allHealthyQueries: string[] = [];
for (const plant of plants) {
for (const plant of allPlants) {
allHealthyQueries.push(...buildHealthyQueries(plant));
}
const healthySources = [
{ name: "DDG", collector: collectImagesDuckDuckGo },
{ name: "iNat", collector: searchImagesInaturalist },
{ name: "Commons", collector: searchImagesCommons },
] as const;
const totalHealthyUrls: string[] = [];
let healthyExhausted = false;
let anyRemaining = false;
for (const query of allHealthyQueries) {
for (const source of healthySources) {
if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
if (healthyExhausted) break;
console.log(`\n Source: ${source.name}`);
console.log(`\n Searching: "${query}"...`);
const result = await collectImages(
query,
TARGET_HEALTHY - totalHealthyUrls.length,
healthySeen,
);
for (const query of allHealthyQueries.slice(0, 20)) {
if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
totalHealthyUrls.push(...result.urls);
if (result.exhausted) {
healthyExhausted = true;
process.stdout.write(` "${query}"... `);
const result = await source.collector(
query,
TARGET_HEALTHY - totalHealthyUrls.length,
healthySeen,
);
totalHealthyUrls.push(...result.urls);
if (!result.exhausted) anyRemaining = true;
console.log(`${result.urls.length} new`);
}
}
healthyCp.seenUrls = Array.from(healthySeen);
if (totalHealthyUrls.length > 0) {
console.log(`\n Found ${totalHealthyUrls.length} healthy image URLs. Downloading...`);
healthyCp.exhausted = !anyRemaining;
saveProgress(progress);
console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
const { downloaded, failed } = await downloadBatch(
totalHealthyUrls,
healthyDir,
@@ -562,14 +739,12 @@ async function main() {
healthyCp.count += downloaded;
healthyCp.downloaded += downloaded;
healthyCp.failed += failed;
healthyCp.exhausted = healthyExhausted;
const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
console.log(
` Got ${downloaded} images (${failed} failed). Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
);
} else {
healthyCp.exhausted = true;
console.log(` ✗ No healthy images found`);
}
@@ -580,76 +755,27 @@ async function main() {
const elapsed = Math.round((Date.now() - startTime) / 1000);
const mins = Math.floor(elapsed / 60);
const secs = elapsed % 60;
const hrs = Math.floor(mins / 60);
let totalDownloaded = 0;
let totalFailed = 0;
let totalTarget = 0;
for (const [classId, cp] of Object.entries(progress.classes)) {
for (const [, cp] of Object.entries(progress.classes)) {
totalDownloaded += cp.downloaded || 0;
totalFailed += cp.failed || 0;
totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
}
const totalSize = await getDatasetSize();
const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);
console.log("\n" + "=".repeat(60));
console.log("COMPLETE");
console.log("=".repeat(60));
console.log(` Time: ${mins}m ${secs}s`);
console.log(` Time: ${hrs}h ${mins % 60}m`);
console.log(` Downloaded: ${totalDownloaded} images`);
console.log(` Failed: ${totalFailed} images`);
console.log(` Target: ${totalTarget} images`);
console.log(` Dataset size: ${sizeGb} GB`);
console.log(` Dataset location: ${DATASET_DIR}/`);
console.log("");
console.log("Next steps:");
console.log(" 1. Run the fine-tuning script to train on this dataset");
console.log(" 2. The fine-tuning script will resize to 160×160 and augment");
console.log(` Dataset: ${DATASET_DIR}/`);
await closeDb();
console.log("=".repeat(60));
}
/**
 * Total on-disk size, in bytes, of the dataset directory.
 *
 * Only non-hidden subdirectories directly under DATASET_DIR are counted
 * (entries whose name starts with "." and top-level files are ignored).
 * Returns 0 when DATASET_DIR does not exist.
 */
async function getDatasetSize(): Promise<number> {
  if (!existsSync(DATASET_DIR)) return 0;

  return readdirSync(DATASET_DIR, { withFileTypes: true })
    .filter((item) => !item.name.startsWith(".") && item.isDirectory())
    .map((item) => dirSize(resolve(DATASET_DIR, item.name)))
    .reduce((sum, bytes) => sum + bytes, 0);
}
/**
 * Recursively sum the sizes (in bytes) of all regular files under `dirPath`.
 *
 * Best-effort: if listing the directory or stat-ing a file throws, the walk
 * of this directory stops and whatever was counted so far is returned.
 */
function dirSize(dirPath: string): number {
  let bytes = 0;
  try {
    for (const child of readdirSync(dirPath, { withFileTypes: true })) {
      const childPath = join(dirPath, child.name);
      if (child.isDirectory()) {
        bytes += dirSize(childPath);
        continue;
      }
      if (child.isFile()) {
        bytes += statSync(childPath).size;
      }
    }
  } catch {
    // skip errors
  }
  return bytes;
}
/** Return a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}