Files
plant-disease-id/apps/web/scripts/fill-training-dataset.ts
2026-06-07 12:06:41 -04:00

769 lines
26 KiB
JavaScript

#!/usr/bin/env node
/**
* fill-training-dataset.ts
*
* Scans the existing dataset directory and downloads any missing images
* to reach the target counts (200 per disease, 400 for healthy).
*
* Does NOT re-run prevalence queries — just fills gaps from image sources.
* Each run scans the directory, reports deficits, then fills them.
* Interrupt-safe: re-run to pick up where you left off.
*
* Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts
*/
import "dotenv/config";
import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, extname } from "path";
// Load .env.development for DB creds.
// Best-effort: a missing file is fine, and variables that already hold a
// non-empty value (from dotenv or the shell) are never overwritten.
const envPath = resolve(__dirname, "../.env.development");
try {
  const env = readFileSync(envPath, "utf-8");
  for (const line of env.split("\n")) {
    const trimmed = line.trim();
    // Skip blank lines and full-line comments.
    if (!trimmed || trimmed.startsWith("#")) continue;
    const eqIdx = trimmed.indexOf("=");
    // A line without "=" or with an empty key is not an assignment.
    if (eqIdx <= 0) continue;
    const key = trimmed.slice(0, eqIdx).trim();
    let val = trimmed.slice(eqIdx + 1).trim();
    // KEY="value" / KEY='value': drop one pair of matching surrounding quotes
    // so the quotes do not end up inside the value.
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }
    if (!process.env[key]) process.env[key] = val;
  }
} catch (err) {
  // A missing file is expected; anything else (permissions, bad path) is
  // reported but stays non-fatal so the script can still run with the
  // environment it already has.
  const code = err instanceof Error && "code" in err ? (err as { code?: unknown }).code : undefined;
  if (code !== "ENOENT") {
    console.warn(
      `⚠ Could not read ${envPath}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}
import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
import { sql } from "drizzle-orm";
// ─── Config ─────────────────────────────────────────────────────────────────
/** Root of the dataset: one sub-directory per class. */
const DATASET_DIR = resolve(__dirname, "..", "data", "dataset");
/** JSON file inside the dataset root that persists the per-class seen URLs. */
const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json");
/** How many images every disease class should end up with. */
const TARGET_PER_DISEASE = 200;
/** How many images the "healthy" class should end up with. */
const TARGET_HEALTHY = 400;
/** Pause, in milliseconds, before each DuckDuckGo result-page request. */
const SEARCH_DELAY = 1_500;
/** Number of image downloads started together within one class. */
const CONCURRENT_DOWNLOADS = 30;
/** Number of disease classes filled in parallel. */
const DISEASE_CONCURRENCY = 5;
/** Downloads smaller than this many bytes are rejected (10 kB). */
const MIN_IMAGE_SIZE = 10 * 1_000;
/** Downloads larger than this many bytes are rejected (10 MiB). */
const MAX_IMAGE_SIZE = 10 * 1024 ** 2;
/** File extensions (lower-case, with leading dot) kept when saving a download. */
const ALLOWED_EXTENSIONS: string[] = [".jpg", ".jpeg", ".png", ".webp"];
/** Browser-like User-Agent header sent with every HTTP request. */
const UA = [
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
  "AppleWebKit/537.36 (KHTML, like Gecko)",
  "Chrome/131.0.0.0",
  "Safari/537.36",
].join(" ");
/** Directory name of the healthy class inside the dataset root. */
const HEALTHY_CLASS = "healthy";
// ─── Types ──────────────────────────────────────────────────────────────────
/**
 * One entry of the `results` array in the JSON returned by the DuckDuckGo
 * image endpoint, as this script casts it. Only `image` and `url` are read
 * by this script; the remaining fields are declared but unused here.
 */
interface DuckDuckGoImageResult {
/** Preferred download URL when non-empty. */
image: string;
title: string;
/** Fallback download URL, used when `image` is empty. */
url: string;
thumbnail: string;
height: number;
width: number;
}
/** A disease class that is still short of its image target (one deficit-list entry). */
interface DiseaseInfo {
/** Disease id from the DB; also the name of the class directory under the dataset root. */
id: string;
/** Disease name from the DB; used to build the search queries. */
name: string;
/** Plant id from the DB; used as the plant term in the search queries. */
plantId: string;
/** Number of images counted on disk for this class when the deficit list was built. */
have: number;
/** Images still missing: per-disease target minus `have` (positive for every listed entry). */
needed: number;
}
// ─── Helpers ────────────────────────────────────────────────────────────────
/** Return a promise that settles (with no value) once a `ms`-millisecond timer fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Number of entries in `dir` whose name begins with "img_".
 * A missing or unreadable directory counts as empty (0).
 */
function countImagesInDir(dir: string): number {
  if (!existsSync(dir)) {
    return 0;
  }
  let names: string[];
  try {
    names = readdirSync(dir);
  } catch {
    return 0;
  }
  let total = 0;
  for (const name of names) {
    if (name.startsWith("img_")) total += 1;
  }
  return total;
}
/** Render a byte count as "N B", "N.N KB" or "N.N MB" (1024-based, one decimal place). */
function formatBytes(bytes: number): string {
  const KIB = 1024;
  const MIB = KIB * KIB;
  if (bytes < KIB) {
    return `${bytes} B`;
  }
  const belowMib = bytes < MIB;
  const scaled = bytes / (belowMib ? KIB : MIB);
  return `${scaled.toFixed(1)} ${belowMib ? "KB" : "MB"}`;
}
// ─── Seen-URLs Cache ──────────────────────────────────────────────────────
/**
 * Load the per-class seen-URLs cache from disk.
 * This prevents re-fetching the same URLs across runs.
 *
 * The file content is validated before use: anything that is not a JSON
 * object mapping class ids to arrays is discarded, and non-string array
 * members are dropped. A missing, unreadable or malformed cache never
 * throws; it degrades to an empty cache.
 *
 * @returns Map of class id → URLs already handed out for that class.
 */
function loadSeenUrlsCache(): Record<string, string[]> {
  if (!existsSync(SEEN_CACHE_FILE)) return {};

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(SEEN_CACHE_FILE, "utf-8"));
  } catch (err) {
    // E.g. a write that was interrupted mid-file: start over with an empty cache.
    console.warn(
      ` ⚠ Ignoring unreadable seen-URLs cache: ${err instanceof Error ? err.message : "unknown"}`,
    );
    return {};
  }

  // `null`, arrays and primitives are valid JSON but not a usable cache;
  // returning them would break the `cache[classId]` lookups done by callers.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    console.warn(" ⚠ Ignoring seen-URLs cache with unexpected shape");
    return {};
  }

  const cache: Record<string, string[]> = {};
  for (const [classId, value] of Object.entries(parsed)) {
    if (!Array.isArray(value)) continue;
    cache[classId] = value.filter((entry): entry is string => typeof entry === "string");
  }
  return cache;
}
/**
 * Write the seen-URLs cache to its file as pretty-printed (2-space) JSON,
 * replacing whatever the file held before.
 */
function saveSeenUrlsCache(cache: Record<string, string[]>): void {
  const serialized = JSON.stringify(cache, null, 2);
  writeFileSync(SEEN_CACHE_FILE, serialized);
}
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
/**
 * Fetch the DuckDuckGo image-search HTML page for `query` and pull the
 * `vqd` token out of it.
 *
 * @throws Error when the page request is not OK or no token is found in the HTML.
 */
async function getVqdToken(query: string): Promise<string> {
  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
  const response = await fetch(searchPage, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  const body = await response.text();
  // Matches e.g. vqd="4-123…" or vqd: '4-123…' and captures the token itself.
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token for "${query}"`);
  }
  return found[1];
}
/**
 * Request one page of image results from DuckDuckGo's `i.js` endpoint.
 *
 * HTTP 429 is retried after a 10 s pause, but only a bounded number of
 * times, so a persistent rate limit ends in an error instead of an endless
 * retry chain. HTTP 403 is treated as "no results".
 *
 * NOTE(review): this endpoint is unofficial. Confirm that the `p` parameter
 * really selects the result page; some clients paginate through an offset
 * parameter or the `next` link of the response instead.
 *
 * @param query Search text.
 * @param vqd   Token previously obtained for the same query.
 * @param page  Page number sent as `p`.
 * @returns The `results` array of the response, or `[]` when absent or on 403.
 * @throws Error on any other non-OK status, including 429 once retries are used up.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  const MAX_RATE_LIMIT_RETRIES = 3;
  const RATE_LIMIT_WAIT_MS = 10_000;

  const encodedQuery = encodeURIComponent(query);
  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
  const referer = `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: referer,
      },
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] } | null;
      return data?.results ?? [];
    }
    if (res.status === 403) return [];
    if (res.status !== 429 || attempt >= MAX_RATE_LIMIT_RETRIES) {
      throw new Error(`DuckDuckGo search failed: ${res.status}`);
    }

    console.warn(" ⚠ DDG rate limited (429). Waiting 10s...");
    await sleep(RATE_LIMIT_WAIT_MS);
  }
}
/**
 * Gather up to `target` not-yet-seen image URLs for `query` from DuckDuckGo.
 *
 * At most 5 result pages are requested, with a pause before each request.
 * Paging stops early after 3 empty pages in a row (reported as exhausted),
 * after 2 pages in a row on which fewer than 5% of the results were new, or
 * on the first request error. Every accepted URL is added to `seenUrls`.
 *
 * @returns The collected URLs and whether the source was judged exhausted
 *          (also `true` when no token could be obtained).
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const MAX_PAGES = 5;

  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  const collected: string[] = [];
  let exhausted = false;
  let emptyStreak = 0;
  let staleStreak = 0;

  for (let page = 1; collected.length < target && page <= MAX_PAGES; page++) {
    await sleep(SEARCH_DELAY);

    let hits: DuckDuckGoImageResult[];
    try {
      hits = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(` ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    if (!hits || hits.length === 0) {
      emptyStreak += 1;
      if (emptyStreak >= 3) {
        exhausted = true;
        break;
      }
      continue;
    }
    emptyStreak = 0;

    let fresh = 0;
    for (const hit of hits) {
      if (collected.length >= target) break;

      const candidate = hit.image || hit.url;
      if (!candidate || typeof candidate !== "string") continue;
      if (seenUrls.has(candidate)) continue;

      // Only keep candidates that parse as absolute URLs.
      let parses = true;
      try {
        new URL(candidate);
      } catch {
        parses = false;
      }
      if (!parses) continue;

      seenUrls.add(candidate);
      collected.push(candidate);
      fresh += 1;
    }

    // Two consecutive pages that bring almost nothing new end the paging.
    if (fresh / hits.length < 0.05) {
      staleStreak += 1;
      if (staleStreak >= 2) break;
    } else {
      staleStreak = 0;
    }
  }

  return { urls: collected.slice(0, target), exhausted };
}
// ─── iNaturalist API ───────────────────────────────────────────────────────
/**
 * Collect up to `target` not-yet-seen photo URLs from research-grade
 * iNaturalist observations matching `query` (a single request, at most 200
 * observations).
 *
 * Each photo URL is first rewritten to its "original" size variant, and the
 * seen-check is done on that rewritten URL, which is the same string that
 * gets stored in `seenUrls`, so photos collected earlier are actually
 * skipped. The rewrite covers every size segment (square, thumb, small,
 * medium, large); the API commonly returns a small thumbnail variant, which
 * would otherwise be downloaded as-is.
 *
 * @returns Collected URLs; `exhausted` is true when the response yielded
 *          fewer than `target`. Request failures return what was collected
 *          so far with `exhausted: false`.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  // Nothing requested: skip the network round-trip entirely.
  if (target <= 0) return { urls: results, exhausted: false };

  const perPage = Math.min(target, 200);
  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    } | null;

    for (const obs of data?.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;
        // Normalize to the full-size variant BEFORE de-duplicating so the
        // lookup key and the stored key are the same string.
        const fullUrl = photo.url.replace(
          /\/(?:square|thumb|small|medium|large)\./,
          "/original.",
        );
        if (seenUrls.has(fullUrl)) continue;
        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }
    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network error, timeout or malformed JSON: keep whatever was gathered.
    return { urls: results, exhausted: false };
  }
}
// ─── Wikimedia Commons API ─────────────────────────────────────────────────
/**
 * Collect up to `target` not-yet-seen image URLs from Wikimedia Commons by
 * searching the File namespace (6) for `query`, 50 hits per request.
 *
 * The File namespace also contains PDFs, SVGs, audio and video, so only
 * titles ending in a raster-image extension (jpg / jpeg / png / webp, any
 * case) are turned into `Special:FilePath` URLs. Paging follows the
 * `continue.sroffset` value of the response and stops as soon as the API no
 * longer sends one, or sends one that does not advance.
 *
 * @returns Collected URLs; `exhausted` is true when fewer than `target`
 *          were found. A failed request ends the search with what was
 *          collected so far.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const RASTER_IMAGE = /\.(?:jpe?g|png|webp)$/i;
  const results: string[] = [];
  let sroffset = 0;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
    });
    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };
      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const filename = hit.title.replace(/^File:/, "");
        // Skip non-raster files (pdf, svg, ogg, webm, ...): they cannot be
        // used as training images and would only waste a download slot.
        if (!RASTER_IMAGE.test(filename)) continue;
        const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          filename,
        )}`;
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // No continuation token means the result list is complete; a token
      // that does not move forward would loop forever.
      const next = data.continue?.sroffset;
      if (typeof next !== "number" || next <= sroffset) break;
      sroffset = next;
    } catch {
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}
// ─── Image Download ─────────────────────────────────────────────────────────
/**
 * Download `url` and store it next to `destPath`, with the extension of
 * `destPath` replaced by the one matching the actual image format.
 *
 * A download is rejected (returns `false`, nothing written) when:
 *  - the request fails or the status is not OK,
 *  - the response declares `text/html`,
 *  - the declared or actual size is outside MIN_IMAGE_SIZE..MAX_IMAGE_SIZE,
 *  - the payload does not start with a JPEG, PNG or WebP signature, or
 *  - the resulting extension is not in ALLOWED_EXTENSIONS.
 *
 * The format is taken from the file signature rather than from the URL or
 * the Content-Type header, so GIF/SVG/PDF/JSON payloads are no longer saved
 * under an image extension and a mislabeled image still gets the extension
 * of its real format.
 *
 * @param url      Image URL to fetch.
 * @param destPath Target path; only its directory and base name are kept.
 * @returns `true` when a file was written.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
    if (contentType.includes("text/html")) return false;

    // Skip buffering bodies that already announce themselves as too large.
    const declaredLength = Number(res.headers.get("content-length"));
    if (Number.isFinite(declaredLength) && declaredLength > MAX_IMAGE_SIZE) {
      await res.body?.cancel();
      return false;
    }

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    const sniffed = sniffImageExtension(buffer);
    if (sniffed === null) return false;

    // Keep a ".jpeg" spelling from the URL when the data really is JPEG.
    const urlExt = extname(new URL(url).pathname).toLowerCase();
    const ext = sniffed === ".jpg" && urlExt === ".jpeg" ? ".jpeg" : sniffed;
    if (!ALLOWED_EXTENSIONS.includes(ext)) return false;

    // Swap the extension of destPath, or append one when it has none.
    const filePath = /\.\w+$/.test(destPath) ? destPath.replace(/\.\w+$/, ext) : destPath + ext;
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    // Network errors, timeouts and write failures all count as a failed download.
    return false;
  }
}

/** Identify JPEG / PNG / WebP data by its leading signature bytes; `null` for anything else. */
function sniffImageExtension(data: Buffer): ".jpg" | ".png" | ".webp" | null {
  if (data.length >= 3 && data[0] === 0xff && data[1] === 0xd8 && data[2] === 0xff) {
    return ".jpg";
  }
  const PNG_SIGNATURE = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a];
  if (data.length >= PNG_SIGNATURE.length && PNG_SIGNATURE.every((byte, i) => data[i] === byte)) {
    return ".png";
  }
  if (
    data.length >= 12 &&
    data.subarray(0, 4).toString("latin1") === "RIFF" &&
    data.subarray(8, 12).toString("latin1") === "WEBP"
  ) {
    return ".webp";
  }
  return null;
}
/**
 * Download `urls` into `classDir` as `img_NNNN.<ext>` files, running up to
 * CONCURRENT_DOWNLOADS downloads at a time.
 *
 * Every download gets its own file index, claimed synchronously before the
 * download starts. Claiming the index only after the download finished
 * would make all concurrent downloads of a chunk compute the same file name
 * and overwrite each other. Indices already present in `classDir` are
 * skipped, so existing images are never overwritten even when `startIndex`
 * (a plain file count at the call sites) points into a range that earlier
 * runs left with gaps.
 *
 * @param urls       Image URLs to fetch.
 * @param classDir   Destination directory.
 * @param startIndex First index to try.
 * @returns Counts of successful and failed downloads, and the index one past
 *          the last one claimed.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
  let index = startIndex;

  // Indices already used by files on disk, regardless of their extension.
  const taken = new Set<number>();
  try {
    for (const name of readdirSync(classDir)) {
      const match = /^img_(\d+)/.exec(name);
      if (match) taken.add(Number(match[1]));
    }
  } catch {
    // Directory missing or unreadable: there is nothing to avoid.
  }

  /** Reserve the next free index. Runs synchronously, so no two downloads share one. */
  const claimIndex = (): number => {
    while (taken.has(index)) index++;
    taken.add(index);
    return index++;
  };

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
    const results = await Promise.all(
      chunk.map(async (url) => {
        // Must happen before the first await (see function comment).
        const claimed = claimIndex();
        const paddedIndex = String(claimed).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
        const success = await downloadImage(url, destPath);
        return { success, index: claimed };
      }),
    );
    for (const r of results) {
      if (r.success) downloaded++;
      else failed++;
    }
    const total = downloaded + failed;
    if (total % 30 === 0 || total === urls.length) {
      process.stdout.write(`\r Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }
  console.log();
  return { downloaded, failed, lastIndex: index };
}
// ─── Query Building ─────────────────────────────────────────────────────────
/**
 * Build the search phrases for one disease, most specific first.
 *
 * Hyphens in the plant id are turned into spaces (the same normalization the
 * healthy-class queries apply), so an id such as "sweet-potato" is searched
 * as the plain words "sweet potato".
 *
 * @param name  Disease name.
 * @param plant Plant id, possibly hyphenated.
 * @returns Three query strings.
 */
function buildSearchQueries(name: string, plant: string): string[] {
  const plantName = plant.replace(/-/g, " ");
  return [
    `${name} ${plantName} leaf disease`,
    `${plantName} ${name} symptoms`,
    `${name} ${plantName}`,
  ];
}
/**
 * Build the four search phrases used to find healthy specimens of a plant.
 * Hyphens in the plant id are replaced by spaces first.
 */
function buildHealthyQueries(plant: string): string[] {
  const readable = plant.split("-").join(" ");
  const queries: string[] = [];
  queries.push(`healthy ${readable} leaf`);
  queries.push(`${readable} leaf closeup`);
  queries.push(`healthy ${readable} plant`);
  queries.push(`${readable} foliage`);
  return queries;
}
// ─── Fill Logic ─────────────────────────────────────────────────────────────
/**
 * Collect up to `needed` candidate URLs for one class and download them.
 *
 * Sources are consulted in a fixed order and only while URLs are still
 * missing: DuckDuckGo with each query in turn (stopping at the first query
 * reported as exhausted), then iNaturalist and Wikimedia Commons with the
 * first query only. Accepted URLs are recorded in `seenUrls` by the
 * collectors.
 *
 * @returns How many more "img_" files the class directory holds afterwards.
 */
async function fillClass(
  diseaseId: string,
  queries: string[],
  needed: number,
  classDir: string,
  seenUrls: Set<string>,
): Promise<number> {
  if (needed <= 0) return 0;
  mkdirSync(classDir, { recursive: true });

  const pending: string[] = [];
  const stillMissing = (): number => needed - pending.length;

  // ── Source 1: DuckDuckGo ───────────────────────────────────────────────
  for (const query of queries) {
    if (stillMissing() <= 0) break;
    process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
    const { urls, exhausted } = await collectImagesDuckDuckGo(query, stillMissing(), seenUrls);
    pending.push(...urls);
    console.log(`${urls.length} new`);
    if (exhausted) break;
  }

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
  if (stillMissing() > 0) {
    process.stdout.write(` iNat: Searching... `);
    const found = await searchImagesInaturalist(queries[0], stillMissing(), seenUrls);
    pending.push(...found.urls);
    console.log(`${found.urls.length} new`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
  if (stillMissing() > 0) {
    process.stdout.write(` Commons: Searching... `);
    const found = await searchImagesCommons(queries[0], stillMissing(), seenUrls);
    pending.push(...found.urls);
    console.log(`${found.urls.length} new`);
  }

  if (pending.length === 0) {
    console.log(` ✗ No new images found from any source`);
    return 0;
  }

  console.log(` Downloading ${pending.length} images...`);
  const before = countImagesInDir(classDir);
  const { downloaded, failed } = await downloadBatch(pending, classDir, before);
  // Measure the gain on disk rather than trusting the download counter.
  const gained = countImagesInDir(classDir) - before;
  console.log(
    ` ${downloaded > 0 ? "✓" : "✗"} Downloaded ${downloaded}/${pending.length}` +
      ` (${failed} failed, ${gained} new files)`,
  );
  return gained;
}
// ─── Directory Scanner ─────────────────────────────────────────────────────
/** Image counts found on disk by one scan of the dataset directory. */
interface ScanResult {
/** Disease id → how many images currently on disk (only classes with at least one image) */
diseaseCounts: Map<string, number>;
/** How many healthy images on disk */
healthyCount: number;
}
/**
 * Count the images in every class directory under the dataset root.
 * Non-directories and dot-prefixed entries are ignored; the healthy class is
 * reported separately, and disease classes without images are left out.
 */
function scanDataset(): ScanResult {
  const diseaseCounts = new Map<string, number>();
  let healthyCount = 0;

  if (!existsSync(DATASET_DIR)) {
    return { diseaseCounts, healthyCount };
  }

  const classDirs = readdirSync(DATASET_DIR, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && !entry.name.startsWith("."),
  );
  for (const { name } of classDirs) {
    const count = countImagesInDir(resolve(DATASET_DIR, name));
    if (name === HEALTHY_CLASS) {
      healthyCount = count;
    } else if (count > 0) {
      diseaseCounts.set(name, count);
    }
  }

  return { diseaseCounts, healthyCount };
}
// ─── Main ───────────────────────────────────────────────────────────────────
/**
 * Entry point: scan the dataset, compute per-class deficits against the
 * targets, fill disease classes in parallel batches, fill the healthy class,
 * then print a summary.
 *
 * The seen-URLs cache is written after every class and every batch, so an
 * interrupted run can be restarted without handing out the same URLs again.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("TRAINING DATASET FILL — Gap-filling download");
  console.log("=".repeat(60));

  // Ensure dataset directory exists
  mkdirSync(DATASET_DIR, { recursive: true });

  // ── Step 1: Scan what we already have ────────────────────────────────────
  console.log("\nScanning existing dataset...");
  const { diseaseCounts, healthyCount } = scanDataset();
  console.log(` Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`);

  // ── Step 2: Load disease info from DB ────────────────────────────────────
  console.log("\nLoading disease info from database...");
  const db = getDb();
  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
    })
    .from(diseases);

  // Build a deduplicated map: disease id → first disease info found
  const diseaseInfo = new Map<string, { name: string; plantId: string }>();
  for (const d of allDiseases) {
    if (!diseaseInfo.has(d.id)) {
      diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId });
    }
  }
  console.log(` Loaded ${diseaseInfo.size} unique diseases from DB`);

  // ── Step 3: Build deficit list ──────────────────────────────────────────
  const deficits: DiseaseInfo[] = [];
  for (const [id, info] of diseaseInfo) {
    const have = diseaseCounts.get(id) ?? 0;
    const needed = TARGET_PER_DISEASE - have;
    if (needed > 0) {
      deficits.push({ id, name: info.name, plantId: info.plantId, have, needed });
    }
  }
  // Sort by deficit size (largest first) so we prioritize the neediest diseases
  deficits.sort((a, b) => b.needed - a.needed);

  const healthyDeficit = TARGET_HEALTHY - healthyCount;

  console.log(`\n${"=".repeat(60)}`);
  console.log("DEFICIT REPORT");
  console.log(`${"=".repeat(60)}`);
  console.log(` Diseases needing images: ${deficits.length}/${diseaseInfo.size}`);
  console.log(` Total images missing: ${deficits.reduce((s, d) => s + d.needed, 0)}`);
  console.log(` Healthy deficit: ${Math.max(0, healthyDeficit)}`);
  console.log(`${"=".repeat(60)}`);

  if (deficits.length === 0 && healthyDeficit <= 0) {
    console.log("\n ✓ Nothing to do — all targets met!\n");
    await closeDb();
    return;
  }

  // ── Step 4: Load seen-URLs cache ────────────────────────────────────────
  const seenUrlsCache = loadSeenUrlsCache();
  let totalDownloaded = 0;
  const startTime = Date.now();

  // ── Step 5: Fill disease deficits ───────────────────────────────────────
  if (deficits.length > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`);
    console.log("─".repeat(60));

    // Process in parallel batches
    const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY);
    for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) {
      const batch = deficits.slice(i, i + DISEASE_CONCURRENCY);
      const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
      console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`);

      await Promise.all(
        batch.map(async (d) => {
          const classDir = resolve(DATASET_DIR, d.id);
          const queries = buildSearchQueries(d.name, d.plantId);
          const seen = new Set<string>(seenUrlsCache[d.id] ?? []);
          console.log(
            ` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
          );
          const gained = await fillClass(d.id, queries, d.needed, classDir, seen);
          // Update seen-URLs cache for this disease
          seenUrlsCache[d.id] = Array.from(seen);
          saveSeenUrlsCache(seenUrlsCache);
          totalDownloaded += gained;
        }),
      );

      // Save seen cache after every batch
      saveSeenUrlsCache(seenUrlsCache);
      const elapsed = Math.round((Date.now() - startTime) / 1000);
      console.log(
        ` [Batch ${batchNum}/${totalBatches}] checkpoint — ` +
          `${totalDownloaded} downloaded so far (${elapsed}s elapsed)`,
      );
    }
  }

  // ── Step 6: Fill healthy deficit ────────────────────────────────────────
  if (healthyDeficit > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`);
    console.log("─".repeat(60));

    const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
    mkdirSync(healthyDir, { recursive: true });

    // Collect all unique plants from the disease info. The Set must be built
    // from the plant ids themselves: a Set of the info objects compares by
    // reference and would keep one entry per disease, letting repeated
    // plants use up the limited query slots below.
    const allPlants = [...new Set([...diseaseInfo.values()].map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    const healthySeen = new Set<string>(seenUrlsCache[HEALTHY_CLASS] ?? []);
    const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir);
    const allUrls: string[] = [];

    // Try each source with up to 20 healthy queries
    const sources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    for (const source of sources) {
      if (allUrls.length >= healthyNeeded) break;
      console.log(`\n Source: ${source.name}`);
      for (const query of allHealthyQueries.slice(0, 20)) {
        if (allUrls.length >= healthyNeeded) break;
        process.stdout.write(` "${query}"... `);
        const result = await source.collector(query, healthyNeeded - allUrls.length, healthySeen);
        allUrls.push(...result.urls);
        console.log(`${result.urls.length} new`);
      }
    }

    if (allUrls.length > 0) {
      console.log(`\n Downloading ${allUrls.length} healthy images...`);
      const startIdx = countImagesInDir(healthyDir);
      const { downloaded } = await downloadBatch(allUrls, healthyDir, startIdx);
      const newTotal = countImagesInDir(healthyDir);
      const gained = newTotal - healthyCount;
      totalDownloaded += gained;
      console.log(
        ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` +
          ` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`,
      );
    } else {
      console.log(`\n ✗ No healthy images found`);
    }

    // Update seen-URLs cache
    seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen);
    saveSeenUrlsCache(seenUrlsCache);
  }

  // ── Summary ──────────────────────────────────────────────────────────────
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  // Final scan
  const finalScan = scanDataset();
  const finalCounts = [...finalScan.diseaseCounts.values()];
  const totalHave = finalCounts.reduce((s, c) => s + c, 0);
  const atTarget = finalCounts.filter((c) => c >= TARGET_PER_DISEASE).length;

  console.log("\n" + "=".repeat(60));
  console.log(" ✅ FILL COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${hrs}h ${mins % 60}m`);
  console.log(` Diseases at target: ${atTarget}/${diseaseInfo.size}`);
  console.log(` Total images: ${totalHave}`);
  console.log(` Healthy images: ${finalScan.healthyCount}/${TARGET_HEALTHY}`);
  console.log(` New downloads: ${totalDownloaded}`);
  console.log(` Dataset dir: ${DATASET_DIR}/`);
  await closeDb();
  console.log("=".repeat(60));
}
/** Print a fatal error and terminate the process with exit code 1. */
function reportFatalAndExit(err: unknown): never {
  console.error("\nFatal error:", err);
  process.exit(1);
}

// Script entry: run the fill; any rejection escaping main() is fatal.
main().catch(reportFatalAndExit);