1180 lines
38 KiB
JavaScript
1180 lines
38 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* scrape-training-dataset.ts
|
|
*
|
|
* Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
|
|
*
|
|
* Target: Top 200 most common plant diseases (ranked by iNaturalist observation counts)
|
|
* - 200 images per disease
|
|
 * - 400 healthy plant images (see TARGET_HEALTHY)
|
|
* - Processes 5 diseases in parallel with 30 concurrent downloads each
|
|
*
|
|
* Sources (all free, no API keys):
|
|
* 1. DB image_url — existing images already found
|
|
* 2. DuckDuckGo — general web image search
|
|
* 3. iNaturalist — real-world plant observation photos
|
|
* 4. Wikimedia Commons — curated scientific/educational images
|
|
*
|
|
* Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
|
|
* Progress: data/dataset/.progress.json — interrupt and resume safely.
|
|
*/
|
|
|
|
import "dotenv/config";
import { existsSync, mkdirSync, readdirSync, readFileSync, renameSync, writeFileSync } from "fs";
import { extname, resolve } from "path";
|
|
|
|
// Load .env.development for DB creds.
// Best-effort: variables that already have a non-empty value in process.env
// (from the shell or from "dotenv/config" above) are left untouched.
const envPath = resolve(__dirname, "../.env.development");
try {
  for (const rawLine of readFileSync(envPath, "utf-8").split("\n")) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;

    // A usable entry needs a non-empty key in front of the first "=".
    const eqIdx = line.indexOf("=");
    if (eqIdx <= 0) continue;

    const key = line.slice(0, eqIdx).trim();
    let val = line.slice(eqIdx + 1).trim();

    // KEY="value" / KEY='value' — drop one pair of matching surrounding quotes,
    // otherwise the quotes would become part of the credential.
    const quote = val[0];
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!process.env[key]) process.env[key] = val;
  }
} catch (err) {
  // A missing file is the expected "nothing to load" case; anything else
  // (permissions, directory instead of file, ...) is worth a warning.
  const code = (err as { code?: string } | null)?.code;
  if (code !== "ENOENT") {
    console.warn(
      `Could not read ${envPath}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}
|
|
|
|
import { getDb, closeDb } from "@/lib/db/index";
|
|
import { diseases } from "@/lib/db/schema";
|
|
import { sql } from "drizzle-orm";
|
|
|
|
// ─── Config ─────────────────────────────────────────────────────────────────
|
|
|
|
/** Root output directory; each class gets its own sub-directory of images below it. */
const DATASET_DIR = resolve(__dirname, "..", "data", "dataset");

/** JSON checkpoint file inside DATASET_DIR used to resume an interrupted run. */
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");
|
|
|
|
/** How many images to collect for every disease class. */
const TARGET_PER_DISEASE = 200;

/** How many diseases to scrape, taken from the top of the prevalence ranking. */
const TARGET_DISEASE_COUNT = 200;

/** How many images to collect for the "healthy" class. */
const TARGET_HEALTHY = 400;

/** Pause before each DuckDuckGo search request, in milliseconds. */
const SEARCH_DELAY = 1_500;

/** Upper bound on simultaneous image downloads within one disease. */
const CONCURRENT_DOWNLOADS = 30;

/** How many diseases are processed at the same time. */
const DISEASE_CONCURRENCY = 5;

/** Downloads smaller than this many bytes are rejected (10 KB). */
const MIN_IMAGE_SIZE = 10 * 1000;

/** Downloads larger than this many bytes are rejected (10 MB). */
const MAX_IMAGE_SIZE = 10 * 1024 ** 2;

/** File extensions (lower-case, with leading dot) accepted as-is from a URL path. */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User-Agent header sent with every outgoing HTTP request. */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Class ID (and directory name) used for healthy-plant images. */
const HEALTHY_CLASS = "healthy";
|
|
|
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
/** The subset of a `diseases` row that the scraper selects and works with. */
interface DbDisease {
  /** Row ID; also used as the class ID / output directory name. */
  id: string;
  /** Host plant identifier; hyphens are turned into spaces when building search queries. */
  plantId: string;
  /** Disease name used in search queries and prevalence lookups. */
  name: string;
  /** Image URL already stored for this disease, if any. */
  imageUrl: string | null;
}

/** One entry of the `results` array in a DuckDuckGo image-search response, as consumed here. */
interface DuckDuckGoImageResult {
  /** Full-size image URL (preferred download target). */
  image: string;
  title: string;
  /** Used as the download target when `image` is empty. */
  url: string;
  thumbnail: string;
  height: number;
  width: number;
}
|
|
|
|
/** Resume state for a single image source. */
interface SourceState {
  /** True once the source should not be queried again for this class. */
  exhausted: boolean;
}

/** Everything persisted about one class (a disease ID or the healthy class). */
interface ClassProgress {
  /** Number of images currently held for the class (reconciled against files on disk). */
  count: number;
  /** Running total of successful downloads across runs. */
  downloaded: number;
  /** Running total of failed downloads across runs. */
  failed: number;
  /** Every URL already handed to the downloader, so it is never queued twice. */
  seenUrls: string[];
  /** Class-level "nothing more to do" flag. */
  exhausted: boolean;
  /** Per-source exhaustion flags — keep a resumed run from re-scraping a drained source. */
  sources: Record<"db" | "duckduckgo" | "inaturalist" | "wikimedia", SourceState>;
}

/** Shape of the `.progress.json` checkpoint file. */
interface Progress {
  /** ISO timestamp of the most recent save. */
  lastUpdated: string;
  /** Per-class state keyed by class ID. */
  classes: Record<string, ClassProgress>;
  /**
   * Phase checkpoint. main() writes 0 while disease classes are being processed
   * and treats 3 as "all phases complete".
   * NOTE(review): an older description ("0=core, 1=full, 2=healthy") predates the
   * tier migration in loadProgress — confirm the intermediate values against main().
   */
  phase: number;
  /** Index into the current phase's disease array at which a resumed run continues. */
  phaseIndex: number;
}
|
|
|
|
// ─── DB Loading ──────────────────────────────────────────────────────────────
|
|
|
|
/** JSON cache of iNaturalist observation counts, keyed by lower-cased disease name. */
const INAT_CACHE_FILE = resolve(DATASET_DIR, ".inat-prevalence-cache.json");
|
|
|
|
/**
 * Query iNaturalist for real-world prevalence of a disease.
 *
 * When `plantName` is given, a narrow query is tried first (disease + plant,
 * research-grade only, restricted via `iconic_taxon_id` to Plantae/Fungi/Chromista).
 * If that yields no observations — or no plant is given — a broad query on the
 * lower-cased disease name alone is used.
 *
 * @param diseaseName disease name used as the free-text `q` search term
 * @param plantName optional host plant appended to the narrow query
 * @returns the API's `total_results` (higher = more common); 0 when nothing
 *   matched, when the broad query answers with a non-OK status, or when any
 *   request/parse error occurs. Callers cannot tell "no observations" from
 *   "request failed".
 */
async function getInatPrevalence(diseaseName: string, plantName?: string): Promise<number> {
  const baseUrl = "https://api.inaturalist.org/v1/observations";
  const headers = { "User-Agent": UA, Accept: "application/json" };

  // Each request gets its own 10 s budget. A single shared AbortSignal.timeout()
  // starts ticking when it is created, so the second request would only get
  // whatever time the first one left over.
  const fetchTotal = async (url: string): Promise<number | null> => {
    const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) });
    if (!res.ok) return null;
    const data = (await res.json()) as { total_results?: number };
    return data.total_results ?? 0;
  };

  try {
    // Tier 1: most specific query — only community-verified observations in relevant kingdoms.
    if (plantName) {
      const specific = await fetchTotal(
        `${baseUrl}?q=${encodeURIComponent(`${diseaseName} ${plantName}`)}` +
          `&quality_grade=research` +
          `&iconic_taxon_id=47126,47158,47686` +
          `&photos_only=true&per_page=1`,
      );
      if (specific !== null && specific > 0) return specific;
    }

    // Fallback: disease name only, all quality grades.
    const broad = await fetchTotal(
      `${baseUrl}?q=${encodeURIComponent(diseaseName.toLowerCase())}&photos_only=true&per_page=1`,
    );
    return broad ?? 0;
  } catch {
    return 0;
  }
}
|
|
|
|
/**
 * Load prevalence data from cache or build it by querying iNaturalist.
 *
 * The cache file maps lower-cased disease names to observation counts. Only
 * names missing from the cache are queried; results are written back every 10
 * queries and once more at the end, so an interrupted run loses little work.
 *
 * @param uniqueNames disease names to resolve (original casing is kept as map key)
 * @param plantMap optional disease name → host plant, forwarded to getInatPrevalence
 * @returns map from each requested name to its observation count
 */
async function loadPrevalenceData(
  uniqueNames: string[],
  plantMap?: Map<string, string>,
): Promise<Map<string, number>> {
  // A Map (instead of a bare object + `in`) keeps names such as "constructor"
  // from being mistaken for cached entries via the prototype chain.
  const cache = new Map<string, number>();
  if (existsSync(INAT_CACHE_FILE)) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(INAT_CACHE_FILE, "utf-8"));
      // Accept only a plain { name: number } object; `null`, arrays or junk
      // values are ignored so a damaged cache is rebuilt instead of crashing.
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        for (const [key, value] of Object.entries(parsed)) {
          if (typeof value === "number" && Number.isFinite(value)) cache.set(key, value);
        }
      }
    } catch {
      // Unreadable or corrupt cache — fall through with an empty cache.
    }
  }

  const saveCache = (): void =>
    writeFileSync(INAT_CACHE_FILE, JSON.stringify(Object.fromEntries(cache), null, 2));

  const prevalenceMap = new Map<string, number>();
  const toQuery: string[] = [];

  // Split names into cache hits and names that still need a lookup.
  for (const name of uniqueNames) {
    const cached = cache.get(name.toLowerCase());
    if (cached !== undefined) {
      prevalenceMap.set(name, cached);
    } else {
      toQuery.push(name);
    }
  }

  if (toQuery.length > 0) {
    console.log(`\n Querying iNaturalist for ${toQuery.length} disease prevalence scores...`);
    let queried = 0;

    for (const name of toQuery) {
      const count = await getInatPrevalence(name, plantMap?.get(name));
      cache.set(name.toLowerCase(), count);
      prevalenceMap.set(name, count);
      queried++;

      // Save cache every 10 queries
      if (queried % 10 === 0) {
        saveCache();
        console.log(` Queried ${queried}/${toQuery.length}...`);
      }

      // Rate limit: ~100 req/min (no need to wait after the final query)
      if (queried < toQuery.length) await sleep(600);
    }

    // Final cache save
    saveCache();
    console.log(` ✓ Queried ${queried} diseases, cached to ${INAT_CACHE_FILE}`);
  }

  return prevalenceMap;
}
|
|
|
|
/**
 * Write prevalence scores and the derived prevalence category back to the database.
 *
 * Categories come from the distribution of the non-zero scores in `prevalenceMap`:
 * scores at or above the 75th-percentile value are "common", scores above the
 * 25th-percentile value are "uncommon", remaining non-zero scores are "rare",
 * and a score of 0 (no observations found) is "very_rare". With no non-zero
 * scores at all, the thresholds default to 1000 and 10.
 *
 * @param db database handle as returned by getDb()
 * @param prevalenceMap disease name → observation count; names missing from the map score 0
 */
async function persistPrevalenceData(
  db: ReturnType<typeof getDb>,
  prevalenceMap: Map<string, number>,
): Promise<void> {
  // Every disease row gets updated, so fetch them all up front.
  const rows = await db.select({ id: diseases.id, name: diseases.name }).from(diseases);

  // Percentile cut-offs are taken from the ascending list of non-zero scores.
  const ranked = [...prevalenceMap.values()].filter((value) => value > 0);
  ranked.sort((lo, hi) => lo - hi);
  const n = ranked.length;
  const commonThreshold = n > 0 ? ranked[Math.floor(n * 0.75)] : 1000;
  const rareThreshold = n > 0 ? ranked[Math.floor(n * 0.25)] : 10;

  console.log(
    `\n Prevalence distribution: ${n} non-zero scores` +
      `, p25=${rareThreshold.toLocaleString()}` +
      `, p75=${commonThreshold.toLocaleString()}`,
  );
  console.log(` Persisting prevalence data for ${rows.length} diseases...`);

  const categorize = (score: number): "common" | "uncommon" | "rare" | "very_rare" => {
    if (score === 0) return "very_rare";
    if (score >= commonThreshold) return "common";
    return score > rareThreshold ? "uncommon" : "rare";
  };

  let updated = 0;
  for (const { id, name } of rows) {
    const score = prevalenceMap.get(name) ?? 0;

    await db
      .update(diseases)
      .set({
        prevalenceScore: score,
        prevalence: categorize(score),
        updatedAt: sql`(datetime('now'))`,
      })
      .where(sql`${diseases.id} = ${id}`);

    updated += 1;
    if (updated % 100 === 0) {
      console.log(` Updated ${updated}/${rows.length}...`);
    }
  }

  console.log(` ✓ Updated ${updated} diseases with prevalence data`);
}
|
|
|
|
/**
 * Load the TARGET_DISEASE_COUNT most common diseases from the database.
 *
 * Steps: aggregate disease names with their host plants, look up (or load from
 * cache) an iNaturalist observation count per name, persist those scores to
 * the database, then rank all disease rows by score.
 *
 * @returns disease rows sorted by observation count (descending), ties broken
 *   by how many rows share the same name; truncated to TARGET_DISEASE_COUNT
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const db = getDb();

  // One row per (disease name, host plant) pair with the number of matching rows.
  const nameStats = await db
    .select({
      name: diseases.name,
      plantId: diseases.plantId,
      count: sql<number>`COUNT(*)`.mapWith(Number),
    })
    .from(diseases)
    .groupBy(diseases.name, diseases.plantId);

  // Single pass: total rows per name, and the host plant with the most rows per name
  // (the first plant seen wins ties).
  const nameFrequency = new Map<string, number>();
  const topPlant = new Map<string, { plantId: string; count: number }>();
  let totalDiseases = 0;

  for (const { name, plantId, count } of nameStats) {
    nameFrequency.set(name, (nameFrequency.get(name) ?? 0) + count);
    totalDiseases += count;

    const best = topPlant.get(name);
    if (!best || count > best.count) topPlant.set(name, { plantId, count });
  }

  const uniqueNames = [...nameFrequency.keys()];

  // Plant IDs are hyphenated slugs; turn them into plain words before they are
  // used as free-text search terms, the same way buildSearchQueries does.
  const plantMap = new Map<string, string>();
  for (const [name, { plantId }] of topPlant) {
    plantMap.set(name, plantId.replace(/-/g, " "));
  }

  console.log(
    ` Found ${uniqueNames.length} unique disease names across ${totalDiseases} diseases`,
  );

  // Load or build prevalence data from iNaturalist (with plant context for better queries)
  const prevalenceMap = await loadPrevalenceData(uniqueNames, plantMap);

  // Persist prevalence scores to database
  await persistPrevalenceData(db, prevalenceMap);

  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
      imageUrl: diseases.imageUrl,
    })
    .from(diseases);

  // Highest prevalence first; equal prevalence falls back to name frequency.
  allDiseases.sort((a, b) => {
    const byPrevalence = (prevalenceMap.get(b.name) ?? 0) - (prevalenceMap.get(a.name) ?? 0);
    if (byPrevalence !== 0) return byPrevalence;
    return (nameFrequency.get(b.name) ?? 0) - (nameFrequency.get(a.name) ?? 0);
  });

  return allDiseases.slice(0, TARGET_DISEASE_COUNT);
}
|
|
|
|
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Fetch the DuckDuckGo search page for `query` and pull the `vqd` token out of
 * its HTML; the image endpoint requires this token.
 *
 * @param query search terms
 * @returns the extracted token
 * @throws Error when the page request is not OK or no token is found in the HTML
 */
async function getVqdToken(query: string): Promise<string> {
  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

  const response = await fetch(searchPage, {
    signal: AbortSignal.timeout(15_000),
    headers: { "User-Agent": UA, Accept: "text/html" },
  });
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  // Matches forms like vqd="…", vqd='…' or vqd: '…' anywhere in the page.
  const body = await response.text();
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token for "${query}"`);
  }

  return found[1];
}
|
|
|
|
/**
 * Request one page of DuckDuckGo image results.
 *
 * @param query search terms
 * @param vqd token obtained from getVqdToken for the same query
 * @param page value sent as the `p` query parameter
 * @returns the response's `results` array; empty on HTTP 403 or when the field is missing
 * @throws Error on any other non-OK status, including a 429 that persists
 *   after the bounded number of retries
 *
 * NOTE(review): `p` may not be DuckDuckGo's pagination parameter (paging is
 * commonly done via an `s` offset / the response's `next` field) — verify that
 * successive `page` values actually return different results.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  // Retries after a 429. Bounded so a persistent rate limit cannot hang the
  // scraper forever (the previous self-recursion had no limit).
  const MAX_RATE_LIMIT_RETRIES = 3;

  const encodedQuery = encodeURIComponent(query);
  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${vqd}&o=json&p=${page}&f=,,,`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`,
      },
      // Fresh timeout per attempt.
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] };
      return data.results ?? [];
    }

    // 403 is treated as "no results" rather than an error.
    if (res.status === 403) return [];

    if (res.status !== 429 || attempt >= MAX_RATE_LIMIT_RETRIES) {
      throw new Error(`DuckDuckGo search failed: ${res.status}`);
    }

    console.warn(" ⚠ Rate limited (429). Waiting 10s...");
    await sleep(10_000);
  }
}
|
|
|
|
/**
 * Gather up to `target` not-yet-seen image URLs for `query` from DuckDuckGo.
 *
 * Walks at most 5 result pages, waiting SEARCH_DELAY before each request.
 * Accepted URLs are added to `seenUrls` (the caller's set is mutated).
 * Stops early on a search error, after three empty pages in a row, or after
 * two pages in a row on which fewer than 5% of the results were new.
 *
 * @param query search terms
 * @param target maximum number of URLs to return
 * @param seenUrls URLs to skip; extended with every URL returned
 * @returns the collected URLs, plus `exhausted: true` when the token could not
 *   be obtained or three consecutive pages came back empty
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const MAX_PAGES = 5;
  const describe = (err: unknown): string => (err instanceof Error ? err.message : "unknown");
  const parsesAsUrl = (candidate: string): boolean => {
    try {
      new URL(candidate);
      return true;
    } catch {
      return false;
    }
  };

  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ DDG token failed: ${describe(err)}`);
    return { urls: [], exhausted: true };
  }

  const collected: string[] = [];
  let exhausted = false;
  let emptyStreak = 0;
  let staleStreak = 0;

  for (let page = 1; collected.length < target && page <= MAX_PAGES; page++) {
    await sleep(SEARCH_DELAY);

    let hits: DuckDuckGoImageResult[];
    try {
      hits = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(` ⚠ DDG error: ${describe(err)}`);
      break;
    }

    // Empty page: tolerate up to two in a row, the third one ends the search.
    if (!hits || hits.length === 0) {
      emptyStreak += 1;
      if (emptyStreak >= 3) {
        exhausted = true;
        break;
      }
      continue;
    }
    emptyStreak = 0;

    let fresh = 0;
    for (const hit of hits) {
      if (collected.length >= target) break;

      const candidate = hit.image || hit.url;
      if (!candidate || typeof candidate !== "string") continue;
      if (seenUrls.has(candidate) || !parsesAsUrl(candidate)) continue;

      seenUrls.add(candidate);
      collected.push(candidate);
      fresh += 1;
    }

    // Pages that yield almost nothing new indicate the result set is recycling.
    if (fresh / hits.length < 0.05) {
      staleStreak += 1;
      if (staleStreak >= 2) break;
    } else {
      staleStreak = 0;
    }
  }

  return { urls: collected.slice(0, target), exhausted };
}
|
|
|
|
// ─── iNaturalist API ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Collect photo URLs from research-grade iNaturalist observations matching `query`.
 *
 * Issues a single request (up to 200 observations, newest first) and walks the
 * photos of each observation. Every photo URL is rewritten to its "original"
 * size variant before de-duplication, so URLs recorded in `seenUrls` by an
 * earlier run are recognised. Accepted URLs are added to `seenUrls`.
 *
 * @param query free-text search terms
 * @param target maximum number of URLs to return
 * @param seenUrls URLs to skip; extended with every URL returned
 * @returns collected URLs; `exhausted` is true when the request succeeded but
 *   produced fewer than `target` new URLs, false on HTTP or network errors
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);

  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;

        const url = photo.url;
        if (!url) continue;

        // Upgrade whichever size variant the API returned to the full-size file.
        const fullUrl = url.replace(/\/(square|small|medium|large)\./, "/original.");

        // Compare against the rewritten URL — that is the form stored in seenUrls.
        if (seenUrls.has(url) || seenUrls.has(fullUrl)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network/parse failure: hand back what was gathered and allow a retry on resume.
    return { urls: results, exhausted: false };
  }
}
|
|
|
|
// ─── Wikimedia Commons API ──────────────────────────────────────────────────
|
|
|
|
/**
 * Collect file URLs from Wikimedia Commons for `query`.
 *
 * Pages through the search API (namespace 6, 50 hits per request) and turns
 * each hit's title into a `Special:FilePath/<name>` URL. Accepted URLs are
 * added to `seenUrls`. Paging stops when enough URLs were found, a page is
 * empty, the response is not OK, or any error occurs.
 *
 * @param query search terms
 * @param target maximum number of URLs to return
 * @param seenUrls URLs to skip; extended with every URL returned
 * @returns collected URLs; `exhausted` is true whenever fewer than `target` were found
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const FILE_PATH_BASE = "https://commons.wikimedia.org/wiki/Special:FilePath/";
  const found: string[] = [];
  let offset = 0;

  while (found.length < target) {
    // No origin needed — server-side fetch, Wikimedia ignores CORS headers on API calls
    const search = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(offset),
      format: "json",
    });

    try {
      const response = await fetch(`https://commons.wikimedia.org/w/api.php?${search}`, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!response.ok) break;

      const payload = (await response.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = payload.query?.search ?? [];
      if (hits.length === 0) break;

      for (const { title } of hits) {
        if (found.length >= target) break;

        const fileUrl = FILE_PATH_BASE + encodeURIComponent(title.replace(/^File:/, ""));
        if (seenUrls.has(fileUrl)) continue;

        seenUrls.add(fileUrl);
        found.push(fileUrl);
      }

      // Prefer the API's continuation offset; otherwise step past this page.
      offset = payload.continue?.sroffset ?? offset + hits.length;
    } catch {
      break;
    }
  }

  return { urls: found, exhausted: found.length < target };
}
|
|
|
|
// ─── Image Download ─────────────────────────────────────────────────────────
|
|
|
|
/** Identify JPEG / PNG / WebP data by its leading signature bytes; null for anything else. */
function sniffImageExtension(bytes: Uint8Array): ".jpg" | ".png" | ".webp" | null {
  const hasBytes = (signature: number[], offset = 0): boolean =>
    signature.every((byte, i) => bytes[offset + i] === byte);

  if (hasBytes([0xff, 0xd8, 0xff])) return ".jpg";
  if (hasBytes([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return ".png";
  // WebP: "RIFF" <4-byte size> "WEBP"
  if (hasBytes([0x52, 0x49, 0x46, 0x46]) && hasBytes([0x57, 0x45, 0x42, 0x50], 8)) return ".webp";
  return null;
}

/**
 * Download one image and store it next to `destPath`.
 *
 * The payload is accepted only when the response is OK, is not an HTML page,
 * its size lies within [MIN_IMAGE_SIZE, MAX_IMAGE_SIZE], and its leading bytes
 * identify it as JPEG, PNG or WebP. Previously any non-HTML body (PDF, SVG,
 * GIF, JSON error documents, ...) was written to disk under an image extension.
 *
 * @param url image URL to fetch
 * @param destPath target path; its extension is replaced by the one matching
 *   the actual file content (".jpg", ".png" or ".webp")
 * @returns true when a file was written; false on any rejection or error (never throws)
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = res.headers.get("content-type") || "";
    if (contentType.includes("text/html")) return false;

    // Skip buffering bodies the server already declares as too large.
    const declaredSize = Number(res.headers.get("content-length"));
    if (Number.isFinite(declaredSize) && declaredSize > MAX_IMAGE_SIZE) {
      await res.body?.cancel();
      return false;
    }

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE || buffer.length > MAX_IMAGE_SIZE) return false;

    // Trust the bytes, not the URL suffix or Content-Type header.
    const ext = sniffImageExtension(buffer);
    if (ext === null) return false;

    writeFileSync(destPath.replace(/\.\w+$/, ext), buffer);
    return true;
  } catch {
    return false;
  }
}
|
|
|
|
/**
 * Download `urls` into `classDir`, CONCURRENT_DOWNLOADS at a time, naming the
 * files `img_<4-digit index>.<ext>`.
 *
 * Every download claims its own index synchronously before it starts.
 * (Previously the counter was incremented only after the download finished, so
 * all downloads of a chunk shared one file name and overwrote each other.)
 * Indices already used by files in `classDir` are skipped, so existing images
 * are never overwritten even when numbering has gaps.
 *
 * @param urls image URLs to download
 * @param classDir destination directory
 * @param startIndex first index to try for new files
 * @returns counts of successful/failed downloads and the next unused index
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;

  // Indices that already have a file on disk (any extension).
  const takenIndices = new Set<number>();
  try {
    if (existsSync(classDir)) {
      for (const file of readdirSync(classDir)) {
        const match = /^img_(\d+)\./.exec(file);
        if (match) takenIndices.add(Number(match[1]));
      }
    }
  } catch {
    // Directory not listable — fall back to plain sequential numbering.
  }

  let index = startIndex;
  const claimIndex = (): number => {
    while (takenIndices.has(index)) index++;
    return index++;
  };

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const results = await Promise.all(
      chunk.map(async (url) => {
        // Must happen before the first await so concurrent downloads get distinct names.
        const slot = claimIndex();
        const destPath = resolve(classDir, `img_${String(slot).padStart(4, "0")}.jpg`);
        const success = await downloadImage(url, destPath);
        return { success, index: slot, url: url.substring(0, 50) };
      }),
    );

    for (const r of results) {
      if (r.success) downloaded++;
      else {
        failed++;
        // Log only every 20th failure (1st, 21st, ...) to keep output readable.
        if (failed % 20 === 1) console.log(` ⚠ Failed: ${r.url}...`);
      }
    }

    const total = downloaded + failed;
    if (total % 30 === 0 || total === urls.length) {
      console.log(` Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }

  return { downloaded, failed, lastIndex: index };
}
|
|
|
|
// ─── Progress Tracking ──────────────────────────────────────────────────────
|
|
|
|
/**
 * Read the checkpoint from PROGRESS_FILE, upgrading older layouts in place.
 *
 * - Missing file or unparsable content → a fresh, empty progress object.
 * - No numeric `phase`, or a `phaseIndex` above 200 → treated as the old tiered
 *   format: the phase checkpoint is reset to 0/0 while per-class data is kept.
 * - Classes without per-source state get it derived from their class-level
 *   `exhausted` flag; missing per-class fields are filled with defaults.
 *
 * @returns the progress object to continue from
 */
function loadProgress(): Progress {
  const freshProgress = (): Progress => ({
    lastUpdated: new Date().toISOString(),
    classes: {},
    phase: 0,
    phaseIndex: 0,
  });

  if (!existsSync(PROGRESS_FILE)) return freshProgress();

  try {
    const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
    raw.classes ??= {};

    // Old-format detection. The phase check must test for a *missing* value:
    // `!raw.phase` is also true for the regular `phase: 0` checkpoint and would
    // throw away the saved phaseIndex on every resume.
    const isOldFormat = typeof raw.phase !== "number" || (raw.phaseIndex ?? 0) > 200;
    if (isOldFormat) {
      console.warn(" ↻ Migrating progress file from old tiered system to new format");
      console.warn(" Phase checkpoint reset to 0 (will re-scan all 200 diseases)");
      console.warn(" Per-class progress (seenUrls, counts) preserved");
      raw.phase = 0;
      raw.phaseIndex = 0;
    } else {
      raw.phaseIndex ??= 0;
    }

    for (const key of Object.keys(raw.classes)) {
      const cp = raw.classes[key] as Partial<ClassProgress>;

      // Migrate class-level exhausted to per-source exhausted if needed
      if (!cp.sources) {
        const classExhausted = cp.exhausted ?? false;
        cp.sources = {
          db: { exhausted: classExhausted },
          duckduckgo: { exhausted: classExhausted },
          inaturalist: { exhausted: classExhausted },
          wikimedia: { exhausted: classExhausted },
        };
      }

      // Backfill anything else an older or hand-edited file may be missing.
      cp.seenUrls ??= [];
      cp.count ??= 0;
      cp.downloaded ??= 0;
      cp.failed ??= 0;
      cp.exhausted ??= false;
    }

    return raw as Progress;
  } catch {
    console.warn(" ⚠ Corrupt progress file, starting fresh");
    return freshProgress();
  }
}
|
|
|
|
/**
 * Persist the checkpoint to PROGRESS_FILE, stamping `lastUpdated` first.
 *
 * The JSON is written to a sibling temp file and then renamed over the real
 * file, so an interrupt in the middle of a save cannot leave a truncated
 * checkpoint behind (which would otherwise be discarded as corrupt on resume).
 *
 * @param progress progress object to save; its `lastUpdated` field is mutated
 */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();

  const tmpFile = `${PROGRESS_FILE}.tmp`;
  writeFileSync(tmpFile, JSON.stringify(progress, null, 2));
  renameSync(tmpFile, PROGRESS_FILE);
}
|
|
|
|
/**
 * Look up the progress entry for `classId`, registering a zeroed entry (all
 * counters 0, nothing seen, no source exhausted) on `progress` if there is none.
 *
 * @param progress progress object; gains a new entry when `classId` is unknown
 * @param classId class ID to look up
 * @returns the entry stored in `progress.classes` (same object on repeated calls)
 */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  const known = progress.classes[classId];
  if (known) return known;

  const blank: ClassProgress = {
    count: 0,
    downloaded: 0,
    failed: 0,
    seenUrls: [],
    exhausted: false,
    sources: {
      db: { exhausted: false },
      duckduckgo: { exhausted: false },
      inaturalist: { exhausted: false },
      wikimedia: { exhausted: false },
    },
  };
  progress.classes[classId] = blank;
  return blank;
}
|
|
|
|
// ─── Query Building ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Build the search queries for one disease, most specific first.
 *
 * The disease name (or, if empty, its ID with hyphens as spaces) appears in
 * every query so results stay tied to the disease and labels stay clean.
 *
 * @param disease disease row to build queries for
 * @returns three queries combining disease name and host plant
 */
function buildSearchQueries(disease: DbDisease): string[] {
  const plantWords = disease.plantId.split("-").join(" ");
  const diseaseWords = disease.name || disease.id.split("-").join(" ");

  const withSymptoms = `${plantWords} ${diseaseWords} symptoms`;
  const plain = `${diseaseWords} ${plantWords}`;
  return [`${plain} leaf disease`, withSymptoms, plain];
}
|
|
|
|
/**
 * Build the search queries used to find images of a healthy plant.
 *
 * @param plant plant identifier; hyphens are replaced by spaces
 * @returns four queries, in the order they should be tried
 */
function buildHealthyQueries(plant: string): string[] {
  const plantWords = plant.split("-").join(" ");
  const healthy = `healthy ${plantWords}`;

  return [`${healthy} leaf`, `${plantWords} leaf closeup`, `${healthy} plant`, `${plantWords} foliage`];
}
|
|
|
|
// ─── File Reconciliation ───────────────────────────────────────────────────
|
|
|
|
/**
 * Count the entries in `classDir` whose name starts with "img_".
 *
 * @param classDir directory to inspect
 * @returns number of matching entries; 0 when the directory is missing or cannot be read
 */
function countImagesInDir(classDir: string): number {
  if (!existsSync(classDir)) return 0;

  try {
    let matches = 0;
    for (const entry of readdirSync(classDir)) {
      if (entry.startsWith("img_")) matches += 1;
    }
    return matches;
  } catch {
    return 0;
  }
}
|
|
|
|
/**
 * Bring a class's recorded image count in line with what is on disk.
 *
 * If fewer image files exist than the progress file claims (e.g. files were
 * deleted afterwards), the lower on-disk number is returned so the missing
 * images get downloaded again. A higher on-disk number never raises the count.
 *
 * @param classDir directory holding the class's images
 * @param progressCount count stored in the progress file
 * @returns the smaller of the on-disk count and `progressCount`
 */
function reconcileClassCount(classDir: string, progressCount: number): number {
  const onDisk = countImagesInDir(classDir);
  if (onDisk >= progressCount) return progressCount;

  console.log(
    ` ↻ File count (${onDisk}) < progress count (${progressCount}) — reconciling`,
  );
  return onDisk;
}
|
|
|
|
// ─── Dataset Collection ─────────────────────────────────────────────────────
|
|
|
|
/**
 * Collect images for one class until `target` is reached or every source is drained.
 *
 * Flow: reconcile the recorded count with the files on disk → return early if
 * the target is met or all sources are exhausted → gather candidate URLs from
 * the sources in order (DB URLs, DuckDuckGo, iNaturalist, Wikimedia Commons),
 * stopping once enough were found → persist the seen URLs → download → update
 * counters and exhaustion flags. Progress is saved at each state change.
 *
 * @param classId class ID (key in `progress.classes`)
 * @param queries search queries, most specific first; DuckDuckGo tries all of
 *   them, iNaturalist and Commons use only the first
 * @param target number of images wanted for the class
 * @param progress shared progress object (mutated and saved)
 * @param classDir directory receiving the image files (created if missing)
 * @param existingUrls image URLs already known for the class (e.g. from the DB)
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
  existingUrls: string[] = [],
): Promise<void> {
  const cp = getClassProgress(progress, classId);

  // ── Reconcile with actual files on disk ─────────────────────────────────
  const actualCount = reconcileClassCount(classDir, cp.count);
  if (actualCount !== cp.count) {
    cp.count = actualCount;
    saveProgress(progress);
  }

  const seenUrls = new Set(cp.seenUrls);
  const sources = cp.sources;
  const allSourcesExhausted = (): boolean =>
    sources.db.exhausted &&
    sources.duckduckgo.exhausted &&
    sources.inaturalist.exhausted &&
    sources.wikimedia.exhausted;

  if (cp.count >= target) {
    console.log(` ✓ Already have ${cp.count}/${target}`);
    return;
  }

  if (allSourcesExhausted()) {
    cp.exhausted = true;
    saveProgress(progress);
    console.log(` ✓ Exhausted (${cp.count}/${target})`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  const allUrls: string[] = [];
  const needed = target - cp.count;

  // ── Source 0: Existing DB URLs ──────────────────────────────────────────
  if (!sources.db.exhausted) {
    const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
    if (freshDbUrls.length > 0) {
      console.log(` DB: ${freshDbUrls.length} existing URLs`);
      for (const url of freshDbUrls) {
        if (allUrls.length >= needed) break;
        seenUrls.add(url);
        allUrls.push(url);
      }
    }
    // DB source is always "exhausted" after processing its initial URLs
    sources.db.exhausted = true;
  }

  // ── Source 1: DuckDuckGo ──────────────────────────────────────────────
  if (!sources.duckduckgo.exhausted && allUrls.length < needed) {
    for (const query of queries) {
      if (allUrls.length >= needed) break;
      process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
      const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
      allUrls.push(...result.urls);
      if (result.exhausted) sources.duckduckgo.exhausted = true;
      console.log(`${result.urls.length} new`);
    }
  }

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
  if (!sources.inaturalist.exhausted && allUrls.length < needed) {
    console.log(` iNat: Searching...`);
    const result = await searchImagesInaturalist(queries[0], needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) sources.inaturalist.exhausted = true;
    console.log(` iNat: ${result.urls.length} images`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
  if (!sources.wikimedia.exhausted && allUrls.length < needed) {
    console.log(` Commons: Searching...`);
    const result = await searchImagesCommons(queries[0], needed - allUrls.length, seenUrls);
    allUrls.push(...result.urls);
    if (result.exhausted) sources.wikimedia.exhausted = true;
    console.log(` Commons: ${result.urls.length} images`);
  }

  if (allUrls.length === 0) {
    cp.exhausted = true;
    saveProgress(progress);
    console.log(` ✗ No images found — exhausted`);
    return;
  }

  // Save progress with seen URLs BEFORE downloading, so an interrupted run
  // does not queue the same URLs again.
  cp.seenUrls = Array.from(seenUrls);
  saveProgress(progress);

  console.log(` Downloading ${allUrls.length} images...`);

  // Start numbering after the highest index already on disk. The plain file
  // count is not safe here: failed or deleted downloads leave gaps, and
  // "count" would then point at an index that still has a file.
  let startIndex = 0;
  try {
    for (const file of readdirSync(classDir)) {
      const match = /^img_(\d+)\./.exec(file);
      if (match) startIndex = Math.max(startIndex, Number(match[1]) + 1);
    }
  } catch {
    startIndex = countImagesInDir(classDir);
  }
  const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);

  // Re-count actual files on disk after download (more reliable than tracking)
  cp.count = countImagesInDir(classDir);
  cp.downloaded += downloaded;
  cp.failed += failed;

  // The class is finished when nothing more can be fetched, or when the target is met.
  if (allSourcesExhausted() || cp.count >= target) {
    cp.exhausted = true;
  }

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${
      allUrls.length
    } (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}
|
|
|
|
// ─── Main ───────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Script entry point: prints the banner, makes sure the dataset directory
 * exists, runs every collection phase and releases the database handle.
 *
 * The database handle is closed on the failure path as well, so an error in
 * any phase no longer leaves it open while the rejection travels to the
 * caller. A failure while closing on that path is only logged, so it cannot
 * replace the error that actually aborted the run.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR — TOP 200 COMMON DISEASES");
  console.log("=".repeat(60));

  // Ensure dataset directory exists before any cache writes
  mkdirSync(DATASET_DIR, { recursive: true });

  let ranPhases: boolean;
  try {
    ranPhases = await runAllPhases();
  } catch (err: unknown) {
    // Best-effort cleanup before the original error is re-thrown.
    try {
      await closeDb();
    } catch (closeErr: unknown) {
      console.error("Failed to close database after error:", closeErr);
    }
    throw err;
  }

  await closeDb();

  // The closing separator belongs to the summary, which is only printed after a full pass.
  if (ranPhases) {
    console.log("=".repeat(60));
  }
}

/**
 * Runs the collection phases in order: phase 1 (disease classes, processed in
 * parallel batches with a checkpoint after each batch), the healthy class,
 * and finally the summary printout.
 *
 * Does not close the database; the caller owns that.
 *
 * @returns `false` when the saved progress already marks every phase as
 *   complete (nothing is collected), `true` after a full pass.
 */
async function runAllPhases(): Promise<boolean> {
  // Load diseases from DB
  console.log("\nLoading top 200 most common diseases from database...");
  const dbDiseases = await loadDiseasesFromDb();
  console.log(` ${dbDiseases.length} diseases loaded`);

  // Load progress
  const progress = loadProgress();

  // phase === 3 is the marker written at the very end of a completed run (see below)
  if (progress.phase === 3) {
    console.log(" ✓ All phases already complete. Delete .progress.json to re-run.");
    return false;
  }

  const startTime = Date.now();

  // ── Phase 1: Common diseases (200 images each) ──────────────────────────

  console.log("\n" + "─".repeat(60));
  console.log("PHASE 1: Common Diseases (200 images each)");
  console.log("─".repeat(60));

  // Only a phase-0 checkpoint carries a disease index to resume from
  const diseaseStart = progress.phase === 0 ? progress.phaseIndex : 0;
  if (diseaseStart > 0) {
    console.log(
      ` Resuming from disease #${diseaseStart + 1} (${(
        (diseaseStart / dbDiseases.length) *
        100
      ).toFixed(0)}% done)`,
    );
  }

  // Process diseases in parallel batches of DISEASE_CONCURRENCY
  for (let i = diseaseStart; i < dbDiseases.length; i += DISEASE_CONCURRENCY) {
    const batch = dbDiseases.slice(i, i + DISEASE_CONCURRENCY);
    const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
    const totalBatches = Math.ceil(dbDiseases.length / DISEASE_CONCURRENCY);
    const pct = Math.round((i / dbDiseases.length) * 100);

    console.log(
      `\n[Batch ${batchNum}/${totalBatches}] (${pct}%) Processing ${batch.length} diseases in parallel...`,
    );

    // Process all diseases in this batch concurrently; one rejection aborts the run
    await Promise.all(
      batch.map(async (d, batchIdx) => {
        const diseaseIdx = i + batchIdx;
        const classDir = resolve(DATASET_DIR, d.id);
        const queries = buildSearchQueries(d);
        // The image URL already stored on the DB row (if any) is passed along as a known URL
        const existingUrls = d.imageUrl ? [d.imageUrl] : [];

        console.log(` [${diseaseIdx + 1}/${dbDiseases.length}] ${d.name || d.id} (${d.plantId})`);

        await collectClassImages(
          d.id,
          queries,
          TARGET_PER_DISEASE,
          progress,
          classDir,
          existingUrls,
        );
      }),
    );

    // Save checkpoint: phase 0, at index i + batch.length
    progress.phase = 0;
    progress.phaseIndex = i + batch.length;
    saveProgress(progress);
  }

  // ── Phase 3: Healthy class ──────────────────────────────────────────────

  console.log("\n" + "─".repeat(60));
  console.log("PHASE 3: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
  const healthyCp = getClassProgress(progress, HEALTHY_CLASS);

  // Reconcile healthy class with files on disk
  const healthyActualCount = reconcileClassCount(healthyDir, healthyCp.count);
  if (healthyActualCount !== healthyCp.count) {
    healthyCp.count = healthyActualCount;
    saveProgress(progress);
  }

  const healthySeen = new Set(healthyCp.seenUrls);

  if (healthyCp.count >= TARGET_HEALTHY) {
    console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
  } else {
    // Collect all unique plants
    const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    const healthySources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    const totalHealthyUrls: string[] = [];
    let anyRemaining = false;

    for (const source of healthySources) {
      if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
      console.log(`\n Source: ${source.name}`);

      // Only the first 20 queries are tried per source
      for (const query of allHealthyQueries.slice(0, 20)) {
        if (totalHealthyUrls.length >= TARGET_HEALTHY) break;

        process.stdout.write(` "${query}"... `);
        // NOTE(review): the URL budget counts only URLs gathered in this run,
        // not images already on disk (healthyCp.count) — confirm this is intended.
        const result = await source.collector(
          query,
          TARGET_HEALTHY - totalHealthyUrls.length,
          healthySeen,
        );
        totalHealthyUrls.push(...result.urls);
        if (!result.exhausted) anyRemaining = true;
        console.log(`${result.urls.length} new`);
      }
    }

    // Write the seen set back to the checkpoint (presumably the collectors add to it — verify)
    healthyCp.seenUrls = Array.from(healthySeen);

    if (totalHealthyUrls.length > 0) {
      healthyCp.exhausted = !anyRemaining;
      saveProgress(progress);

      console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
      const healthyStartIndex = countImagesInDir(healthyDir);
      const { downloaded, failed } = await downloadBatch(
        totalHealthyUrls,
        healthyDir,
        healthyStartIndex,
      );

      // Re-count actual files on disk
      const newHealthyTotal = countImagesInDir(healthyDir);
      healthyCp.count = newHealthyTotal;
      healthyCp.downloaded += downloaded;
      healthyCp.failed += failed;

      if (healthyCp.count >= TARGET_HEALTHY) {
        healthyCp.exhausted = true;
      }

      const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
      console.log(
        ` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
      );
    } else {
      console.log(` ✗ No healthy images found`);
    }

    saveProgress(progress);
  }

  // ── Summary ────────────────────────────────────────────────────────────────

  // Mark all phases complete
  progress.phase = 3;
  progress.phaseIndex = 0;
  saveProgress(progress);

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  // Totals are summed over every class recorded in the progress file
  let totalDownloaded = 0;
  let totalFailed = 0;
  for (const [, cp] of Object.entries(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
  }

  console.log("\n" + "=".repeat(60));
  console.log(" ✅ ALL PHASES COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${hrs}h ${mins % 60}m`);
  console.log(` Downloaded: ${totalDownloaded} images`);
  console.log(` Failed: ${totalFailed} images`);
  console.log(` Dataset: ${DATASET_DIR}/`);

  return true;
}
|
|
|
|
/**
 * Returns a promise that settles once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms - Delay in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that resolves with no value when the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
// Start the run; a rejection from main() is logged and turned into exit code 1.
main().catch((fatal: unknown) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});
|