flag impl fin

This commit is contained in:
2026-06-06 17:22:31 -04:00
parent db4c656730
commit 96de91e86c
12 changed files with 1025 additions and 65 deletions

View File

@@ -20,7 +20,7 @@
*/
import "dotenv/config";
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "fs";
import { resolve, extname } from "path";
// Load .env.development for DB creds
@@ -137,17 +137,32 @@ interface DuckDuckGoImageResult {
width: number;
}
/** Resume state tracked separately for each image source of a class. */
interface SourceState {
/** True once this source should not be queried again for the class on later runs. */
exhausted: boolean;
}
/** Persisted download progress for one dataset class (one entry of `Progress.classes`). */
interface ClassProgress {
/** Number of images currently credited to the class; reconciled against the files on disk. */
count: number;
/** Running total of successful downloads, accumulated across runs. */
downloaded: number;
/** Running total of failed downloads, accumulated across runs. */
failed: number;
/** URLs already collected for this class, persisted so they are not collected again. */
seenUrls: string[];
/** Class-level flag: set once the target is reached or every source is exhausted. */
exhausted: boolean;
/** Per-source exhaustion tracking — prevents re-scraping exhausted sources on resume */
sources: {
db: SourceState;
duckduckgo: SourceState;
inaturalist: SourceState;
wikimedia: SourceState;
};
}
/** Root object of the progress file: per-class state plus a phase checkpoint for resuming. */
interface Progress {
/** ISO-8601 timestamp string (produced with `new Date().toISOString()`). */
lastUpdated: string;
/** Per-class progress, keyed by class id. */
classes: Record<string, ClassProgress>;
/**
 * Phase checkpoint: 0=core, 1=full, 2=healthy. On resume, skip to this phase.
 * `main` also writes 3 once every phase has completed.
 */
phase: number;
/** Index within the current phase's disease array. On resume, skip to this index. */
phaseIndex: number;
}
// ─── DB Loading ──────────────────────────────────────────────────────────────
@@ -358,7 +373,7 @@ async function searchImagesCommons(
srlimit: "50",
sroffset: String(sroffset),
format: "json",
origin: "*", // server-side API call
// No origin needed — server-side fetch, Wikimedia ignores CORS headers on API calls
});
const url = `https://commons.wikimedia.org/w/api.php?${params}`;
@@ -472,9 +487,40 @@ async function downloadBatch(
/**
 * Load the persisted progress from PROGRESS_FILE.
 *
 * - Missing file: returns a fresh, empty progress object.
 * - Readable file: parses it and back-fills fields that older progress files
 *   did not have (`phase`, `phaseIndex`, `classes`, and per-class `sources` /
 *   `seenUrls`), so callers can rely on the full `Progress` shape.
 * - Unreadable or corrupt file: logs a warning and returns a fresh progress
 *   object instead of throwing.
 *
 * @returns A complete `Progress` object; never throws for a bad file.
 */
function loadProgress(): Progress {
  // One definition of the "nothing done yet" state. Every fallback path goes
  // through it, so none of them can return an object without the checkpoint
  // fields (`phase` / `phaseIndex`).
  const freshProgress = (): Progress => ({
    lastUpdated: new Date().toISOString(),
    classes: {},
    phase: 0,
    phaseIndex: 0,
  });

  if (!existsSync(PROGRESS_FILE)) {
    return freshProgress();
  }

  try {
    const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;

    // Backward compat: ensure new fields exist
    raw.phase ??= 0;
    raw.phaseIndex ??= 0;
    raw.classes ??= {};

    // Ensure each class has the sources field
    for (const key of Object.keys(raw.classes)) {
      const cp = raw.classes[key] as Partial<ClassProgress>;
      cp.sources ??= {
        db: { exhausted: false },
        duckduckgo: { exhausted: false },
        inaturalist: { exhausted: false },
        wikimedia: { exhausted: false },
      };
      cp.seenUrls ??= [];
    }

    return raw as Progress;
  } catch {
    // Read errors, JSON syntax errors and non-object payloads all land here.
    console.warn(" ⚠ Corrupt progress file, starting fresh");
    return freshProgress();
  }
}
function saveProgress(progress: Progress): void {
@@ -490,6 +536,12 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
failed: 0,
seenUrls: [],
exhausted: false,
sources: {
db: { exhausted: false },
duckduckgo: { exhausted: false },
inaturalist: { exhausted: false },
wikimedia: { exhausted: false },
},
};
}
return progress.classes[classId];
@@ -514,6 +566,37 @@ function buildHealthyQueries(plant: string): string[] {
];
}
// ─── File Reconciliation ───────────────────────────────────────────────────
/**
 * Tally the downloaded images present in a class directory.
 *
 * Only directory entries whose name begins with `img_` are counted.
 * A missing directory, or one that cannot be listed, yields 0.
 */
function countImagesInDir(classDir: string): number {
  if (!existsSync(classDir)) {
    return 0;
  }

  let total = 0;
  try {
    for (const entry of readdirSync(classDir)) {
      if (entry.startsWith("img_")) {
        total += 1;
      }
    }
  } catch {
    // Listing failed (e.g. the path is not a directory) — treat as empty.
    return 0;
  }
  return total;
}
/**
 * Bring a class's recorded count in line with what is actually on disk.
 *
 * When fewer image files exist than the progress file claims (for example
 * because files were deleted after the last save), the on-disk count wins
 * and a notice is logged, so the missing images get downloaded again.
 * The recorded count is never raised here.
 *
 * @returns The count the caller should use from now on.
 */
function reconcileClassCount(classDir: string, progressCount: number): number {
  const onDisk = countImagesInDir(classDir);
  const filesMissing = onDisk < progressCount;

  if (!filesMissing) {
    return progressCount;
  }

  console.log(` ↻ File count (${onDisk}) < progress count (${progressCount}) — reconciling`);
  return onDisk;
}
// ─── Dataset Collection ─────────────────────────────────────────────────────
async function collectClassImages(
@@ -526,14 +609,32 @@ async function collectClassImages(
fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
): Promise<void> {
const cp = getClassProgress(progress, classId);
// ── Reconcile with actual files on disk ─────────────────────────────────
const actualCount = reconcileClassCount(classDir, cp.count);
if (actualCount !== cp.count) {
cp.count = actualCount;
saveProgress(progress);
}
const seenUrls = new Set(cp.seenUrls);
const sources = cp.sources;
if (cp.count >= target) {
console.log(` ✓ Already have ${cp.count}/${target}`);
return;
}
if (cp.exhausted) {
// Check if ALL sources are exhausted
const allExhausted =
sources.db.exhausted &&
sources.duckduckgo.exhausted &&
sources.inaturalist.exhausted &&
sources.wikimedia.exhausted;
if (allExhausted) {
cp.exhausted = true;
saveProgress(progress);
console.log(` ✓ Exhausted (${cp.count}/${target})`);
return;
}
@@ -541,73 +642,111 @@ async function collectClassImages(
mkdirSync(classDir, { recursive: true });
const allUrls: string[] = [];
let exhausted = false;
let anyNewResults = false;
const needed = target - cp.count;
// ── Source 0: Existing DB URLs ──────────────────────────────────────────
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
if (freshDbUrls.length > 0) {
console.log(` DB: ${freshDbUrls.length} existing URLs`);
for (const url of freshDbUrls) {
if (allUrls.length >= target) break;
seenUrls.add(url);
allUrls.push(url);
if (!sources.db.exhausted) {
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
if (freshDbUrls.length > 0) {
console.log(` DB: ${freshDbUrls.length} existing URLs`);
for (const url of freshDbUrls) {
if (allUrls.length >= needed) break;
seenUrls.add(url);
allUrls.push(url);
}
if (freshDbUrls.length > 0) anyNewResults = true;
}
// DB source is always "exhausted" after processing its initial URLs
sources.db.exhausted = true;
}
// ── Source 1: DuckDuckGo ──────────────────────────────────────────────
// Skip DDG in fast mode (full set — DDG is slowest source)
if (!fastMode && allUrls.length < target) {
if (!fastMode && !sources.duckduckgo.exhausted && allUrls.length < needed) {
for (const query of queries) {
if (allUrls.length >= target) break;
if (allUrls.length >= needed) break;
process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
if (result.exhausted) {
sources.duckduckgo.exhausted = true;
}
if (result.urls.length > 0) anyNewResults = true;
console.log(`${result.urls.length} new`);
if (allUrls.length >= target) break;
if (allUrls.length >= needed) break;
}
// If DDG never gave us anything, mark exhausted to avoid re-trying
if (!anyNewResults && sources.duckduckgo.exhausted) {
/* already marked */
}
}
// ── Source 2: iNaturalist ──────────────────────────────────────────────
if (allUrls.length < target) {
if (!sources.inaturalist.exhausted && allUrls.length < needed) {
const primaryQuery = queries[0];
console.log(` iNat: Searching...`);
const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
const result = await searchImagesInaturalist(primaryQuery, needed - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
if (result.exhausted) sources.inaturalist.exhausted = true;
if (result.urls.length > 0) anyNewResults = true;
console.log(` iNat: ${result.urls.length} images`);
}
// ── Source 3: Wikimedia Commons ────────────────────────────────────────
if (allUrls.length < target) {
if (!sources.wikimedia.exhausted && allUrls.length < needed) {
const primaryQuery = queries[0];
console.log(` Commons: Searching...`);
const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
const result = await searchImagesCommons(primaryQuery, needed - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
if (result.exhausted) sources.wikimedia.exhausted = true;
if (result.urls.length > 0) anyNewResults = true;
console.log(` Commons: ${result.urls.length} images`);
}
if (allUrls.length === 0) {
cp.exhausted = exhausted;
cp.exhausted = true;
saveProgress(progress);
console.log(` ✗ No images found`);
console.log(` ✗ No images found — exhausted`);
return;
}
if (!anyNewResults && allUrls.length > 0) {
// Only DB URLs survived — nothing more will come from searches
cp.exhausted = true;
saveProgress(progress);
}
// Save progress with seen URLs BEFORE downloading
cp.seenUrls = Array.from(seenUrls);
cp.exhausted = exhausted;
saveProgress(progress);
console.log(` Downloading ${allUrls.length} images...`);
const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);
// Use actual file count as start index so filenames don't have gaps
const startIndex = countImagesInDir(classDir);
const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);
cp.count += downloaded;
// Re-count actual files on disk after download (more reliable than tracking)
const newTotal = countImagesInDir(classDir);
cp.count = newTotal;
cp.downloaded += downloaded;
cp.failed += failed;
// Check if all sources exhausted
if (
sources.db.exhausted &&
sources.duckduckgo.exhausted &&
sources.inaturalist.exhausted &&
sources.wikimedia.exhausted
) {
cp.exhausted = true;
}
// Don't mark exhausted if we still have room to grow
if (cp.count >= target) {
cp.exhausted = true;
}
saveProgress(progress);
const pct = Math.round((cp.count / target) * 100);
@@ -645,7 +784,12 @@ async function main() {
console.log("PHASE 1: Core Diseases (100 images each)");
console.log("─".repeat(60));
for (let i = 0; i < coreDiseases.length; i++) {
const coreStart = progress.phase === 0 ? progress.phaseIndex : 0;
if (coreStart > 0) {
console.log(` Resuming from disease #${coreStart + 1} (${((coreStart / coreDiseases.length) * 100).toFixed(0)}% done)`);
}
for (let i = coreStart; i < coreDiseases.length; i++) {
const d = coreDiseases[i];
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d);
@@ -655,6 +799,11 @@ async function main() {
console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
// Save checkpoint: phase 0, at index i
progress.phase = 0;
progress.phaseIndex = i + 1;
saveProgress(progress);
}
// ── Phase 2: Full set ──────────────────────────────────────────────────
@@ -663,7 +812,12 @@ async function main() {
console.log("PHASE 2: Full Disease Set (10 images each)");
console.log("─".repeat(60));
for (let i = 0; i < fullDiseases.length; i++) {
const fullStart = progress.phase === 1 ? progress.phaseIndex : 0;
if (fullStart > 0) {
console.log(` Resuming from disease #${fullStart + 1} (${((fullStart / fullDiseases.length) * 100).toFixed(0)}% done)`);
}
for (let i = fullStart; i < fullDiseases.length; i++) {
const d = fullDiseases[i];
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d);
@@ -673,6 +827,11 @@ async function main() {
console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
// Save checkpoint: phase 1, at index i
progress.phase = 1;
progress.phaseIndex = i + 1;
saveProgress(progress);
}
// ── Phase 3: Healthy class ──────────────────────────────────────────────
@@ -683,6 +842,14 @@ async function main() {
const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
const healthyCp = getClassProgress(progress, HEALTHY_CLASS);
// Reconcile healthy class with files on disk
const healthyActualCount = reconcileClassCount(healthyDir, healthyCp.count);
if (healthyActualCount !== healthyCp.count) {
healthyCp.count = healthyActualCount;
saveProgress(progress);
}
const healthySeen = new Set(healthyCp.seenUrls);
if (healthyCp.count >= TARGET_HEALTHY) {
@@ -730,16 +897,23 @@ async function main() {
saveProgress(progress);
console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
const healthyStartIndex = countImagesInDir(healthyDir);
const { downloaded, failed } = await downloadBatch(
totalHealthyUrls,
healthyDir,
healthyCp.count,
healthyStartIndex,
);
healthyCp.count += downloaded;
// Re-count actual files on disk
const newHealthyTotal = countImagesInDir(healthyDir);
healthyCp.count = newHealthyTotal;
healthyCp.downloaded += downloaded;
healthyCp.failed += failed;
if (healthyCp.count >= TARGET_HEALTHY) {
healthyCp.exhausted = true;
}
const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
console.log(
` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
@@ -753,6 +927,11 @@ async function main() {
// ── Summary ────────────────────────────────────────────────────────────────
// Mark all phases complete
progress.phase = 3;
progress.phaseIndex = 0;
saveProgress(progress);
const elapsed = Math.round((Date.now() - startTime) / 1000);
const mins = Math.floor(elapsed / 60);
const hrs = Math.floor(mins / 60);
@@ -765,7 +944,7 @@ async function main() {
}
console.log("\n" + "=".repeat(60));
console.log("COMPLETE");
console.log(" ✅ ALL PHASES COMPLETE");
console.log("=".repeat(60));
console.log(` Time: ${hrs}h ${mins % 60}m`);
console.log(` Downloaded: ${totalDownloaded} images`);