flag impl fin
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "fs";
|
||||
import { resolve, extname } from "path";
|
||||
|
||||
// Load .env.development for DB creds
|
||||
@@ -137,17 +137,32 @@ interface DuckDuckGoImageResult {
|
||||
width: number;
|
||||
}
|
||||
|
||||
/** Resume state tracked for a single image source of one class. */
interface SourceState {
  /** True once the source has reported that it has no further results to offer. */
  exhausted: boolean;
}
|
||||
|
||||
/** Persisted collection state for one dataset class (one directory of images). */
interface ClassProgress {
  /** Images currently credited to the class; reconciled against the img_* files on disk. */
  count: number;
  /** Running total of successful downloads reported by download batches. */
  downloaded: number;
  /** Running total of failed downloads reported by download batches. */
  failed: number;
  /** Every URL already considered for this class; stored as an array so it serializes to JSON. */
  seenUrls: string[];
  /** Class-level flag: nothing more to collect (target reached or no source has more results). */
  exhausted: boolean;
  /** Per-source exhaustion tracking — prevents re-scraping exhausted sources on resume */
  sources: {
    db: SourceState;
    duckduckgo: SourceState;
    inaturalist: SourceState;
    wikimedia: SourceState;
  };
}
|
||||
|
||||
/** Root of the on-disk progress checkpoint used to resume an interrupted run. */
interface Progress {
  /** ISO-8601 timestamp string (initialised from `new Date().toISOString()`). */
  lastUpdated: string;
  /** Per-class progress, keyed by class id. */
  classes: Record<string, ClassProgress>;
  /**
   * Phase checkpoint: 0=core, 1=full, 2=healthy. On resume, skip to this phase.
   * The run writes 3 once every phase has completed.
   */
  phase: number;
  /**
   * Index within the current phase's disease array. On resume, skip to this index.
   * It is stored as "last finished index + 1", i.e. the next item to process.
   */
  phaseIndex: number;
}
|
||||
|
||||
// ─── DB Loading ──────────────────────────────────────────────────────────────
|
||||
@@ -358,7 +373,7 @@ async function searchImagesCommons(
|
||||
srlimit: "50",
|
||||
sroffset: String(sroffset),
|
||||
format: "json",
|
||||
origin: "*", // server-side API call
|
||||
// No origin needed — server-side fetch, Wikimedia ignores CORS headers on API calls
|
||||
});
|
||||
|
||||
const url = `https://commons.wikimedia.org/w/api.php?${params}`;
|
||||
@@ -472,9 +487,40 @@ async function downloadBatch(
|
||||
|
||||
/**
 * Load the resumable progress checkpoint from PROGRESS_FILE.
 *
 * - Missing file: returns a fresh checkpoint (phase 0, index 0, no classes).
 * - Readable file: parses it and back-fills fields that older checkpoint
 *   files did not have (`phase`, `phaseIndex`, `classes`, and each class's
 *   `sources` / `seenUrls`).
 * - Unreadable or malformed file: logs a warning and returns a fresh
 *   checkpoint instead of throwing.
 *
 * @returns A fully populated Progress object; never throws.
 */
function loadProgress(): Progress {
  // Single definition of the "start from scratch" state so every fallback
  // path returns the same, complete shape.
  const freshProgress = (): Progress => ({
    lastUpdated: new Date().toISOString(),
    classes: {},
    phase: 0,
    phaseIndex: 0,
  });

  if (!existsSync(PROGRESS_FILE)) {
    return freshProgress();
  }

  try {
    const parsed: unknown = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8"));
    // Valid JSON that is not an object (null, array, string, ...) cannot be a
    // checkpoint; treat it exactly like a corrupt file.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      throw new TypeError("progress file does not contain a JSON object");
    }
    const raw = parsed as Partial<Progress>;

    // Backward compat: ensure new fields exist
    raw.lastUpdated ??= new Date().toISOString();
    raw.phase ??= 0;
    raw.phaseIndex ??= 0;
    raw.classes ??= {};

    // Ensure each class has the sources field
    for (const cp of Object.values(raw.classes) as Partial<ClassProgress>[]) {
      cp.sources ??= {
        db: { exhausted: false },
        duckduckgo: { exhausted: false },
        inaturalist: { exhausted: false },
        wikimedia: { exhausted: false },
      };
      cp.seenUrls ??= [];
    }

    return raw as Progress;
  } catch {
    console.warn(" ⚠ Corrupt progress file, starting fresh");
    return freshProgress();
  }
}
|
||||
|
||||
function saveProgress(progress: Progress): void {
|
||||
@@ -490,6 +536,12 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
|
||||
failed: 0,
|
||||
seenUrls: [],
|
||||
exhausted: false,
|
||||
sources: {
|
||||
db: { exhausted: false },
|
||||
duckduckgo: { exhausted: false },
|
||||
inaturalist: { exhausted: false },
|
||||
wikimedia: { exhausted: false },
|
||||
},
|
||||
};
|
||||
}
|
||||
return progress.classes[classId];
|
||||
@@ -514,6 +566,37 @@ function buildHealthyQueries(plant: string): string[] {
|
||||
];
|
||||
}
|
||||
|
||||
// ─── File Reconciliation ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * Count actual image files in a class directory.
 *
 * Only regular files whose name starts with `img_` are counted, so a
 * subdirectory that happens to share the prefix is not mistaken for an image.
 *
 * @param classDir Path of the class directory to inspect.
 * @returns The number of matching files, or 0 if the directory does not
 *          exist or cannot be read (best-effort: read errors are not thrown).
 */
function countImagesInDir(classDir: string): number {
  if (!existsSync(classDir)) return 0;
  try {
    return readdirSync(classDir, { withFileTypes: true }).filter(
      (entry) => entry.isFile() && entry.name.startsWith("img_"),
    ).length;
  } catch {
    // Covers a non-directory path, a permission error, or the directory
    // disappearing between the existence check and the read.
    return 0;
  }
}
|
||||
|
||||
/**
 * Reconcile a class's progress count with actual files on disk.
 *
 * If files were deleted after the progress file was saved, the count is
 * adjusted downward so the missing images get re-downloaded. The count is
 * never raised: when the directory holds at least as many images as the
 * checkpoint claims, the checkpoint value is returned unchanged.
 *
 * @param classDir Directory holding the class's downloaded images.
 * @param progressCount Image count recorded in the progress checkpoint.
 * @returns The reconciled count (the smaller of the two values).
 */
function reconcileClassCount(classDir: string, progressCount: number): number {
  const fileCount = countImagesInDir(classDir);
  if (fileCount >= progressCount) {
    return progressCount;
  }
  console.log(`   ↻ File count (${fileCount}) < progress count (${progressCount}) — reconciling`);
  return fileCount;
}
|
||||
|
||||
// ─── Dataset Collection ─────────────────────────────────────────────────────
|
||||
|
||||
async function collectClassImages(
|
||||
@@ -526,14 +609,32 @@ async function collectClassImages(
|
||||
fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
|
||||
): Promise<void> {
|
||||
const cp = getClassProgress(progress, classId);
|
||||
|
||||
// ── Reconcile with actual files on disk ─────────────────────────────────
|
||||
const actualCount = reconcileClassCount(classDir, cp.count);
|
||||
if (actualCount !== cp.count) {
|
||||
cp.count = actualCount;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
const seenUrls = new Set(cp.seenUrls);
|
||||
const sources = cp.sources;
|
||||
|
||||
if (cp.count >= target) {
|
||||
console.log(` ✓ Already have ${cp.count}/${target}`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (cp.exhausted) {
|
||||
// Check if ALL sources are exhausted
|
||||
const allExhausted =
|
||||
sources.db.exhausted &&
|
||||
sources.duckduckgo.exhausted &&
|
||||
sources.inaturalist.exhausted &&
|
||||
sources.wikimedia.exhausted;
|
||||
|
||||
if (allExhausted) {
|
||||
cp.exhausted = true;
|
||||
saveProgress(progress);
|
||||
console.log(` ✓ Exhausted (${cp.count}/${target})`);
|
||||
return;
|
||||
}
|
||||
@@ -541,73 +642,111 @@ async function collectClassImages(
|
||||
mkdirSync(classDir, { recursive: true });
|
||||
|
||||
const allUrls: string[] = [];
|
||||
let exhausted = false;
|
||||
let anyNewResults = false;
|
||||
const needed = target - cp.count;
|
||||
|
||||
// ── Source 0: Existing DB URLs ──────────────────────────────────────────
|
||||
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
|
||||
if (freshDbUrls.length > 0) {
|
||||
console.log(` DB: ${freshDbUrls.length} existing URLs`);
|
||||
for (const url of freshDbUrls) {
|
||||
if (allUrls.length >= target) break;
|
||||
seenUrls.add(url);
|
||||
allUrls.push(url);
|
||||
if (!sources.db.exhausted) {
|
||||
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
|
||||
if (freshDbUrls.length > 0) {
|
||||
console.log(` DB: ${freshDbUrls.length} existing URLs`);
|
||||
for (const url of freshDbUrls) {
|
||||
if (allUrls.length >= needed) break;
|
||||
seenUrls.add(url);
|
||||
allUrls.push(url);
|
||||
}
|
||||
if (freshDbUrls.length > 0) anyNewResults = true;
|
||||
}
|
||||
// DB source is always "exhausted" after processing its initial URLs
|
||||
sources.db.exhausted = true;
|
||||
}
|
||||
|
||||
// ── Source 1: DuckDuckGo ──────────────────────────────────────────────
|
||||
// Skip DDG in fast mode (full set — DDG is slowest source)
|
||||
if (!fastMode && allUrls.length < target) {
|
||||
if (!fastMode && !sources.duckduckgo.exhausted && allUrls.length < needed) {
|
||||
for (const query of queries) {
|
||||
if (allUrls.length >= target) break;
|
||||
if (allUrls.length >= needed) break;
|
||||
process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
|
||||
const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
|
||||
const result = await collectImagesDuckDuckGo(query, needed - allUrls.length, seenUrls);
|
||||
allUrls.push(...result.urls);
|
||||
if (result.exhausted) exhausted = true;
|
||||
if (result.exhausted) {
|
||||
sources.duckduckgo.exhausted = true;
|
||||
}
|
||||
if (result.urls.length > 0) anyNewResults = true;
|
||||
console.log(`${result.urls.length} new`);
|
||||
if (allUrls.length >= target) break;
|
||||
if (allUrls.length >= needed) break;
|
||||
}
|
||||
// If DDG never gave us anything, mark exhausted to avoid re-trying
|
||||
if (!anyNewResults && sources.duckduckgo.exhausted) {
|
||||
/* already marked */
|
||||
}
|
||||
}
|
||||
|
||||
// ── Source 2: iNaturalist ──────────────────────────────────────────────
|
||||
if (allUrls.length < target) {
|
||||
if (!sources.inaturalist.exhausted && allUrls.length < needed) {
|
||||
const primaryQuery = queries[0];
|
||||
console.log(` iNat: Searching...`);
|
||||
const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
|
||||
const result = await searchImagesInaturalist(primaryQuery, needed - allUrls.length, seenUrls);
|
||||
allUrls.push(...result.urls);
|
||||
if (result.exhausted) exhausted = true;
|
||||
if (result.exhausted) sources.inaturalist.exhausted = true;
|
||||
if (result.urls.length > 0) anyNewResults = true;
|
||||
console.log(` iNat: ${result.urls.length} images`);
|
||||
}
|
||||
|
||||
// ── Source 3: Wikimedia Commons ────────────────────────────────────────
|
||||
if (allUrls.length < target) {
|
||||
if (!sources.wikimedia.exhausted && allUrls.length < needed) {
|
||||
const primaryQuery = queries[0];
|
||||
console.log(` Commons: Searching...`);
|
||||
const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
|
||||
const result = await searchImagesCommons(primaryQuery, needed - allUrls.length, seenUrls);
|
||||
allUrls.push(...result.urls);
|
||||
if (result.exhausted) exhausted = true;
|
||||
if (result.exhausted) sources.wikimedia.exhausted = true;
|
||||
if (result.urls.length > 0) anyNewResults = true;
|
||||
console.log(` Commons: ${result.urls.length} images`);
|
||||
}
|
||||
|
||||
if (allUrls.length === 0) {
|
||||
cp.exhausted = exhausted;
|
||||
cp.exhausted = true;
|
||||
saveProgress(progress);
|
||||
console.log(` ✗ No images found`);
|
||||
console.log(` ✗ No images found — exhausted`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!anyNewResults && allUrls.length > 0) {
|
||||
// Only DB URLs survived — nothing more will come from searches
|
||||
cp.exhausted = true;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
// Save progress with seen URLs BEFORE downloading
|
||||
cp.seenUrls = Array.from(seenUrls);
|
||||
cp.exhausted = exhausted;
|
||||
saveProgress(progress);
|
||||
|
||||
console.log(` Downloading ${allUrls.length} images...`);
|
||||
|
||||
const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);
|
||||
// Use actual file count as start index so filenames don't have gaps
|
||||
const startIndex = countImagesInDir(classDir);
|
||||
const { downloaded, failed } = await downloadBatch(allUrls, classDir, startIndex);
|
||||
|
||||
cp.count += downloaded;
|
||||
// Re-count actual files on disk after download (more reliable than tracking)
|
||||
const newTotal = countImagesInDir(classDir);
|
||||
cp.count = newTotal;
|
||||
cp.downloaded += downloaded;
|
||||
cp.failed += failed;
|
||||
|
||||
// Check if all sources exhausted
|
||||
if (
|
||||
sources.db.exhausted &&
|
||||
sources.duckduckgo.exhausted &&
|
||||
sources.inaturalist.exhausted &&
|
||||
sources.wikimedia.exhausted
|
||||
) {
|
||||
cp.exhausted = true;
|
||||
}
|
||||
|
||||
// Target reached — mark the class as finished so later runs skip it
|
||||
if (cp.count >= target) {
|
||||
cp.exhausted = true;
|
||||
}
|
||||
|
||||
saveProgress(progress);
|
||||
|
||||
const pct = Math.round((cp.count / target) * 100);
|
||||
@@ -645,7 +784,12 @@ async function main() {
|
||||
console.log("PHASE 1: Core Diseases (100 images each)");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
for (let i = 0; i < coreDiseases.length; i++) {
|
||||
const coreStart = progress.phase === 0 ? progress.phaseIndex : 0;
|
||||
if (coreStart > 0) {
|
||||
console.log(` Resuming from disease #${coreStart + 1} (${((coreStart / coreDiseases.length) * 100).toFixed(0)}% done)`);
|
||||
}
|
||||
|
||||
for (let i = coreStart; i < coreDiseases.length; i++) {
|
||||
const d = coreDiseases[i];
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
@@ -655,6 +799,11 @@ async function main() {
|
||||
console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
|
||||
|
||||
await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
|
||||
|
||||
// Save checkpoint: phase 0, resume at index i + 1 (item i is finished)
|
||||
progress.phase = 0;
|
||||
progress.phaseIndex = i + 1;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
// ── Phase 2: Full set ──────────────────────────────────────────────────
|
||||
@@ -663,7 +812,12 @@ async function main() {
|
||||
console.log("PHASE 2: Full Disease Set (10 images each)");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
for (let i = 0; i < fullDiseases.length; i++) {
|
||||
const fullStart = progress.phase === 1 ? progress.phaseIndex : 0;
|
||||
if (fullStart > 0) {
|
||||
console.log(` Resuming from disease #${fullStart + 1} (${((fullStart / fullDiseases.length) * 100).toFixed(0)}% done)`);
|
||||
}
|
||||
|
||||
for (let i = fullStart; i < fullDiseases.length; i++) {
|
||||
const d = fullDiseases[i];
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
@@ -673,6 +827,11 @@ async function main() {
|
||||
console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
|
||||
|
||||
await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
|
||||
|
||||
// Save checkpoint: phase 1, resume at index i + 1 (item i is finished)
|
||||
progress.phase = 1;
|
||||
progress.phaseIndex = i + 1;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
// ── Phase 3: Healthy class ──────────────────────────────────────────────
|
||||
@@ -683,6 +842,14 @@ async function main() {
|
||||
|
||||
const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
|
||||
const healthyCp = getClassProgress(progress, HEALTHY_CLASS);
|
||||
|
||||
// Reconcile healthy class with files on disk
|
||||
const healthyActualCount = reconcileClassCount(healthyDir, healthyCp.count);
|
||||
if (healthyActualCount !== healthyCp.count) {
|
||||
healthyCp.count = healthyActualCount;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
const healthySeen = new Set(healthyCp.seenUrls);
|
||||
|
||||
if (healthyCp.count >= TARGET_HEALTHY) {
|
||||
@@ -730,16 +897,23 @@ async function main() {
|
||||
saveProgress(progress);
|
||||
|
||||
console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
|
||||
const healthyStartIndex = countImagesInDir(healthyDir);
|
||||
const { downloaded, failed } = await downloadBatch(
|
||||
totalHealthyUrls,
|
||||
healthyDir,
|
||||
healthyCp.count,
|
||||
healthyStartIndex,
|
||||
);
|
||||
|
||||
healthyCp.count += downloaded;
|
||||
// Re-count actual files on disk
|
||||
const newHealthyTotal = countImagesInDir(healthyDir);
|
||||
healthyCp.count = newHealthyTotal;
|
||||
healthyCp.downloaded += downloaded;
|
||||
healthyCp.failed += failed;
|
||||
|
||||
if (healthyCp.count >= TARGET_HEALTHY) {
|
||||
healthyCp.exhausted = true;
|
||||
}
|
||||
|
||||
const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
|
||||
console.log(
|
||||
` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
|
||||
@@ -753,6 +927,11 @@ async function main() {
|
||||
|
||||
// ── Summary ────────────────────────────────────────────────────────────────
|
||||
|
||||
// Mark all phases complete
|
||||
progress.phase = 3;
|
||||
progress.phaseIndex = 0;
|
||||
saveProgress(progress);
|
||||
|
||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
||||
const mins = Math.floor(elapsed / 60);
|
||||
const hrs = Math.floor(mins / 60);
|
||||
@@ -765,7 +944,7 @@ async function main() {
|
||||
}
|
||||
|
||||
console.log("\n" + "=".repeat(60));
|
||||
console.log("COMPLETE");
|
||||
console.log(" ✅ ALL PHASES COMPLETE");
|
||||
console.log("=".repeat(60));
|
||||
console.log(` Time: ${hrs}h ${mins % 60}m`);
|
||||
console.log(` Downloaded: ${totalDownloaded} images`);
|
||||
|
||||
Reference in New Issue
Block a user