scripting
This commit is contained in:
660
apps/web/scripts/scrape-training-dataset.ts
Normal file
660
apps/web/scripts/scrape-training-dataset.ts
Normal file
@@ -0,0 +1,660 @@
|
||||
#!/usr/bin/env node
|
||||
/**
 * scrape-training-dataset.ts
 *
 * Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
 *
 * Targets:
 * - 200 images per disease class (93 diseases)
 * - 400 images for the "healthy" class
 * - Full resolution images stored in data/dataset/{class_id}/
 *
 * DuckDuckGo approach (no API key needed):
 * 1. Fetch the main search page to extract a vqd (query) token
 * 2. Use the vqd token to paginate through image results
 * 3. Download each image to the dataset directory
 *
 * Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
 *
 * Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
 */
|
||||
|
||||
import "dotenv/config";
import {
  existsSync,
  mkdirSync,
  readFileSync,
  readdirSync,
  renameSync,
  statSync,
  writeFileSync,
} from "fs";
import { extname, join, resolve } from "path";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Knowledge-base inputs, resolved relative to this script's directory.
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");

// Output root (one sub-directory per class) and the resume-state file inside it.
const DATASET_DIR = resolve(__dirname, "../data/dataset");
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");
|
||||
|
||||
/** How many images to gather for each disease class. */
const TARGET_PER_DISEASE = 200;

/** How many images to gather for the "healthy" class (twice the per-disease target). */
const TARGET_HEALTHY = 400;

/** Pause before each DuckDuckGo search request, in milliseconds. */
const SEARCH_DELAY = 1500;

/** Pause after each image download, in milliseconds. */
const DOWNLOAD_DELAY = 300;

/** Number of downloads started together in one chunk. */
const CONCURRENT_DOWNLOADS = 5;

/** Smallest accepted image, in bytes (filters out tiny placeholders). */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Largest accepted image, in bytes. */
const MAX_IMAGE_SIZE = 10 * 2 ** 20; // 10MB

/** Response content types that are accepted as images. */
const ALLOWED_CONTENT_TYPES = [
  "image/jpeg",
  "image/jpg",
  "image/png",
  "image/webp",
  "image/gif",
];

/** File extensions that are accepted when taken from an image URL. */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** User-Agent header sent with every request. */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One entry of the diseases JSON file.
 * Only the fields this script reads are typed; any other keys are kept as
 * `unknown` so they must be narrowed before use.
 */
interface DiseaseSeed {
  /** Class identifier; also used as the dataset sub-directory name. */
  id: string;
  /** `id` of the plant this disease belongs to. */
  plantId: string;
  /** Disease name used when building search queries. */
  name: string;
  [key: string]: unknown;
}
|
||||
|
||||
/**
 * One entry of the plants JSON file.
 * Only the fields this script reads are typed; any other keys are kept as
 * `unknown` so they must be narrowed before use.
 */
interface PlantSeed {
  /** Plant identifier referenced by `DiseaseSeed.plantId`. */
  id: string;
  /** Common name used when building search queries. */
  commonName: string;
  scientificName: string;
  [key: string]: unknown;
}
|
||||
|
||||
/**
 * A single entry of the `results` array returned by the DuckDuckGo image endpoint.
 * This script only reads `image` and, as a fallback, `url`.
 */
interface DuckDuckGoImageResult {
  /** Preferred URL to download. */
  image: string;
  /** Used as the download URL when `image` is empty. */
  url: string;
  // The remaining fields are declared but not read anywhere in this script.
  thumbnail: string;
  title: string;
  width: number;
  height: number;
}
|
||||
|
||||
/** Resume state kept for one dataset class inside the progress file. */
interface ClassProgress {
  /** Running image count for the class; compared against the class target. */
  count: number;
  /** Accumulated number of successful downloads. */
  downloaded: number;
  /** Accumulated number of failed downloads. */
  failed: number;
  /** Initialised to 0; not updated anywhere in this script. */
  skipped: number;
  /** Image URLs already encountered, persisted so duplicates are not collected again. */
  seenUrls: string[];
  /** True once search results for the class are considered used up. */
  exhausted: boolean;
}
|
||||
|
||||
/** Shape of the JSON document stored in the progress file. */
interface Progress {
  /** Per-class resume state, keyed by class id. */
  classes: Record<string, ClassProgress>;
  /** ISO timestamp refreshed on every save. */
  lastUpdated: string;
}
|
||||
|
||||
/** Class id (and dataset sub-directory name) used for images of healthy plants. */
const HEALTHY_CLASS = "healthy";
|
||||
|
||||
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load DuckDuckGo's HTML search page for `query` and pull the `vqd` token out
 * of it. The token is passed along with every image-results request.
 *
 * @param query Search terms.
 * @returns The token string found in the page.
 * @throws Error when the page request is not OK or no token is present.
 */
async function getVqdToken(query: string): Promise<string> {
  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

  const response = await fetch(searchPage, {
    signal: AbortSignal.timeout(15_000),
    headers: { "User-Agent": UA, Accept: "text/html" },
  });
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  const body = await response.text();

  // The token appears as vqd='<token>' or vqd="<token>" (":" is accepted as well as "=").
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
  }
  return found[1];
}
|
||||
|
||||
/**
 * Fetch one page of DuckDuckGo image results.
 *
 * A 429 response is retried after a 10 s pause, up to a fixed number of
 * attempts, instead of recursing without limit. A 403 response is treated as
 * an expired token and yields an empty page.
 *
 * @param query Search terms.
 * @param vqd   Token obtained from the search page.
 * @param page  Page number placed in the `p` query parameter.
 * @returns The `results` array of the response, or `[]` when it is absent.
 * @throws Error for any other non-OK status, or when rate limiting persists
 *         after all retries.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
): Promise<DuckDuckGoImageResult[]> {
  const maxRateLimitRetries = 5;
  const encodedQuery = encodeURIComponent(query);

  // NOTE(review): community DuckDuckGo clients appear to page via an `s`
  // offset / the response's `next` field and use `p` for safe-search.
  // Confirm that `p` really selects the page here.
  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${encodeURIComponent(vqd)}&o=json&p=${page}&f=,,,`;
  const referer = `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: referer,
      },
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] } | null;
      return Array.isArray(data?.results) ? data.results : [];
    }

    if (res.status === 429 && attempt < maxRateLimitRetries) {
      console.warn(" ⚠ Rate limited (429). Waiting 10s...");
      await sleep(10_000);
      continue; // Retry the same page
    }

    if (res.status === 403) {
      console.warn(" ⚠ Forbidden (403). Token may have expired.");
      return []; // Token expired — no more pages
    }

    // Includes a 429 that survived every retry; the caller stops paginating.
    throw new Error(`DuckDuckGo search failed: ${res.status}`);
  }
}
|
||||
|
||||
/**
 * Search DuckDuckGo images for `query`, paginating until `target` new URLs
 * have been gathered or the results dry up.
 *
 * A page counts as unproductive when it is empty or contributes no URL that
 * is both new and well-formed; three unproductive pages in a row end the
 * search with `exhausted: true`. This bounds the loop even when the service
 * keeps returning results that were all seen before.
 *
 * @param query    Search terms.
 * @param target   Maximum number of new URLs to return.
 * @param seenUrls URLs to skip; every returned URL is added to this set.
 * @returns The new URLs (at most `target`) and whether results ran out.
 *          A token or search request failure returns what was gathered so far
 *          with `exhausted: false`, so a transient error does not permanently
 *          mark the class as finished.
 */
async function collectImages(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const maxUnproductivePages = 3;
  const results: string[] = [];

  if (target <= 0) {
    return { urls: results, exhausted: false };
  }

  // Get vqd token
  let vqd: string;
  try {
    vqd = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: results, exhausted: false };
  }

  let page = 1;
  let unproductivePages = 0;
  let exhausted = false;

  while (results.length < target) {
    await sleep(SEARCH_DELAY);

    let pageResults: DuckDuckGoImageResult[];
    try {
      pageResults = await searchImagesDuckDuckGo(query, vqd, page);
    } catch (err) {
      console.warn(` ⚠ Search error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    let newCount = 0;
    for (const r of pageResults) {
      if (results.length >= target) break;

      const imgUrl = r.image || r.url;
      if (!imgUrl || seenUrls.has(imgUrl)) continue;

      // Skip malformed or non-HTTP URLs instead of letting `new URL` throw
      // and abort the whole run. URLs without a file extension are still
      // accepted (could be a CDN URL).
      let parsed: URL;
      try {
        parsed = new URL(imgUrl);
      } catch {
        continue;
      }
      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") continue;

      seenUrls.add(imgUrl);
      results.push(imgUrl);
      newCount++;
    }

    if (newCount === 0) {
      unproductivePages++;
      if (unproductivePages >= maxUnproductivePages) {
        exhausted = true;
        break;
      }
    } else {
      unproductivePages = 0;
    }

    page++;
  }

  return { urls: results, exhausted };
}
|
||||
|
||||
// ─── Image Download ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Download a single image and write it next to `destPath`.
 *
 * The response is rejected when the status is not OK, the content type is
 * not in `ALLOWED_CONTENT_TYPES`, or the size falls outside
 * `MIN_IMAGE_SIZE`..`MAX_IMAGE_SIZE` (checked against the Content-Length
 * header when present and again against the received bytes).
 *
 * The saved file keeps `destPath`'s name but its extension is replaced with
 * one derived from the response content type, since that describes the bytes
 * actually received; the URL's extension is only a fallback. GIF responses
 * are therefore saved as `.gif` rather than being mislabelled `.jpg`.
 *
 * @param url      Image URL to fetch.
 * @param destPath Target path; its extension is replaced as described above.
 * @returns `true` when a file was written, `false` on any rejection or error.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
      signal: AbortSignal.timeout(15_000),
    });

    const contentType = (res.headers.get("content-type") ?? "").toLowerCase();
    const contentLength = parseInt(res.headers.get("content-length") ?? "0", 10);

    const rejected =
      !res.ok ||
      !ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t)) ||
      (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) ||
      contentLength > MAX_IMAGE_SIZE;

    if (rejected) {
      // Release the unread body instead of leaving it pending.
      await res.body?.cancel();
      return false;
    }

    const buffer = Buffer.from(await res.arrayBuffer());

    // Double-check actual buffer size (Content-Length may be missing or wrong)
    if (buffer.length < MIN_IMAGE_SIZE || buffer.length > MAX_IMAGE_SIZE) return false;

    let ext: string;
    if (contentType.includes("jpeg") || contentType.includes("jpg")) {
      ext = ".jpg";
    } else if (contentType.includes("png")) {
      ext = ".png";
    } else if (contentType.includes("webp")) {
      ext = ".webp";
    } else if (contentType.includes("gif")) {
      ext = ".gif";
    } else {
      // Unrecognised but allowed content type: fall back to the URL, then to .jpg.
      const urlExt = extname(new URL(url).pathname).toLowerCase();
      ext = ALLOWED_EXTENSIONS.includes(urlExt) ? urlExt : ".jpg";
    }

    writeFileSync(destPath.replace(/\.\w+$/, ext), buffer);
    return true;
  } catch {
    // Network errors, timeouts and write failures all count as a failed download.
    return false;
  }
}
|
||||
|
||||
/**
 * Download `urls` into `classDir` in chunks of `CONCURRENT_DOWNLOADS`,
 * pausing `DOWNLOAD_DELAY` ms after each download.
 *
 * Every download gets its own file index, reserved synchronously before the
 * request starts, so downloads running in the same chunk can never target
 * the same `img_NNNN` file. Indices already present on disk are skipped, so
 * a resumed run does not overwrite images saved earlier even when
 * `startIndex` is lower than the highest existing index.
 *
 * @param urls       Image URLs to download.
 * @param classDir   Destination directory; created if missing.
 * @param startIndex First file index to try.
 * @returns Success and failure counts, plus the next index that would be tried.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;

  mkdirSync(classDir, { recursive: true });

  // Indices that already have a file on disk, whatever its extension.
  const takenIndices = new Set<number>();
  for (const name of readdirSync(classDir)) {
    const m = /^img_(\d+)\.\w+$/.exec(name);
    if (m) takenIndices.add(Number(m[1]));
  }

  let nextIndex = startIndex;
  const reserveIndex = (): number => {
    while (takenIndices.has(nextIndex)) nextIndex++;
    takenIndices.add(nextIndex);
    return nextIndex++;
  };

  // Process in chunks to control concurrency
  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const results = await Promise.all(
      chunk.map(async (url) => {
        // Reserved before the first await so each download owns a distinct index.
        const index = reserveIndex();
        const paddedIndex = String(index).padStart(4, "0");
        const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);

        const success = await downloadImage(url, destPath);
        await sleep(DOWNLOAD_DELAY);
        return { success, index };
      }),
    );

    for (const r of results) {
      if (r.success) downloaded++;
      else failed++;
    }
  }

  return { downloaded, failed, lastIndex: nextIndex };
}
|
||||
|
||||
// ─── Progress Tracking ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Read the saved progress from `PROGRESS_FILE`.
 *
 * @returns Empty progress when the file does not exist; otherwise the parsed
 *          contents, with a missing or invalid `classes` / `lastUpdated`
 *          field replaced by an empty default.
 * @throws Error naming the file when it cannot be read, is not valid JSON,
 *         or does not contain a JSON object. The file is deliberately not
 *         discarded, so existing progress is never lost silently.
 */
function loadProgress(): Progress {
  const now = new Date().toISOString();

  if (!existsSync(PROGRESS_FILE)) {
    return { lastUpdated: now, classes: {} };
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8"));
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Could not load progress file ${PROGRESS_FILE}: ${reason}`);
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(`Progress file ${PROGRESS_FILE} does not contain a JSON object`);
  }

  const stored = parsed as Partial<Progress>;
  const classes = stored.classes;
  const classesValid =
    typeof classes === "object" && classes !== null && !Array.isArray(classes);

  return {
    lastUpdated: typeof stored.lastUpdated === "string" ? stored.lastUpdated : now,
    classes: classesValid ? classes : {},
  };
}
|
||||
|
||||
/**
 * Stamp `progress` with the current time and persist it to `PROGRESS_FILE`.
 *
 * The JSON is written to a sibling temporary file and then renamed over the
 * real one, so an interrupt during the write cannot leave a truncated
 * progress file behind.
 *
 * @param progress State to persist; its `lastUpdated` field is overwritten.
 */
function saveProgress(progress: Progress): void {
  progress.lastUpdated = new Date().toISOString();

  const tmpFile = `${PROGRESS_FILE}.tmp`;
  writeFileSync(tmpFile, JSON.stringify(progress, null, 2));
  renameSync(tmpFile, PROGRESS_FILE);
}
|
||||
|
||||
/**
 * Look up the progress record for `classId`, creating and registering a
 * zeroed record when none exists yet.
 *
 * @param progress Progress document; mutated when a record is created.
 * @param classId  Class identifier.
 * @returns The record stored in `progress.classes` (same object, not a copy).
 */
function getClassProgress(progress: Progress, classId: string): ClassProgress {
  const existing = progress.classes[classId];
  if (existing) {
    return existing;
  }

  const created: ClassProgress = {
    count: 0,
    downloaded: 0,
    failed: 0,
    skipped: 0,
    seenUrls: [],
    exhausted: false,
  };
  progress.classes[classId] = created;
  return progress.classes[classId];
}
|
||||
|
||||
// ─── Search Query Building ──────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the search phrases tried, in order, for one disease class.
 *
 * @param disease Disease entry supplying the disease name.
 * @param plant   Matching plant entry, or `null`; when it is missing or has an
 *                empty common name, `disease.plantId` is used as the plant label.
 * @returns Four query strings.
 */
function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
  const diseaseLabel = disease.name;
  const plantLabel = plant?.commonName || disease.plantId;

  const queries: string[] = [];
  queries.push(`${diseaseLabel} ${plantLabel} leaf disease`);
  queries.push(`${plantLabel} ${diseaseLabel} symptoms`);
  queries.push(`${diseaseLabel} plant disease`);
  queries.push(`${plantLabel} diseased leaf`);
  return queries;
}
|
||||
|
||||
/**
 * Build the search phrases used to find images of a healthy specimen of `plant`.
 *
 * @param plant Plant entry supplying the common name.
 * @returns Four query strings.
 */
function buildHealthyQueries(plant: PlantSeed): string[] {
  const label = plant.commonName;
  const queries: string[] = [];
  queries.push(`healthy ${label} leaf`);
  queries.push(`${label} leaf closeup`);
  queries.push(`healthy ${label} plant`);
  queries.push(`${label} foliage`);
  return queries;
}
|
||||
|
||||
// ─── Dataset Collection ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collect images for one class: search with each query in turn, download the
 * URLs found, and persist the updated progress.
 *
 * Only the images still missing (`target` minus the class's current count)
 * are requested, so a resumed run does not overshoot the target. The class
 * is marked exhausted only when every query was run and every one reported
 * that its results ran out; a single dry query no longer blocks later
 * top-up runs.
 *
 * @param classId  Class identifier used as the key in `progress.classes`.
 * @param queries  Search phrases, tried in order.
 * @param target   Desired total number of images for the class.
 * @param progress Progress document; mutated and saved.
 * @param classDir Directory that receives the images; created if missing.
 */
async function collectClassImages(
  classId: string,
  queries: string[],
  target: number,
  progress: Progress,
  classDir: string,
): Promise<void> {
  const cp = getClassProgress(progress, classId);

  if (cp.count >= target) {
    console.log(` ✓ Already have ${cp.count}/${target} images`);
    return;
  }

  if (cp.exhausted) {
    console.log(` ✓ Already exhausted search results (${cp.count}/${target} images)`);
    return;
  }

  mkdirSync(classDir, { recursive: true });

  const seenUrls = new Set(cp.seenUrls);
  const needed = target - cp.count;
  const totalUrls: string[] = [];
  let exhaustedQueries = 0;

  // Search with each query until enough URLs have been found
  for (const query of queries) {
    if (totalUrls.length >= needed) break;

    console.log(` Searching: "${query}"...`);
    const result = await collectImages(query, needed - totalUrls.length, seenUrls);

    totalUrls.push(...result.urls);
    cp.seenUrls = Array.from(seenUrls);

    if (result.exhausted) {
      exhaustedQueries++;
    }
  }

  const exhausted = queries.length > 0 && exhaustedQueries === queries.length;

  if (totalUrls.length === 0) {
    cp.exhausted = exhausted;
    saveProgress(progress);
    console.log(` ✗ No images found for "${classId}"`);
    return;
  }

  console.log(` Found ${totalUrls.length} unique image URLs. Downloading...`);

  // Download the images
  const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);

  cp.count += downloaded;
  cp.downloaded += downloaded;
  cp.failed += failed;
  cp.exhausted = exhausted;

  saveProgress(progress);

  const pct = Math.round((cp.count / target) * 100);
  console.log(
    ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
  );
}
|
||||
|
||||
// ─── Main ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: loads the disease and plant lists, collects images for
 * every disease class and then for the "healthy" class, and prints a summary.
 *
 * The healthy class goes through `collectClassImages` like every other
 * class. That creates its output directory (previously never created, so
 * every healthy download failed on write), honours the persisted exhausted
 * flag, and keeps trying further plants' queries when one query runs dry.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("PLANT DISEASE DATASET COLLECTOR");
  console.log("=".repeat(60));

  // Load knowledge base
  const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
  const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];

  const plantMap = new Map<string, PlantSeed>();
  for (const p of plants) {
    plantMap.set(p.id, p);
  }

  console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
  console.log(
    `Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
  );
  console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
  console.log(`Output: ${DATASET_DIR}/`);
  console.log("");

  // Load progress
  mkdirSync(DATASET_DIR, { recursive: true });
  const progress = loadProgress();

  const startTime = Date.now();

  // ── Phase 1: Disease classes ──────────────────────────────────────────────

  console.log("─".repeat(60));
  console.log("PHASE 1: Disease Images");
  console.log("─".repeat(60));

  for (let i = 0; i < diseases.length; i++) {
    const disease = diseases[i];
    const plant = plantMap.get(disease.plantId) ?? null;
    const classDir = resolve(DATASET_DIR, disease.id);
    const queries = buildSearchQueries(disease, plant);

    const pct = Math.round((i / diseases.length) * 100);
    console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);

    await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
  }

  // ── Phase 2: Healthy class ────────────────────────────────────────────────

  console.log("\n" + "─".repeat(60));
  console.log("PHASE 2: Healthy Plant Images");
  console.log("─".repeat(60));

  const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);

  // One pool of healthy-plant queries covering every plant.
  const allHealthyQueries = plants.flatMap((plant) => buildHealthyQueries(plant));

  await collectClassImages(
    HEALTHY_CLASS,
    allHealthyQueries,
    TARGET_HEALTHY,
    progress,
    healthyDir,
  );

  // ── Summary ────────────────────────────────────────────────────────────────

  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const secs = elapsed % 60;

  let totalDownloaded = 0;
  let totalFailed = 0;
  let totalTarget = 0;

  for (const [classId, cp] of Object.entries(progress.classes)) {
    totalDownloaded += cp.downloaded || 0;
    totalFailed += cp.failed || 0;
    totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
  }

  const totalSize = await getDatasetSize();
  const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);

  console.log("\n" + "=".repeat(60));
  console.log("COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${mins}m ${secs}s`);
  console.log(` Downloaded: ${totalDownloaded} images`);
  console.log(` Failed: ${totalFailed} images`);
  console.log(` Target: ${totalTarget} images`);
  console.log(` Dataset size: ${sizeGb} GB`);
  console.log(` Dataset location: ${DATASET_DIR}/`);
  console.log("");
  console.log("Next steps:");
  console.log(" 1. Run the fine-tuning script to train on this dataset");
  console.log(" 2. The fine-tuning script will resize to 160×160 and augment");
  console.log("=".repeat(60));
}
|
||||
|
||||
/**
 * Sum the sizes of all class directories directly under `DATASET_DIR`.
 * Entries whose name starts with "." and top-level entries that are not
 * directories are ignored.
 *
 * @returns Total size in bytes, or 0 when the dataset directory is missing.
 */
async function getDatasetSize(): Promise<number> {
  if (!existsSync(DATASET_DIR)) return 0;

  return readdirSync(DATASET_DIR, { withFileTypes: true })
    .filter((item) => !item.name.startsWith(".") && item.isDirectory())
    .reduce((bytes, item) => bytes + dirSize(resolve(DATASET_DIR, item.name)), 0);
}
|
||||
|
||||
/**
 * Recursively add up the sizes of the regular files below `dirPath`.
 *
 * Best effort: if listing the directory or stat-ing a file throws, counting
 * for this directory stops and the bytes summed so far are returned.
 *
 * @param dirPath Directory to measure.
 * @returns Size in bytes.
 */
function dirSize(dirPath: string): number {
  let bytes = 0;
  try {
    for (const item of readdirSync(dirPath, { withFileTypes: true })) {
      const itemPath = join(dirPath, item.name);
      if (item.isFile()) {
        bytes += statSync(itemPath).size;
        continue;
      }
      if (item.isDirectory()) {
        bytes += dirSize(itemPath);
      }
    }
  } catch {
    // skip errors
  }
  return bytes;
}
|
||||
|
||||
/**
 * Return a promise that resolves after roughly `ms` milliseconds.
 *
 * @param ms Delay handed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
// Run the script; any error escaping main() is logged and exits with status 1.
main().catch((fatal: unknown) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user