flagging
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
11
apps/web/scripts/.flagged-content-review-needed.md
Normal file
11
apps/web/scripts/.flagged-content-review-needed.md
Normal file
@@ -0,0 +1,11 @@
|
||||
# 🚩 Flagged Content Review — Nothing to Review
|
||||
|
||||
Generated: 2026-06-06T21:02:03.301Z
|
||||
|
||||
**No content has been flagged for review yet.**
|
||||
|
||||
Flagged items will appear here once users flag content for manual review.
|
||||
|
||||
---
|
||||
|
||||
_Report generated with min-flags=1_
|
||||
53
apps/web/scripts/apply-flag-migration.ts
Normal file
53
apps/web/scripts/apply-flag-migration.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
/**
|
||||
* apply-flag-migration.ts
|
||||
*
|
||||
* Applies the flagged_content table migration to Turso.
|
||||
* Run with: npx tsx scripts/apply-flag-migration.ts
|
||||
*/
|
||||
|
||||
import dotenv from "dotenv";
|
||||
import path from "node:path";
|
||||
|
||||
// Pick the dotenv file that lives one directory above this script:
// `.env.production` when NODE_ENV is exactly "production", `.env.development` otherwise.
const isProductionRun = process.env.NODE_ENV === "production";
const envFile = isProductionRun ? "../.env.production" : "../.env.development";

// Populate process.env from that file so DATABASE_URL / DATABASE_TOKEN can be read in main().
dotenv.config({ path: path.resolve(__dirname, envFile) });
|
||||
|
||||
import { createClient } from "@libsql/client";
|
||||
|
||||
/**
 * Creates the `flagged_content` table and its two lookup indexes.
 *
 * Every statement uses `IF NOT EXISTS`, so running the script again against a
 * database that already has the table/indexes changes nothing.
 *
 * @throws Error when DATABASE_URL is not set, or when any statement fails.
 */
async function main(): Promise<void> {
  const url = process.env.DATABASE_URL;
  if (!url) {
    // Fail with a clear message instead of handing `undefined` to the client.
    throw new Error("DATABASE_URL is not set; cannot apply the flagged_content migration.");
  }

  const db = createClient({
    url,
    authToken: process.env.DATABASE_TOKEN,
  });

  // Executed in order: the indexes reference the table created first.
  const statements = [
    `
    CREATE TABLE IF NOT EXISTS flagged_content (
      id text PRIMARY KEY NOT NULL,
      content_type text NOT NULL,
      content_id text NOT NULL,
      field_name text NOT NULL,
      notes text DEFAULT '',
      flag_count integer DEFAULT 1 NOT NULL,
      created_at text DEFAULT (datetime('now')) NOT NULL,
      updated_at text DEFAULT (datetime('now')) NOT NULL
    )
    `,
    `
    CREATE INDEX IF NOT EXISTS idx_flagged_content_type ON flagged_content (content_type)
    `,
    `
    CREATE INDEX IF NOT EXISTS idx_flagged_content_id ON flagged_content (content_id)
    `,
  ];

  try {
    console.log("Applying migration: create flagged_content table...");

    for (const statement of statements) {
      await db.execute(statement);
    }

    console.log("Migration applied successfully.");
  } finally {
    // Release the connection whether or not a statement threw.
    db.close();
  }
}

// Script entry point: report any failure and exit non-zero so callers/CI notice.
main().catch((err: unknown) => {
  console.error("Migration failed:", err);
  process.exit(1);
});
|
||||
19
apps/web/scripts/check-progress.mjs
Normal file
19
apps/web/scripts/check-progress.mjs
Normal file
@@ -0,0 +1,19 @@
|
||||
import { createClient } from "@libsql/client";
|
||||
// Prints image-coverage progress for the `diseases` table: the overall totals,
// then a per-severity breakdown of rows that have a non-empty image_url.
const c = createClient({
  url: process.env.DATABASE_URL,
  authToken: process.env.DATABASE_TOKEN,
});

try {
  const r = await c.execute("SELECT COUNT(*) as cnt FROM diseases");
  const r2 = await c.execute(
    `SELECT SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has, SUM(CASE WHEN image_url IS NULL OR image_url = '' THEN 1 ELSE 0 END) as miss FROM diseases`,
  );
  const r3 = await c.execute(
    `SELECT severity, COUNT(*) as total, SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has FROM diseases GROUP BY severity ORDER BY severity`,
  );

  // SUM() over zero rows yields NULL, so fall back to 0 for an empty table.
  console.log(
    `Total: ${r.rows[0].cnt} | With images: ${r2.rows[0].has ?? 0} | Missing: ${r2.rows[0].miss ?? 0}`,
  );

  for (const row of r3.rows) {
    // A NULL severity forms its own group; label it instead of printing "undefined".
    const severity = String(row.severity ?? "(none)").padEnd(10);
    console.log(` ${severity}: ${row.has}/${row.total}`);
  }
} finally {
  // Close the client even when one of the queries throws.
  c.close();
}
|
||||
379
apps/web/scripts/generate-flagged-report.ts
Normal file
379
apps/web/scripts/generate-flagged-report.ts
Normal file
@@ -0,0 +1,379 @@
|
||||
/**
|
||||
* generate-flagged-report.ts
|
||||
*
|
||||
* Reads all flagged content from the database and generates a pretty
|
||||
* markdown report organized by content type. The report includes:
|
||||
* - Summary table with counts per content type
|
||||
* - Plant images flagged for review
|
||||
* - Disease images flagged for review
|
||||
* - Disease symptoms flagged for review
|
||||
* - Disease causes flagged for review
|
||||
* - Disease treatment steps flagged for review
|
||||
* - Disease prevention tips flagged for review
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/generate-flagged-report.ts [--min-flags N] [--output path/to/report.md]
|
||||
*
|
||||
* Options:
|
||||
* --min-flags Minimum flag count to include (default: 1)
|
||||
* --output Output path (default: scripts/.flagged-content-review-needed.md)
|
||||
*/
|
||||
|
||||
import dotenv from "dotenv";
|
||||
import path from "node:path";
|
||||
|
||||
// Load DB config from the dotenv file one directory above this script:
// `.env.production` when NODE_ENV is exactly "production", `.env.development` otherwise.
const isProductionRun = process.env.NODE_ENV === "production";
const envFile = isProductionRun ? "../.env.production" : "../.env.development";

dotenv.config({ path: path.resolve(__dirname, envFile) });
|
||||
import { createClient } from "@libsql/client";
|
||||
import fs from "node:fs";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reads the `--min-flags` threshold from an argv-style array.
 *
 * Both `--min-flags=N` and `--min-flags N` are accepted (the usage text of this
 * script shows the space-separated form). When the option is absent the
 * default of 1 is returned.
 *
 * @param argv Command-line arguments, e.g. `process.argv`.
 * @returns The requested minimum flag count.
 * @throws Error when the option is present but its value is not an integer,
 *   so a typo cannot silently turn into NaN in the SQL filter.
 */
function parseMinFlags(argv: readonly string[]): number {
  const inlinePrefix = "--min-flags=";
  const inlineArg = argv.find((a) => a.startsWith(inlinePrefix));
  const bareIndex = argv.indexOf("--min-flags");

  let raw: string | undefined;
  if (inlineArg !== undefined) {
    raw = inlineArg.slice(inlinePrefix.length);
  } else if (bareIndex >= 0) {
    // Space-separated form: the value is the next argument (may be missing).
    raw = argv[bareIndex + 1] ?? "";
  }

  if (raw === undefined) return 1;

  if (!/^-?\d+$/.test(raw.trim())) {
    throw new Error(`Invalid --min-flags value "${raw}": expected an integer.`);
  }
  return parseInt(raw, 10);
}

const MIN_FLAGS = parseMinFlags(process.argv);
|
||||
/**
 * Resolves the report destination from an argv-style array.
 *
 * Accepts `--output=PATH` and `--output PATH`. Everything after the first `=`
 * is taken as the path, so paths that themselves contain `=` stay intact.
 * An absent or empty value falls back to `fallback`.
 *
 * @param argv Command-line arguments, e.g. `process.argv`.
 * @param fallback Path used when no usable `--output` value is given.
 * @returns The path the report should be written to.
 */
function parseOutputPath(argv: readonly string[], fallback: string): string {
  const inlinePrefix = "--output=";
  const inlineArg = argv.find((a) => a.startsWith(inlinePrefix));
  if (inlineArg !== undefined) {
    const value = inlineArg.slice(inlinePrefix.length);
    return value !== "" ? value : fallback;
  }

  const bareIndex = argv.indexOf("--output");
  const next = bareIndex >= 0 ? argv[bareIndex + 1] : undefined;
  return next !== undefined && next !== "" ? next : fallback;
}

const OUTPUT_PATH = parseOutputPath(
  process.argv,
  path.join(__dirname, ".flagged-content-review-needed.md"),
);
|
||||
|
||||
// ─── DB Connection ──────────────────────────────────────────────────────────
|
||||
|
||||
// Module-wide database client. The URL and auth token are read from
// process.env; the non-null assertions mean a missing variable is not
// detected here.
const { DATABASE_URL: databaseUrl, DATABASE_TOKEN: databaseToken } = process.env;

const db = createClient({ url: databaseUrl!, authToken: databaseToken! });
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One row of the `flagged_content` table, as selected with `SELECT *`. */
interface FlaggedRow {
  /** Primary key of the flag record. */
  id: string;
  /** Kind of content that was flagged, e.g. "plant_image" or "disease_symptoms". */
  content_type: string;
  /** ID of the flagged plant (for "plant_image") or disease (other types). */
  content_id: string;
  /** Name of the flagged field on that content. */
  field_name: string;
  /** Free-text notes; the column defaults to '' but is not declared NOT NULL. */
  notes: string | null;
  /** Number of times this item has been flagged. */
  flag_count: number;
  /** Creation timestamp text (column default is SQLite `datetime('now')`). */
  created_at: string;
  /** Last-update timestamp text (column default is SQLite `datetime('now')`). */
  updated_at: string;
}
|
||||
|
||||
/** Columns this script selects from the `plants` table. */
interface PlantRow {
  /** Plant identifier; flagged "plant_image" rows and diseases refer to it. */
  id: string;

  /** Everyday name, used as the heading for flagged plant images. */
  common_name: string;

  /** Scientific name, rendered in italics next to the common name. */
  scientific_name: string;

  /** Botanical family, shown as "<family> family" in the report. */
  family: string;

  /** URL of the plant's image. */
  image_url: string;
}
|
||||
|
||||
/** Columns this script selects from the `diseases` table. */
interface DiseaseRow {
  /** Disease identifier; flagged non-plant rows refer to it via `content_id`. */
  id: string;
  /** Display name of the disease. */
  name: string;
  /** Scientific name, rendered in italics next to the display name. */
  scientific_name: string;
  /** ID of the plant this disease belongs to. */
  plant_id: string;
  /**
   * URL of the disease image. Nullable: other scripts in this change set
   * query `image_url IS NULL` on this table and type the column as
   * `string | null`.
   */
  image_url: string | null;
}
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Presentation metadata for each known `content_type`: the emoji and title
 * used for the section heading and the sentence printed below it.
 * Keys are inserted in the order of the rows listed here.
 */
const CONTENT_TYPE_LABELS: Record<string, { emoji: string; title: string; description: string }> = {};

for (const [contentType, emoji, title, description] of [
  [
    "plant_image",
    "🪴",
    "Plant Images Flagged for Review",
    "Plant images that users have flagged as potentially incorrect or low quality.",
  ],
  [
    "disease_image",
    "📸",
    "Disease Images Flagged for Review",
    "Disease symptom images that users have flagged as potentially incorrect or misleading.",
  ],
  [
    "disease_symptoms",
    "⚠️",
    "Disease Symptoms Flagged for Review",
    "Symptom descriptions that users have flagged as potentially inaccurate.",
  ],
  [
    "disease_causes",
    "🔍",
    "Disease Causes Flagged for Review",
    "Causes and contributing factors that users have flagged as potentially incorrect.",
  ],
  [
    "disease_treatment",
    "💊",
    "Disease Treatment Steps Flagged for Review",
    "Treatment instructions that users have flagged as potentially incorrect or harmful.",
  ],
  [
    "disease_prevention",
    "🛡️",
    "Disease Prevention Tips Flagged for Review",
    "Prevention tips that users have flagged as potentially incorrect or misleading.",
  ],
] as const) {
  CONTENT_TYPE_LABELS[contentType] = { emoji, title, description };
}
|
||||
|
||||
/**
 * Formats a stored timestamp for display in the report (en-US, short month,
 * with hour and minute, in the machine's local time zone).
 *
 * SQLite's `datetime('now')` produces `YYYY-MM-DD HH:MM:SS` in UTC with no
 * zone marker. Passing that text straight to `new Date()` makes the engine
 * treat it as local time, shifting the displayed time by the UTC offset, so
 * such values are first rewritten to an explicit-UTC ISO string.
 *
 * @param iso Timestamp text: SQLite `datetime()` output or an ISO-8601 string.
 * @returns The formatted date, or the input unchanged when it cannot be parsed.
 */
function formatDate(iso: string): string {
  const sqliteUtc = /^(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}(?:\.\d+)?)$/.exec(iso.trim());
  const d = new Date(sqliteUtc ? `${sqliteUtc[1]}T${sqliteUtc[2]}Z` : iso);

  // Show the raw value rather than the literal text "Invalid Date".
  if (Number.isNaN(d.getTime())) return iso;

  return d.toLocaleDateString("en-US", {
    year: "numeric",
    month: "short",
    day: "numeric",
    hour: "2-digit",
    minute: "2-digit",
  });
}
|
||||
|
||||
// ─── Main ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Builds the `?,?,?` placeholder list for an `IN (...)` clause with `count` bound values. */
function sqlPlaceholders(count: number): string {
  return Array.from({ length: count }, () => "?").join(",");
}

/** Loads the plants with the given IDs, keyed by ID. Issues no query for an empty list. */
async function fetchPlantsById(ids: readonly string[]): Promise<Map<string, PlantRow>> {
  const byId = new Map<string, PlantRow>();
  if (ids.length === 0) return byId;

  const rs = await db.execute({
    sql: `SELECT id, common_name, scientific_name, family, image_url FROM plants WHERE id IN (${sqlPlaceholders(ids.length)})`,
    args: [...ids],
  });
  for (const row of rs.rows as unknown as PlantRow[]) {
    byId.set(row.id, row);
  }
  return byId;
}

/** Loads the diseases with the given IDs, keyed by ID. Issues no query for an empty list. */
async function fetchDiseasesById(ids: readonly string[]): Promise<Map<string, DiseaseRow>> {
  const byId = new Map<string, DiseaseRow>();
  if (ids.length === 0) return byId;

  const rs = await db.execute({
    sql: `SELECT id, name, scientific_name, plant_id, image_url FROM diseases WHERE id IN (${sqlPlaceholders(ids.length)})`,
    args: [...ids],
  });
  for (const row of rs.rows as unknown as DiseaseRow[]) {
    byId.set(row.id, row);
  }
  return byId;
}

/**
 * Generates the flagged-content markdown report and writes it to OUTPUT_PATH.
 *
 * Steps:
 *  1. Select every `flagged_content` row with `flag_count >= MIN_FLAGS`.
 *  2. If there are none, write a short "nothing to review" report and stop.
 *  3. Look up the referenced diseases and plants so items can be shown by name.
 *  4. Emit a summary table, one detail section per content type, and a footer
 *     explaining how to clear flags.
 *
 * Content types that are not in the fixed section order are still listed
 * (after the known ones) so that every counted row appears in the report.
 * The database client is closed on every exit path.
 */
async function main(): Promise<void> {
  console.log(`📋 Generating flagged content report (min flags: ${MIN_FLAGS})...`);

  try {
    // Fetch flagged content
    const flaggedRs = await db.execute({
      sql: "SELECT * FROM flagged_content WHERE flag_count >= ? ORDER BY content_type, flag_count DESC, updated_at DESC",
      args: [MIN_FLAGS],
    });
    const flaggedRows = flaggedRs.rows as unknown as FlaggedRow[];

    if (flaggedRows.length === 0) {
      const report = [
        "# 🚩 Flagged Content Review — Nothing to Review",
        "",
        `Generated: ${new Date().toISOString()}`,
        "",
        "**No content has been flagged for review yet.**",
        "",
        "Flagged items will appear here once users flag content for manual review.",
        "",
        "---",
        "",
        `_Report generated with min-flags=${MIN_FLAGS}_`,
        "",
      ].join("\n");

      fs.writeFileSync(OUTPUT_PATH, report, "utf-8");
      console.log(`✅ Report written to ${OUTPUT_PATH} (no flagged items)`);
      return;
    }

    // "plant_image" rows point at a plant; every other type points at a disease.
    const plantIds = new Set<string>();
    const diseaseIds = new Set<string>();
    for (const row of flaggedRows) {
      (row.content_type === "plant_image" ? plantIds : diseaseIds).add(row.content_id);
    }

    // Diseases first, so the plants they reference can be fetched in the same plant query.
    const diseaseMap = await fetchDiseasesById([...diseaseIds]);
    for (const disease of diseaseMap.values()) {
      if (disease.plant_id) plantIds.add(disease.plant_id);
    }
    const plantMap = await fetchPlantsById([...plantIds]);

    // Group by content type (Map keeps first-seen order, i.e. the SQL ordering).
    const groups = new Map<string, FlaggedRow[]>();
    for (const row of flaggedRows) {
      const bucket = groups.get(row.content_type);
      if (bucket) {
        bucket.push(row);
      } else {
        groups.set(row.content_type, [row]);
      }
    }

    // ─── Build Report ──────────────────────────────────────────────────────

    const lines: string[] = [];
    const totalFlags = flaggedRows.reduce((sum, r) => sum + r.flag_count, 0);

    lines.push("# 🚩 Flagged Content — Manual Review Needed");
    lines.push("");
    lines.push(`Generated: ${new Date().toISOString()}`);
    lines.push("");
    lines.push(
      flaggedRows.length === 1
        ? `**${flaggedRows.length} item** flagged for review (${totalFlags} total flags).`
        : `**${flaggedRows.length} items** flagged for review (${totalFlags} total flags).`,
    );
    lines.push("");
    lines.push("Most data in this knowledge base is not reviewed by humans. ");
    lines.push("Items listed below have been flagged by users for manual review. ");
    lines.push("Please review each item and take appropriate action.");
    lines.push("");

    // Known types in their fixed order, then any other type present in the data
    // so the sections always add up to the totals.
    const orderedTypes = [
      "plant_image",
      "disease_image",
      "disease_symptoms",
      "disease_causes",
      "disease_treatment",
      "disease_prevention",
    ];
    const reportTypes = [
      ...orderedTypes,
      ...[...groups.keys()].filter((t) => !orderedTypes.includes(t)),
    ];

    // Summary table
    lines.push("## 📊 Summary");
    lines.push("");
    lines.push("| Content Type | Count | Total Flags |");
    lines.push("|---|---|---|");
    for (const type of reportTypes) {
      const items = groups.get(type);
      if (!items) continue;
      const label = CONTENT_TYPE_LABELS[type]?.title ?? type;
      const sumFlags = items.reduce((s, r) => s + r.flag_count, 0);
      lines.push(`| ${label} | ${items.length} | ${sumFlags} |`);
    }
    lines.push(`| **Total** | **${flaggedRows.length}** | **${totalFlags}** |`);
    lines.push("");
    lines.push("---");
    lines.push("");

    // Detail sections per content type
    for (const type of reportTypes) {
      const items = groups.get(type);
      if (!items) continue;

      const config = CONTENT_TYPE_LABELS[type];
      lines.push(`## ${config?.emoji ?? "📋"} ${config?.title ?? type}`);
      lines.push("");
      lines.push(config?.description ?? "");
      lines.push("");
      lines.push(`**${items.length} item${items.length === 1 ? "" : "s"} flagged**`);
      lines.push("");

      for (const item of items) {
        const isPlantImage = type === "plant_image";
        const plant = isPlantImage ? plantMap.get(item.content_id) : undefined;
        const disease = isPlantImage ? undefined : diseaseMap.get(item.content_id);

        // Heading falls back to the raw content ID when the referenced row is gone.
        let label = item.content_id;
        let plantLabel = "";
        if (plant) {
          label = `${plant.common_name} (_${plant.scientific_name}_)`;
          plantLabel = `${plant.family} family`;
        } else if (disease) {
          const plantName = plantMap.get(disease.plant_id)?.common_name ?? disease.plant_id;
          label = `${disease.name} (_${disease.scientific_name}_) on **${plantName}**`;
          plantLabel = `Affects: ${plantName}`;
        }

        const flagWord = item.flag_count === 1 ? "flag" : "flags";

        lines.push(`### ${label}`);
        lines.push("");
        // The IDs are what the SQL snippets in the footer need.
        lines.push(`- **Flag ID:** \`${item.id}\``);
        lines.push(`- **Content ID:** \`${item.content_id}\``);
        lines.push(`- **Field:** \`${item.field_name}\``);
        lines.push(`- **Flags:** ${item.flag_count} ${flagWord}`);
        lines.push(`- **First flagged:** ${formatDate(item.created_at)}`);
        lines.push(`- **Last flagged:** ${formatDate(item.updated_at)}`);
        if (plantLabel) {
          lines.push(`- **${plantLabel}**`);
        }
        if (item.notes) {
          // Keep multi-line notes on one line so they stay inside the bullet.
          const notes = item.notes.replace(/\s*\r?\n\s*/g, " ").trim();
          if (notes) lines.push(`- **User notes:** ${notes}`);
        }

        // Embed the flagged image itself for the two image content types.
        let imageUrl: string | null | undefined;
        let imageAlt = "";
        if (isPlantImage) {
          imageUrl = plant?.image_url;
          imageAlt = plant?.common_name ?? "";
        } else if (type === "disease_image") {
          imageUrl = disease?.image_url;
          imageAlt = disease?.name ?? "";
        }
        if (imageUrl) {
          lines.push("");
          lines.push(`![${imageAlt}](${imageUrl})`);
        }

        lines.push("");
      }

      lines.push("---");
      lines.push("");
    }

    // Footer
    lines.push("## ℹ️ How This Works");
    lines.push("");
    lines.push("1. **Users** click the 🚩 Flag button on any content they believe needs review.");
    lines.push("2. **The system** stores the flag in the database with a counter.");
    lines.push(
      "3. **This report** is generated by querying the database and formatting the results.",
    );
    lines.push("4. **Reviewers** go through each item and take action (fix, update, or dismiss).");
    lines.push("");
    lines.push("### Taking Action");
    lines.push("");
    lines.push("After reviewing an item, you can clear its flags by running:");
    lines.push("");
    lines.push("```sql");
    lines.push("DELETE FROM flagged_content WHERE id = '<item-id>';");
    lines.push("```");
    lines.push("");
    lines.push("Or clear all flags for a specific item by running:");
    lines.push("");
    lines.push("```sql");
    lines.push(
      "UPDATE flagged_content SET flag_count = 0 WHERE content_id = '<id>' AND field_name = '<field>';",
    );
    lines.push("```");
    lines.push("");
    lines.push("---");
    lines.push("");
    lines.push(`_Report generated with min-flags=${MIN_FLAGS}_`);

    // Write report
    fs.writeFileSync(OUTPUT_PATH, lines.join("\n"), "utf-8");
    console.log(`✅ Report written to ${OUTPUT_PATH}`);
    console.log(` ${flaggedRows.length} items, ${totalFlags} total flags`);
  } finally {
    // Release the connection on success, early return, and failure alike.
    db.close();
  }
}

// Script entry point: report any failure and exit non-zero.
main().catch((err: unknown) => {
  console.error("❌ Failed to generate report:", err);
  process.exit(1);
});
|
||||
@@ -2,59 +2,113 @@
|
||||
/**
|
||||
* scrape-training-dataset.ts
|
||||
*
|
||||
* Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
|
||||
* Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
|
||||
*
|
||||
* Targets:
|
||||
* - 200 images per disease class (93 diseases)
|
||||
* - 400 images for the "healthy" class
|
||||
* - Full resolution images stored in data/dataset/{class_id}/
|
||||
* Targets (tiered by plant type):
|
||||
* - Core plants (houseplants + common garden): 100 images per disease
|
||||
* - Full set (all 11,498 DB diseases): 10 images per disease
|
||||
* - Healthy: 400 images
|
||||
*
|
||||
* DuckDuckGo approach (no API key needed):
|
||||
* 1. Fetch the main search page to extract a vqd (query) token
|
||||
* 2. Use the vqd token to paginate through image results
|
||||
* 3. Download each image to the dataset directory
|
||||
* Sources (all free, no API keys):
|
||||
* 1. DB image_url — existing images already found
|
||||
* 2. DuckDuckGo — general web image search
|
||||
* 3. iNaturalist — real-world plant observation photos
|
||||
* 4. Wikimedia Commons — curated scientific/educational images
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
|
||||
*
|
||||
* Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
|
||||
* Progress: data/dataset/.progress.json — interrupt and resume safely.
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
|
||||
import { resolve, extname, join } from "path";
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||||
import { resolve, extname } from "path";
|
||||
|
||||
// Load .env.development for DB creds.
// Best-effort: variables already present in process.env are never overwritten,
// and a missing file is not an error.
const envPath = resolve(__dirname, "../.env.development");
try {
  for (const rawLine of readFileSync(envPath, "utf-8").split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line === "" || line.startsWith("#")) continue;

    const eqIdx = line.indexOf("=");
    if (eqIdx <= 0) continue; // no key before '=' (or no '=' at all)

    const key = line.slice(0, eqIdx).trim();
    let val = line.slice(eqIdx + 1).trim();

    // Drop one pair of matching surrounding quotes (KEY="value" / KEY='value'),
    // so quoted entries yield the same value here as when parsed by dotenv.
    const quote = val.charAt(0);
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!process.env[key]) process.env[key] = val;
  }
} catch (err: unknown) {
  // A missing env file is expected in some setups; anything else (permissions,
  // I/O errors) is worth surfacing, but still must not abort the script.
  const code =
    typeof err === "object" && err !== null && "code" in err
      ? (err as { code?: unknown }).code
      : undefined;
  if (code !== "ENOENT") {
    console.warn(`⚠ Could not read ${envPath}: ${err instanceof Error ? err.message : "unknown"}`);
  }
}
|
||||
|
||||
import { getDb, closeDb } from "@/lib/db/index";
|
||||
import { diseases } from "@/lib/db/schema";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
|
||||
const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");
|
||||
|
||||
const DATASET_DIR = resolve(__dirname, "../data/dataset");
|
||||
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");
|
||||
|
||||
/** Target images per disease class */
|
||||
const TARGET_PER_DISEASE = 200;
|
||||
/** Target images per disease for CORE plants */
|
||||
const TARGET_CORE = 100;
|
||||
|
||||
/** Target images for the "healthy" class (2× normal) */
|
||||
/** Target images per disease for the FULL set */
|
||||
const TARGET_FULL = 10;
|
||||
|
||||
/** Target images for the "healthy" class */
|
||||
const TARGET_HEALTHY = 400;
|
||||
|
||||
/**
 * Plant IDs treated as "core": diseases of these plants get the higher
 * per-disease image target. Assembled from three groups, in this order:
 * houseplants, garden plants, and the catch-all "general" bucket.
 */
const CORE_PLANTS = new Set([
  // Houseplants
  ...[
    "monstera",
    "pothos",
    "snake-plant",
    "peace-lily",
    "orchid",
    "succulent",
    "fiddle-leaf-fig",
    "aloe-vera",
    "cactus",
    "fern",
  ],
  // Garden plants
  ...[
    "tomato",
    "basil",
    "rose",
    "pepper",
    "strawberry",
    "cucumber",
    "squash",
    "lettuce",
    "spinach",
    "cabbage",
    "lavender",
    "mint",
    "jasmine",
    "sunflower",
    "daisy",
    "zucchini",
    "bean",
    "eggplant",
    "chili",
  ],
  // General disease patterns
  ...["general"],
]);
|
||||
|
||||
/** Delay between DuckDuckGo search API calls (ms) */
|
||||
const SEARCH_DELAY = 1500;
|
||||
|
||||
/** Delay between image downloads (ms) */
|
||||
const DOWNLOAD_DELAY = 300;
|
||||
const DOWNLOAD_DELAY = 100;
|
||||
|
||||
/** Max concurrent downloads */
|
||||
const CONCURRENT_DOWNLOADS = 5;
|
||||
const CONCURRENT_DOWNLOADS = 10;
|
||||
|
||||
/** Minimum image size in bytes to accept (reject tiny placeholders) */
|
||||
/** Minimum image size in bytes to accept */
|
||||
const MIN_IMAGE_SIZE = 10_000; // 10KB
|
||||
|
||||
/** Maximum image size in bytes */
|
||||
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB
|
||||
|
||||
/** Allowed image content types */
|
||||
const ALLOWED_CONTENT_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/webp", "image/gif"];
|
||||
|
||||
/** Allowed file extensions */
|
||||
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
|
||||
|
||||
@@ -62,22 +116,16 @@ const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
|
||||
const UA =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
||||
|
||||
/** Class ID for healthy plants */
|
||||
const HEALTHY_CLASS = "healthy";
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
interface DiseaseSeed {
|
||||
interface DbDisease {
|
||||
id: string;
|
||||
plantId: string;
|
||||
name: string;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
[key: string]: any;
|
||||
}
|
||||
|
||||
interface PlantSeed {
|
||||
id: string;
|
||||
commonName: string;
|
||||
scientificName: string;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
[key: string]: any;
|
||||
imageUrl: string | null;
|
||||
}
|
||||
|
||||
interface DuckDuckGoImageResult {
|
||||
@@ -93,10 +141,7 @@ interface ClassProgress {
|
||||
count: number;
|
||||
downloaded: number;
|
||||
failed: number;
|
||||
skipped: number;
|
||||
/** URLs we've already seen (to avoid duplicates) */
|
||||
seenUrls: string[];
|
||||
/** Whether we've exhausted search results */
|
||||
exhausted: boolean;
|
||||
}
|
||||
|
||||
@@ -105,15 +150,27 @@ interface Progress {
|
||||
classes: Record<string, ClassProgress>;
|
||||
}
|
||||
|
||||
/** Class ID for healthy plants */
|
||||
const HEALTHY_CLASS = "healthy";
|
||||
// ─── DB Loading ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reads every row of the `diseases` table, ordered by ID, keeping only the
 * columns the scraper needs: id, plantId, name and the existing imageUrl.
 *
 * @returns The selected disease rows.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const wantedColumns = {
    id: diseases.id,
    plantId: diseases.plantId,
    name: diseases.name,
    imageUrl: diseases.imageUrl,
  };

  const query = getDb().select(wantedColumns).from(diseases).orderBy(diseases.id);
  return await query;
}
|
||||
|
||||
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Extract the vqd token from DuckDuckGo's search page.
|
||||
* Required for paginating image results.
|
||||
*/
|
||||
async function getVqdToken(query: string): Promise<string> {
|
||||
const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
|
||||
|
||||
@@ -122,25 +179,15 @@ async function getVqdToken(query: string): Promise<string> {
|
||||
signal: AbortSignal.timeout(15_000),
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
throw new Error(`Failed to get vqd token: ${res.status}`);
|
||||
}
|
||||
if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);
|
||||
|
||||
const html = await res.text();
|
||||
|
||||
// Extract vqd token from the HTML
|
||||
// Format: vqd='<token>' or vqd="<token>"
|
||||
const match = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/);
|
||||
if (!match) {
|
||||
throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
|
||||
}
|
||||
if (!match) throw new Error(`Could not extract vqd token for "${query}"`);
|
||||
|
||||
return match[1];
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a page of DuckDuckGo image results.
|
||||
*/
|
||||
async function searchImagesDuckDuckGo(
|
||||
query: string,
|
||||
vqd: string,
|
||||
@@ -161,12 +208,9 @@ async function searchImagesDuckDuckGo(
|
||||
if (res.status === 429) {
|
||||
console.warn(" ⚠ Rate limited (429). Waiting 10s...");
|
||||
await sleep(10_000);
|
||||
return searchImagesDuckDuckGo(query, vqd, page); // Retry
|
||||
}
|
||||
if (res.status === 403) {
|
||||
console.warn(" ⚠ Forbidden (403). Token may have expired.");
|
||||
return []; // Token expired — no more pages
|
||||
return searchImagesDuckDuckGo(query, vqd, page);
|
||||
}
|
||||
if (res.status === 403) return [];
|
||||
throw new Error(`DuckDuckGo search failed: ${res.status}`);
|
||||
}
|
||||
|
||||
@@ -174,11 +218,7 @@ async function searchImagesDuckDuckGo(
|
||||
return data.results ?? [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Search DuckDuckGo images, automatically paginating to collect up to `target` results.
|
||||
* Returns unique image URLs.
|
||||
*/
|
||||
async function collectImages(
|
||||
async function collectImagesDuckDuckGo(
|
||||
query: string,
|
||||
target: number,
|
||||
seenUrls: Set<string>,
|
||||
@@ -188,27 +228,29 @@ async function collectImages(
|
||||
let exhausted = false;
|
||||
let consecutiveEmpty = 0;
|
||||
|
||||
// Get vqd token
|
||||
let vqd: string;
|
||||
try {
|
||||
vqd = await getVqdToken(query);
|
||||
} catch (err) {
|
||||
console.warn(` ⚠ Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
|
||||
console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
|
||||
return { urls: [], exhausted: true };
|
||||
}
|
||||
|
||||
while (results.length < target) {
|
||||
const MAX_PAGES = 5;
|
||||
let lowNoveltyCount = 0;
|
||||
|
||||
while (results.length < target && page <= MAX_PAGES) {
|
||||
await sleep(SEARCH_DELAY);
|
||||
|
||||
let pageResults: DuckDuckGoImageResult[];
|
||||
try {
|
||||
pageResults = await searchImagesDuckDuckGo(query, vqd, page);
|
||||
} catch (err) {
|
||||
console.warn(` ⚠ Search error: ${err instanceof Error ? err.message : "unknown"}`);
|
||||
console.warn(` ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
|
||||
break;
|
||||
}
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
if (!pageResults || pageResults.length === 0) {
|
||||
consecutiveEmpty++;
|
||||
if (consecutiveEmpty >= 3) {
|
||||
exhausted = true;
|
||||
@@ -223,78 +265,160 @@ async function collectImages(
|
||||
|
||||
for (const r of pageResults) {
|
||||
if (results.length >= target) break;
|
||||
|
||||
const imgUrl = r.image || r.url;
|
||||
|
||||
// Skip if we've already seen this URL
|
||||
if (!imgUrl || typeof imgUrl !== "string") continue;
|
||||
if (seenUrls.has(imgUrl)) continue;
|
||||
|
||||
// Validate URL looks like an image
|
||||
const ext = extname(new URL(imgUrl).pathname).toLowerCase();
|
||||
if (!ALLOWED_EXTENSIONS.includes(ext) && !ext) {
|
||||
// No extension - still try, could be a CDN URL
|
||||
try {
|
||||
new URL(imgUrl);
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
|
||||
seenUrls.add(imgUrl);
|
||||
results.push(imgUrl);
|
||||
newCount++;
|
||||
}
|
||||
|
||||
if (newCount === 0 && pageResults.every((r) => seenUrls.has(r.image || r.url))) {
|
||||
// All results on this page were already seen
|
||||
page++;
|
||||
continue;
|
||||
const newRatio = newCount / pageResults.length;
|
||||
if (newRatio < 0.05) {
|
||||
lowNoveltyCount++;
|
||||
if (lowNoveltyCount >= 2) break;
|
||||
} else {
|
||||
lowNoveltyCount = 0;
|
||||
}
|
||||
|
||||
if (results.length < target) {
|
||||
page++;
|
||||
}
|
||||
if (results.length < target) page++;
|
||||
}
|
||||
|
||||
return { urls: results.slice(0, target), exhausted };
|
||||
}
|
||||
|
||||
// ─── iNaturalist API ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Rewrites an iNaturalist photo URL to its "original"-size variant.
 *
 * Photo URLs end in `<size>.<ext>`; any of the size names handled here is
 * replaced, not just "medium". URLs that do not match (including ones that
 * already say "original") are returned unchanged.
 * NOTE(review): the API is understood to return thumbnail-size ("square") URLs
 * by default — confirm against a live response.
 */
function inatOriginalPhotoUrl(url: string): string {
  return url.replace(/\/(?:square|small|medium|large)(\.[A-Za-z0-9]+)(\?.*)?$/, "/original$1$2");
}

/**
 * Collects up to `target` photo URLs from iNaturalist research-grade
 * observations matching `query` (a single request, at most 200 observations).
 *
 * De-duplication is done on the final (original-size) URL, which is the form
 * stored in `seenUrls`; every returned URL is added to that set.
 *
 * @param query Free-text search string.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs collected so far; mutated with each new URL.
 * @returns The new URLs, and `exhausted: true` when the response yielded fewer
 *   than `target`. HTTP or network failures return `exhausted: false`.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];

  // Nothing requested: skip the network round-trip.
  if (target <= 0) return { urls: results, exhausted: false };

  const perPage = Math.min(target, 200);

  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;

        // Compare and store the same (rewritten) form of the URL.
        const fullUrl = inatOriginalPhotoUrl(photo.url);
        if (seenUrls.has(fullUrl)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    // Timeout / network / JSON failure: keep whatever was gathered so far.
    return { urls: results, exhausted: false };
  }
}
|
||||
|
||||
// ─── Wikimedia Commons API ──────────────────────────────────────────────────
|
||||
|
||||
/** Builds the `Special:FilePath` URL for a Commons search hit title such as "File:Foo bar.jpg". */
function commonsFilePathUrl(title: string): string {
  const filename = title.replace(/^File:/, "");
  return `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
}

/**
 * Collects up to `target` file URLs from a Wikimedia Commons search in the
 * File namespace, paging 50 hits at a time.
 *
 * Paging stops when enough URLs were gathered, when the API returns no hits
 * or no continuation offset (nothing more to fetch), or on the first HTTP or
 * network failure. Every returned URL is added to `seenUrls`.
 *
 * @param query Free-text search string.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs collected so far; mutated with each new URL.
 * @returns The new URLs, and `exhausted: true` only when the API ran out of
 *   results before `target` was reached. Failures return `exhausted: false`,
 *   matching the iNaturalist source.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  let sroffset = 0;
  let exhausted = false;

  while (results.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
      origin: "*", // server-side API call
    });

    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break; // transient failure: not the same as "no more results"

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = data.query?.search ?? [];
      if (hits.length === 0) {
        exhausted = true;
        break;
      }

      for (const hit of hits) {
        if (results.length >= target) break;
        const imgUrl = commonsFilePathUrl(hit.title);
        if (seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // No continuation offset means this was the last page; avoid a
      // follow-up request that could only come back empty.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined) {
        exhausted = results.length < target;
        break;
      }
      sroffset = nextOffset;
    } catch {
      break; // timeout / network / JSON failure
    }
  }

  return { urls: results, exhausted };
}
|
||||
|
||||
// ─── Image Download ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Download a single image from a URL to the target path.
|
||||
* Returns true if successful, false otherwise.
|
||||
*/
|
||||
async function downloadImage(url: string, destPath: string): Promise<boolean> {
|
||||
try {
|
||||
const res = await fetch(url, {
|
||||
headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
|
||||
headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
|
||||
signal: AbortSignal.timeout(15_000),
|
||||
});
|
||||
|
||||
if (!res.ok) return false;
|
||||
|
||||
const contentType = res.headers.get("content-type") || "";
|
||||
const contentLength = parseInt(res.headers.get("content-length") || "0", 10);
|
||||
|
||||
// Validate content type
|
||||
if (!ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate size
|
||||
if (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) return false;
|
||||
if (contentLength > MAX_IMAGE_SIZE) return false;
|
||||
if (contentType.includes("text/html")) return false;
|
||||
|
||||
const buffer = Buffer.from(await res.arrayBuffer());
|
||||
|
||||
// Double-check actual buffer size
|
||||
if (buffer.length < MIN_IMAGE_SIZE) return false;
|
||||
if (buffer.length > MAX_IMAGE_SIZE) return false;
|
||||
|
||||
// Determine correct extension from content type or URL
|
||||
let ext = extname(new URL(url).pathname).toLowerCase();
|
||||
if (!ALLOWED_EXTENSIONS.includes(ext)) {
|
||||
// Map from content type
|
||||
if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
|
||||
else if (contentType.includes("png")) ext = ".png";
|
||||
else if (contentType.includes("webp")) ext = ".webp";
|
||||
else ext = ".jpg"; // Default
|
||||
else ext = ".jpg";
|
||||
}
|
||||
|
||||
const filePath = destPath.replace(/\.\w+$/, ext);
|
||||
@@ -305,9 +429,6 @@ async function downloadImage(url: string, destPath: string): Promise<boolean> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Download multiple images concurrently, respecting a per-download delay.
|
||||
*/
|
||||
async function downloadBatch(
|
||||
urls: string[],
|
||||
classDir: string,
|
||||
@@ -317,7 +438,6 @@ async function downloadBatch(
|
||||
let failed = 0;
|
||||
let index = startIndex;
|
||||
|
||||
// Process in chunks to control concurrency
|
||||
for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
|
||||
const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
|
||||
|
||||
@@ -325,16 +445,23 @@ async function downloadBatch(
|
||||
chunk.map(async (url) => {
|
||||
const paddedIndex = String(index).padStart(4, "0");
|
||||
const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
|
||||
|
||||
const success = await downloadImage(url, destPath);
|
||||
await sleep(DOWNLOAD_DELAY);
|
||||
return { success, index: index++ };
|
||||
return { success, index: index++, url: url.substring(0, 50) };
|
||||
}),
|
||||
);
|
||||
|
||||
for (const r of results) {
|
||||
if (r.success) downloaded++;
|
||||
else failed++;
|
||||
else {
|
||||
failed++;
|
||||
if (failed % 20 === 1) console.log(` ⚠ Failed: ${r.url}...`);
|
||||
}
|
||||
}
|
||||
|
||||
const total = downloaded + failed;
|
||||
if (total % 30 === 0 || total === urls.length) {
|
||||
console.log(` Progress: ${downloaded}/${urls.length} (${failed} failed)`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -361,7 +488,6 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
|
||||
count: 0,
|
||||
downloaded: 0,
|
||||
failed: 0,
|
||||
skipped: 0,
|
||||
seenUrls: [],
|
||||
exhausted: false,
|
||||
};
|
||||
@@ -369,26 +495,22 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
|
||||
return progress.classes[classId];
|
||||
}
|
||||
|
||||
// ─── Search Query Building ──────────────────────────────────────────────────
|
||||
// ─── Query Building ─────────────────────────────────────────────────────────
|
||||
|
||||
function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
|
||||
const name = disease.name;
|
||||
const plantName = plant?.commonName || disease.plantId;
|
||||
|
||||
return [
|
||||
`${name} ${plantName} leaf disease`,
|
||||
`${plantName} ${name} symptoms`,
|
||||
`${name} plant disease`,
|
||||
`${plantName} diseased leaf`,
|
||||
];
|
||||
function buildSearchQueries(disease: DbDisease): string[] {
|
||||
const name = disease.name || disease.id.replace(/-/g, " ");
|
||||
const plant = disease.plantId.replace(/-/g, " ");
|
||||
// Every query keeps the disease NAME to avoid noisy labels
|
||||
return [`${name} ${plant} leaf disease`, `${plant} ${name} symptoms`, `${name} ${plant}`];
|
||||
}
|
||||
|
||||
function buildHealthyQueries(plant: PlantSeed): string[] {
|
||||
function buildHealthyQueries(plant: string): string[] {
|
||||
const name = plant.replace(/-/g, " ");
|
||||
return [
|
||||
`healthy ${plant.commonName} leaf`,
|
||||
`${plant.commonName} leaf closeup`,
|
||||
`healthy ${plant.commonName} plant`,
|
||||
`${plant.commonName} foliage`,
|
||||
`healthy ${name} leaf`,
|
||||
`${name} leaf closeup`,
|
||||
`healthy ${name} plant`,
|
||||
`${name} foliage`,
|
||||
];
|
||||
}
|
||||
|
||||
@@ -400,64 +522,97 @@ async function collectClassImages(
|
||||
target: number,
|
||||
progress: Progress,
|
||||
classDir: string,
|
||||
existingUrls: string[] = [],
|
||||
fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
|
||||
): Promise<void> {
|
||||
const cp = getClassProgress(progress, classId);
|
||||
const seenUrls = new Set(cp.seenUrls);
|
||||
|
||||
if (cp.count >= target) {
|
||||
console.log(` ✓ Already have ${cp.count}/${target} images`);
|
||||
console.log(` ✓ Already have ${cp.count}/${target}`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (cp.exhausted) {
|
||||
console.log(` ✓ Already exhausted search results (${cp.count}/${target} images)`);
|
||||
console.log(` ✓ Exhausted (${cp.count}/${target})`);
|
||||
return;
|
||||
}
|
||||
|
||||
mkdirSync(classDir, { recursive: true });
|
||||
|
||||
const totalUrls: string[] = [];
|
||||
const allUrls: string[] = [];
|
||||
let exhausted = false;
|
||||
|
||||
// Search with each query until we hit the target
|
||||
for (const query of queries) {
|
||||
if (totalUrls.length >= target) break;
|
||||
|
||||
console.log(` Searching: "${query}"...`);
|
||||
const result = await collectImages(query, target - totalUrls.length, seenUrls);
|
||||
|
||||
totalUrls.push(...result.urls);
|
||||
cp.seenUrls = Array.from(seenUrls);
|
||||
|
||||
if (result.exhausted) {
|
||||
exhausted = true;
|
||||
// ── Source 0: Existing DB URLs ──────────────────────────────────────────
|
||||
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
|
||||
if (freshDbUrls.length > 0) {
|
||||
console.log(` DB: ${freshDbUrls.length} existing URLs`);
|
||||
for (const url of freshDbUrls) {
|
||||
if (allUrls.length >= target) break;
|
||||
seenUrls.add(url);
|
||||
allUrls.push(url);
|
||||
}
|
||||
|
||||
if (totalUrls.length >= target) break;
|
||||
}
|
||||
|
||||
if (totalUrls.length === 0) {
|
||||
// ── Source 1: DuckDuckGo ──────────────────────────────────────────────
|
||||
// Skip DDG in fast mode (full set — DDG is slowest source)
|
||||
if (!fastMode && allUrls.length < target) {
|
||||
for (const query of queries) {
|
||||
if (allUrls.length >= target) break;
|
||||
process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
|
||||
const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
|
||||
allUrls.push(...result.urls);
|
||||
if (result.exhausted) exhausted = true;
|
||||
console.log(`${result.urls.length} new`);
|
||||
if (allUrls.length >= target) break;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Source 2: iNaturalist ──────────────────────────────────────────────
|
||||
if (allUrls.length < target) {
|
||||
const primaryQuery = queries[0];
|
||||
console.log(` iNat: Searching...`);
|
||||
const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
|
||||
allUrls.push(...result.urls);
|
||||
if (result.exhausted) exhausted = true;
|
||||
console.log(` iNat: ${result.urls.length} images`);
|
||||
}
|
||||
|
||||
// ── Source 3: Wikimedia Commons ────────────────────────────────────────
|
||||
if (allUrls.length < target) {
|
||||
const primaryQuery = queries[0];
|
||||
console.log(` Commons: Searching...`);
|
||||
const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
|
||||
allUrls.push(...result.urls);
|
||||
if (result.exhausted) exhausted = true;
|
||||
console.log(` Commons: ${result.urls.length} images`);
|
||||
}
|
||||
|
||||
if (allUrls.length === 0) {
|
||||
cp.exhausted = exhausted;
|
||||
saveProgress(progress);
|
||||
console.log(` ✗ No images found for "${classId}"`);
|
||||
console.log(` ✗ No images found`);
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(` Found ${totalUrls.length} unique image URLs. Downloading...`);
|
||||
// Save progress with seen URLs BEFORE downloading
|
||||
cp.seenUrls = Array.from(seenUrls);
|
||||
cp.exhausted = exhausted;
|
||||
saveProgress(progress);
|
||||
|
||||
// Download the images
|
||||
const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);
|
||||
console.log(` Downloading ${allUrls.length} images...`);
|
||||
|
||||
const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);
|
||||
|
||||
cp.count += downloaded;
|
||||
cp.downloaded += downloaded;
|
||||
cp.failed += failed;
|
||||
cp.exhausted = exhausted;
|
||||
|
||||
saveProgress(progress);
|
||||
|
||||
const pct = Math.round((cp.count / target) * 100);
|
||||
console.log(
|
||||
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
|
||||
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -465,25 +620,18 @@ async function collectClassImages(
|
||||
|
||||
async function main() {
|
||||
console.log("=".repeat(60));
|
||||
console.log("PLANT DISEASE DATASET COLLECTOR");
|
||||
console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
|
||||
console.log("=".repeat(60));
|
||||
|
||||
// Load knowledge base
|
||||
const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
|
||||
const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];
|
||||
// Load diseases from DB
|
||||
console.log("\nLoading diseases from database...");
|
||||
const dbDiseases = await loadDiseasesFromDb();
|
||||
console.log(` ${dbDiseases.length} diseases loaded`);
|
||||
|
||||
const plantMap = new Map<string, PlantSeed>();
|
||||
for (const p of plants) {
|
||||
plantMap.set(p.id, p);
|
||||
}
|
||||
|
||||
console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
|
||||
console.log(
|
||||
`Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
|
||||
);
|
||||
console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
|
||||
console.log(`Output: ${DATASET_DIR}/`);
|
||||
console.log("");
|
||||
const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
|
||||
const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
|
||||
console.log(` Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
|
||||
console.log(` Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);
|
||||
|
||||
// Load progress
|
||||
mkdirSync(DATASET_DIR, { recursive: true });
|
||||
@@ -491,28 +639,46 @@ async function main() {
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// ── Phase 1: Disease classes ──────────────────────────────────────────────
|
||||
|
||||
console.log("─".repeat(60));
|
||||
console.log("PHASE 1: Disease Images");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
for (let i = 0; i < diseases.length; i++) {
|
||||
const disease = diseases[i];
|
||||
const plant = plantMap.get(disease.plantId) ?? null;
|
||||
const classDir = resolve(DATASET_DIR, disease.id);
|
||||
const queries = buildSearchQueries(disease, plant);
|
||||
|
||||
const pct = Math.round((i / diseases.length) * 100);
|
||||
console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);
|
||||
|
||||
await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
|
||||
}
|
||||
|
||||
// ── Phase 2: Healthy class ────────────────────────────────────────────────
|
||||
// ── Phase 1: Core set ──────────────────────────────────────────────────
|
||||
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log("PHASE 2: Healthy Plant Images");
|
||||
console.log("PHASE 1: Core Diseases (100 images each)");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
for (let i = 0; i < coreDiseases.length; i++) {
|
||||
const d = coreDiseases[i];
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
|
||||
|
||||
const pct = Math.round((i / coreDiseases.length) * 100);
|
||||
console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
|
||||
|
||||
await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
|
||||
}
|
||||
|
||||
// ── Phase 2: Full set ──────────────────────────────────────────────────
|
||||
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log("PHASE 2: Full Disease Set (10 images each)");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
for (let i = 0; i < fullDiseases.length; i++) {
|
||||
const d = fullDiseases[i];
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
|
||||
|
||||
const pct = Math.round((i / fullDiseases.length) * 100);
|
||||
console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
|
||||
|
||||
await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
|
||||
}
|
||||
|
||||
// ── Phase 3: Healthy class ──────────────────────────────────────────────
|
||||
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log("PHASE 3: Healthy Plant Images");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
|
||||
@@ -520,39 +686,50 @@ async function main() {
|
||||
const healthySeen = new Set(healthyCp.seenUrls);
|
||||
|
||||
if (healthyCp.count >= TARGET_HEALTHY) {
|
||||
console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY} healthy images`);
|
||||
console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
|
||||
} else {
|
||||
// Build a pool of healthy plant queries
|
||||
// Collect all unique plants
|
||||
const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
|
||||
const allHealthyQueries: string[] = [];
|
||||
for (const plant of plants) {
|
||||
for (const plant of allPlants) {
|
||||
allHealthyQueries.push(...buildHealthyQueries(plant));
|
||||
}
|
||||
|
||||
const healthySources = [
|
||||
{ name: "DDG", collector: collectImagesDuckDuckGo },
|
||||
{ name: "iNat", collector: searchImagesInaturalist },
|
||||
{ name: "Commons", collector: searchImagesCommons },
|
||||
] as const;
|
||||
|
||||
const totalHealthyUrls: string[] = [];
|
||||
let healthyExhausted = false;
|
||||
let anyRemaining = false;
|
||||
|
||||
for (const query of allHealthyQueries) {
|
||||
for (const source of healthySources) {
|
||||
if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
|
||||
if (healthyExhausted) break;
|
||||
console.log(`\n Source: ${source.name}`);
|
||||
|
||||
console.log(`\n Searching: "${query}"...`);
|
||||
const result = await collectImages(
|
||||
query,
|
||||
TARGET_HEALTHY - totalHealthyUrls.length,
|
||||
healthySeen,
|
||||
);
|
||||
for (const query of allHealthyQueries.slice(0, 20)) {
|
||||
if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
|
||||
|
||||
totalHealthyUrls.push(...result.urls);
|
||||
|
||||
if (result.exhausted) {
|
||||
healthyExhausted = true;
|
||||
process.stdout.write(` "${query}"... `);
|
||||
const result = await source.collector(
|
||||
query,
|
||||
TARGET_HEALTHY - totalHealthyUrls.length,
|
||||
healthySeen,
|
||||
);
|
||||
totalHealthyUrls.push(...result.urls);
|
||||
if (!result.exhausted) anyRemaining = true;
|
||||
console.log(`${result.urls.length} new`);
|
||||
}
|
||||
}
|
||||
|
||||
healthyCp.seenUrls = Array.from(healthySeen);
|
||||
|
||||
if (totalHealthyUrls.length > 0) {
|
||||
console.log(`\n Found ${totalHealthyUrls.length} healthy image URLs. Downloading...`);
|
||||
healthyCp.exhausted = !anyRemaining;
|
||||
saveProgress(progress);
|
||||
|
||||
console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
|
||||
const { downloaded, failed } = await downloadBatch(
|
||||
totalHealthyUrls,
|
||||
healthyDir,
|
||||
@@ -562,14 +739,12 @@ async function main() {
|
||||
healthyCp.count += downloaded;
|
||||
healthyCp.downloaded += downloaded;
|
||||
healthyCp.failed += failed;
|
||||
healthyCp.exhausted = healthyExhausted;
|
||||
|
||||
const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
|
||||
console.log(
|
||||
` Got ${downloaded} images (${failed} failed). Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
|
||||
` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
|
||||
);
|
||||
} else {
|
||||
healthyCp.exhausted = true;
|
||||
console.log(` ✗ No healthy images found`);
|
||||
}
|
||||
|
||||
@@ -580,76 +755,27 @@ async function main() {
|
||||
|
||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
||||
const mins = Math.floor(elapsed / 60);
|
||||
const secs = elapsed % 60;
|
||||
const hrs = Math.floor(mins / 60);
|
||||
|
||||
let totalDownloaded = 0;
|
||||
let totalFailed = 0;
|
||||
let totalTarget = 0;
|
||||
|
||||
for (const [classId, cp] of Object.entries(progress.classes)) {
|
||||
for (const [, cp] of Object.entries(progress.classes)) {
|
||||
totalDownloaded += cp.downloaded || 0;
|
||||
totalFailed += cp.failed || 0;
|
||||
totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
|
||||
}
|
||||
|
||||
const totalSize = await getDatasetSize();
|
||||
const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);
|
||||
|
||||
console.log("\n" + "=".repeat(60));
|
||||
console.log("COMPLETE");
|
||||
console.log("=".repeat(60));
|
||||
console.log(` Time: ${mins}m ${secs}s`);
|
||||
console.log(` Time: ${hrs}h ${mins % 60}m`);
|
||||
console.log(` Downloaded: ${totalDownloaded} images`);
|
||||
console.log(` Failed: ${totalFailed} images`);
|
||||
console.log(` Target: ${totalTarget} images`);
|
||||
console.log(` Dataset size: ${sizeGb} GB`);
|
||||
console.log(` Dataset location: ${DATASET_DIR}/`);
|
||||
console.log("");
|
||||
console.log("Next steps:");
|
||||
console.log(" 1. Run the fine-tuning script to train on this dataset");
|
||||
console.log(" 2. The fine-tuning script will resize to 160×160 and augment");
|
||||
console.log(` Dataset: ${DATASET_DIR}/`);
|
||||
|
||||
await closeDb();
|
||||
console.log("=".repeat(60));
|
||||
}
|
||||
|
||||
/**
 * Total size, in bytes, of the dataset directory.
 *
 * Only sub-directories sitting directly under DATASET_DIR whose names do not
 * start with "." are measured (each via dirSize); top-level files and
 * dot-prefixed entries are ignored. Resolves to 0 when DATASET_DIR does not
 * exist.
 */
async function getDatasetSize(): Promise<number> {
  if (!existsSync(DATASET_DIR)) return 0;

  return readdirSync(DATASET_DIR, { withFileTypes: true })
    .filter((item) => !item.name.startsWith(".") && item.isDirectory())
    .reduce((bytes, item) => bytes + dirSize(resolve(DATASET_DIR, item.name)), 0);
}
|
||||
|
||||
/**
 * Recursively sum the sizes, in bytes, of all regular files under `dirPath`.
 *
 * Best-effort by design: a directory that cannot be listed contributes 0,
 * and a file whose `stat` fails (e.g. it was removed mid-scan) is skipped
 * without abandoning the remaining entries of the same directory.
 *
 * @param dirPath Directory to measure.
 * @returns Accumulated size of every file that could be read.
 */
function dirSize(dirPath: string): number {
  let total = 0;
  try {
    for (const entry of readdirSync(dirPath, { withFileTypes: true })) {
      const fullPath = join(dirPath, entry.name);
      if (entry.isDirectory()) {
        // The recursive call handles its own failures and never throws.
        total += dirSize(fullPath);
      } else if (entry.isFile()) {
        try {
          total += statSync(fullPath).size;
        } catch {
          // File vanished or is unreadable; skip just this entry.
        }
      }
    }
  } catch {
    // Directory missing or unreadable; report what was counted so far.
  }
  return total;
}
|
||||
|
||||
/**
 * Pause for roughly `ms` milliseconds.
 *
 * @param ms Delay handed to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
Reference in New Issue
Block a user