This commit is contained in:
2026-06-06 17:02:45 -04:00
parent 47609e5e42
commit db4c656730
22 changed files with 6195 additions and 326 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
# 🚩 Flagged Content Review — Nothing to Review
Generated: 2026-06-06T21:02:03.301Z
**No content has been flagged for review yet.**
Flagged items will appear here once users flag content for manual review.
---
_Report generated with min-flags=1_

View File

@@ -0,0 +1,53 @@
/**
* apply-flag-migration.ts
*
* Applies the flagged_content table migration to Turso.
* Run with: npx tsx scripts/apply-flag-migration.ts
*/
import dotenv from "dotenv";
import path from "node:path";
// Production credentials are loaded only when NODE_ENV asks for them explicitly;
// every other value falls back to the development env file.
const isProductionRun = process.env.NODE_ENV === "production";
const envFile = isProductionRun ? "../.env.production" : "../.env.development";
// Resolve against this script's directory so the working directory is irrelevant.
const envFilePath = path.resolve(__dirname, envFile);
dotenv.config({ path: envFilePath });
import { createClient } from "@libsql/client";
/**
 * Creates the `flagged_content` table and its two lookup indexes.
 *
 * Every statement uses `IF NOT EXISTS`, so running the script again against a
 * database that already has the table is harmless.
 *
 * @throws Error when `DATABASE_URL` is not set, or when a statement fails.
 */
async function main(): Promise<void> {
  const url = process.env.DATABASE_URL;
  if (!url) {
    // Fail with a readable message instead of handing `undefined` to the client.
    throw new Error("DATABASE_URL is not set; check the env file loaded for this NODE_ENV.");
  }

  const db = createClient({
    url,
    // Passed through as-is; may be undefined if the target database needs no token.
    authToken: process.env.DATABASE_TOKEN,
  });

  try {
    console.log("Applying migration: create flagged_content table...");
    await db.execute(`
      CREATE TABLE IF NOT EXISTS flagged_content (
        id text PRIMARY KEY NOT NULL,
        content_type text NOT NULL,
        content_id text NOT NULL,
        field_name text NOT NULL,
        notes text DEFAULT '',
        flag_count integer DEFAULT 1 NOT NULL,
        created_at text DEFAULT (datetime('now')) NOT NULL,
        updated_at text DEFAULT (datetime('now')) NOT NULL
      )
    `);
    await db.execute(`
      CREATE INDEX IF NOT EXISTS idx_flagged_content_type ON flagged_content (content_type)
    `);
    await db.execute(`
      CREATE INDEX IF NOT EXISTS idx_flagged_content_id ON flagged_content (content_id)
    `);
    console.log("Migration applied successfully.");
  } finally {
    // Release the connection even when one of the statements throws.
    db.close();
  }
}

main().catch((err: unknown) => {
  console.error("Migration failed:", err);
  process.exit(1);
});

View File

@@ -0,0 +1,19 @@
import { createClient } from "@libsql/client";
// Prints how many rows of `diseases` have an image: one overall line, then one
// line per severity value.
const databaseUrl = process.env.DATABASE_URL;
if (!databaseUrl) {
  // Stop early with a clear message rather than passing `undefined` to the client.
  throw new Error("DATABASE_URL is not set");
}
const c = createClient({
  url: databaseUrl,
  authToken: process.env.DATABASE_TOKEN,
});
try {
  const r = await c.execute("SELECT COUNT(*) as cnt FROM diseases");
  const r2 = await c.execute(
    `SELECT SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has, SUM(CASE WHEN image_url IS NULL OR image_url = '' THEN 1 ELSE 0 END) as miss FROM diseases`,
  );
  const r3 = await c.execute(
    `SELECT severity, COUNT(*) as total, SUM(CASE WHEN image_url IS NOT NULL AND image_url != '' THEN 1 ELSE 0 END) as has FROM diseases GROUP BY severity ORDER BY severity`,
  );

  // SUM() over an empty table yields NULL; report 0 instead of "null".
  const total = r.rows[0]?.cnt ?? 0;
  const withImages = r2.rows[0]?.has ?? 0;
  const missing = r2.rows[0]?.miss ?? 0;
  console.log(`Total: ${total} | With images: ${withImages} | Missing: ${missing}`);

  for (const row of r3.rows) {
    // severity may be NULL or a non-text value, so stringify before padding.
    const severity = String(row.severity ?? "(none)").padEnd(10);
    console.log(` ${severity}: ${row.has ?? 0}/${row.total}`);
  }
} finally {
  // Close the client even if one of the queries fails.
  c.close();
}

View File

@@ -0,0 +1,379 @@
/**
* generate-flagged-report.ts
*
* Reads all flagged content from the database and generates a pretty
* markdown report organized by content type. The report includes:
* - Summary table with counts per content type
* - Plant images flagged for review
* - Disease images flagged for review
* - Disease symptoms flagged for review
* - Disease causes flagged for review
* - Disease treatment steps flagged for review
* - Disease prevention tips flagged for review
*
* Usage:
* npx tsx scripts/generate-flagged-report.ts [--min-flags N] [--output path/to/report.md]
*
* Options:
* --min-flags Minimum flag count to include (default: 1)
* --output Output path (default: scripts/.flagged-content-review-needed.md)
*/
import dotenv from "dotenv";
import path from "node:path";
// DB credentials come from .env.production only when NODE_ENV is exactly
// "production"; any other value selects .env.development.
const useProductionEnv = process.env.NODE_ENV === "production";
const envFile = useProductionEnv ? "../.env.production" : "../.env.development";
// The path is resolved against this script's directory, not the working directory.
const resolvedEnvFile = path.resolve(__dirname, envFile);
dotenv.config({ path: resolvedEnvFile });
import { createClient } from "@libsql/client";
import fs from "node:fs";
// ─── Config ─────────────────────────────────────────────────────────────────
/**
 * Extracts the `--min-flags` threshold from a process argument list.
 *
 * Accepts both `--min-flags=N` and `--min-flags N` (the form shown in the
 * usage text). Falls back to 1 when the option is absent or is not a
 * non-negative integer, so a typo can never put `NaN` into the SQL arguments.
 *
 * @param argv Argument list, normally `process.argv`.
 * @returns The requested minimum flag count, or 1 by default.
 */
function parseMinFlags(argv: readonly string[]): number {
  const defaultMinFlags = 1;
  const inlinePrefix = "--min-flags=";

  let raw: string | undefined;
  const inline = argv.find((a) => a.startsWith(inlinePrefix));
  if (inline !== undefined) {
    raw = inline.slice(inlinePrefix.length);
  } else {
    const at = argv.indexOf("--min-flags");
    if (at !== -1) raw = argv[at + 1];
  }

  // Digits only: rejects missing values, negatives, and things like "3abc".
  if (raw === undefined || !/^\d+$/.test(raw.trim())) return defaultMinFlags;
  return Number.parseInt(raw, 10);
}

const MIN_FLAGS = parseMinFlags(process.argv);
/**
 * Extracts the `--output` path from a process argument list.
 *
 * Accepts both `--output=path` and `--output path` (the form shown in the
 * usage text).
 *
 * @param argv Argument list, normally `process.argv`.
 * @returns The requested path, or `undefined` when the option is absent or empty.
 */
function parseOutputPath(argv: readonly string[]): string | undefined {
  const inlinePrefix = "--output=";
  const inline = argv.find((a) => a.startsWith(inlinePrefix));
  if (inline !== undefined) {
    // slice() rather than split("=") so a path containing "=" is kept whole.
    return inline.slice(inlinePrefix.length) || undefined;
  }
  const at = argv.indexOf("--output");
  const next = at === -1 ? undefined : argv[at + 1];
  // A following "--flag" is another option, not this option's value.
  return next !== undefined && !next.startsWith("--") ? next : undefined;
}

const OUTPUT_PATH =
  parseOutputPath(process.argv) ?? path.join(__dirname, ".flagged-content-review-needed.md");
// ─── DB Connection ──────────────────────────────────────────────────────────
// Single client shared by every query in this script. Credentials are read
// from the environment populated by dotenv above.
const { DATABASE_URL: dbUrl, DATABASE_TOKEN: dbToken } = process.env;
const db = createClient({ url: dbUrl!, authToken: dbToken! });
// ─── Types ──────────────────────────────────────────────────────────────────
/**
 * One row of the `flagged_content` table as returned by `SELECT *`.
 */
interface FlaggedRow {
  id: string;
  /** Kind of content that was flagged, e.g. `plant_image` or `disease_symptoms`. */
  content_type: string;
  /** Id of the flagged plant or disease, depending on `content_type`. */
  content_id: string;
  field_name: string;
  /** Free-text reviewer hint. The column has a default of '' but no NOT NULL constraint. */
  notes: string | null;
  flag_count: number;
  /** Text timestamp; the table default is SQLite `datetime('now')`. */
  created_at: string;
  /** Text timestamp; the table default is SQLite `datetime('now')`. */
  updated_at: string;
}
/**
 * Columns of a `plants` row that the report selects for labels and image
 * previews.
 */
interface PlantRow {
id: string;
// Shown as the primary label of a flagged plant and as the host name of a disease.
common_name: string;
scientific_name: string;
family: string;
// NOTE(review): typed as always present, but the report only uses it after a
// truthiness check — confirm whether the column can be NULL or empty.
image_url: string;
}
/**
 * Columns of a `diseases` row that the report selects for labels and image
 * previews.
 */
interface DiseaseRow {
  id: string;
  name: string;
  scientific_name: string;
  /** Id of the host plant; used to look up the plant's common name. */
  plant_id: string;
  /** May be NULL or empty: not every disease has an image yet. */
  image_url: string | null;
}
// ─── Helpers ────────────────────────────────────────────────────────────────
/** Heading emoji, section title and intro sentence for one content type. */
interface ContentTypeLabel {
  emoji: string;
  title: string;
  description: string;
}

/** Small constructor so each table entry below reads as a single row. */
const contentTypeLabel = (emoji: string, title: string, description: string): ContentTypeLabel => ({
  emoji,
  title,
  description,
});

/**
 * Presentation metadata for every known `content_type`, keyed by the value
 * stored in the `flagged_content` table.
 */
const CONTENT_TYPE_LABELS: Record<string, { emoji: string; title: string; description: string }> = {
  plant_image: contentTypeLabel(
    "🪴",
    "Plant Images Flagged for Review",
    "Plant images that users have flagged as potentially incorrect or low quality.",
  ),
  disease_image: contentTypeLabel(
    "📸",
    "Disease Images Flagged for Review",
    "Disease symptom images that users have flagged as potentially incorrect or misleading.",
  ),
  disease_symptoms: contentTypeLabel(
    "⚠️",
    "Disease Symptoms Flagged for Review",
    "Symptom descriptions that users have flagged as potentially inaccurate.",
  ),
  disease_causes: contentTypeLabel(
    "🔍",
    "Disease Causes Flagged for Review",
    "Causes and contributing factors that users have flagged as potentially incorrect.",
  ),
  disease_treatment: contentTypeLabel(
    "💊",
    "Disease Treatment Steps Flagged for Review",
    "Treatment instructions that users have flagged as potentially incorrect or harmful.",
  ),
  disease_prevention: contentTypeLabel(
    "🛡️",
    "Disease Prevention Tips Flagged for Review",
    "Prevention tips that users have flagged as potentially incorrect or misleading.",
  ),
};
/**
 * Formats a stored timestamp for display in the report (en-US, local time zone).
 *
 * The `flagged_content` defaults use SQLite `datetime('now')`, which produces
 * "YYYY-MM-DD HH:MM:SS" in UTC without a zone marker. JavaScript would read
 * that shape as local time, shifting every timestamp by the machine's UTC
 * offset, so it is converted to an explicit UTC ISO string first.
 *
 * @param iso Timestamp text: SQLite datetime output or any string `Date` can parse.
 * @returns The formatted date, or the input unchanged when it cannot be parsed.
 */
function formatDate(iso: string): string {
  const sqliteUtc = /^(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}(?:\.\d+)?)$/.exec(iso.trim());
  const d = new Date(sqliteUtc ? `${sqliteUtc[1]}T${sqliteUtc[2]}Z` : iso);

  // Show the raw value rather than the literal text "Invalid Date".
  if (Number.isNaN(d.getTime())) return iso;

  return d.toLocaleDateString("en-US", {
    year: "numeric",
    month: "short",
    day: "numeric",
    hour: "2-digit",
    minute: "2-digit",
  });
}
// ─── Main ───────────────────────────────────────────────────────────────────
/**
 * Loads the given plants and stores them in `into`, keyed by id.
 *
 * No query is issued for an empty id list, because `IN ()` is not valid SQL.
 */
async function loadPlantsInto(into: Map<string, PlantRow>, ids: readonly string[]): Promise<void> {
  if (ids.length === 0) return;
  const placeholders = ids.map(() => "?").join(",");
  const plantRs = await db.execute({
    sql: `SELECT id, common_name, scientific_name, family, image_url FROM plants WHERE id IN (${placeholders})`,
    args: [...ids],
  });
  for (const row of plantRs.rows as unknown as PlantRow[]) {
    into.set(row.id, row);
  }
}

/** Builds the placeholder report written when no row meets the flag threshold. */
function buildEmptyReport(): string {
  return [
    "# 🚩 Flagged Content Review — Nothing to Review",
    "",
    `Generated: ${new Date().toISOString()}`,
    "",
    "**No content has been flagged for review yet.**",
    "",
    "Flagged items will appear here once users flag content for manual review.",
    "",
    "---",
    "",
    `_Report generated with min-flags=${MIN_FLAGS}_`,
    "",
  ].join("\n");
}

/**
 * Queries `flagged_content`, resolves the referenced plants and diseases, and
 * writes a markdown review report to `OUTPUT_PATH`.
 *
 * The report has a summary table, one section per content type (known types
 * first, then any type without an entry in `CONTENT_TYPE_LABELS`), and a
 * footer explaining how to clear flags. The database client is closed on
 * every exit path, including failures.
 */
async function main(): Promise<void> {
  console.log(`📋 Generating flagged content report (min flags: ${MIN_FLAGS})...`);

  try {
    // Fetch flagged content
    const flaggedRs = await db.execute({
      sql: "SELECT * FROM flagged_content WHERE flag_count >= ? ORDER BY content_type, flag_count DESC, updated_at DESC",
      args: [MIN_FLAGS],
    });
    const flaggedRows = flaggedRs.rows as unknown as FlaggedRow[];

    if (flaggedRows.length === 0) {
      fs.writeFileSync(OUTPUT_PATH, buildEmptyReport(), "utf-8");
      console.log(`✅ Report written to ${OUTPUT_PATH} (no flagged items)`);
      return;
    }

    // Collect all unique plant and disease IDs. Everything that is not a
    // plant image is treated as referring to a disease.
    const plantIds = new Set<string>();
    const diseaseIds = new Set<string>();
    for (const row of flaggedRows) {
      (row.content_type === "plant_image" ? plantIds : diseaseIds).add(row.content_id);
    }

    // Fetch plant names
    const plantMap = new Map<string, PlantRow>();
    await loadPlantsInto(plantMap, [...plantIds]);

    // Fetch disease names + their plant references
    const diseaseMap = new Map<string, DiseaseRow>();
    if (diseaseIds.size > 0) {
      const ids = [...diseaseIds];
      const diseaseRs = await db.execute({
        sql: `SELECT id, name, scientific_name, plant_id, image_url FROM diseases WHERE id IN (${ids.map(() => "?").join(",")})`,
        args: ids,
      });
      const hostPlantIds = new Set<string>();
      for (const row of diseaseRs.rows as unknown as DiseaseRow[]) {
        diseaseMap.set(row.id, row);
        // Skip missing host ids; only plants not loaded yet need a second query.
        if (row.plant_id && !plantMap.has(row.plant_id)) {
          hostPlantIds.add(row.plant_id);
        }
      }
      await loadPlantsInto(plantMap, [...hostPlantIds]);
    }

    // Group by content type
    const groups = new Map<string, FlaggedRow[]>();
    for (const row of flaggedRows) {
      const bucket = groups.get(row.content_type);
      if (bucket) bucket.push(row);
      else groups.set(row.content_type, [row]);
    }

    // Known types keep their fixed order. Types without a label entry are
    // appended so they are not counted in the totals yet missing from the body.
    const knownTypes = [
      "plant_image",
      "disease_image",
      "disease_symptoms",
      "disease_causes",
      "disease_treatment",
      "disease_prevention",
    ];
    const extraTypes = [...groups.keys()].filter((t) => !knownTypes.includes(t));
    const orderedTypes = [...knownTypes, ...extraTypes];

    // ─── Build Report ──────────────────────────────────────────────────────
    const lines: string[] = [];
    const totalFlags = flaggedRows.reduce((sum, r) => sum + r.flag_count, 0);

    lines.push("# 🚩 Flagged Content — Manual Review Needed");
    lines.push("");
    lines.push(`Generated: ${new Date().toISOString()}`);
    lines.push("");
    lines.push(
      flaggedRows.length === 1
        ? `**${flaggedRows.length} item** flagged for review (${totalFlags} total flags).`
        : `**${flaggedRows.length} items** flagged for review (${totalFlags} total flags).`,
    );
    lines.push("");
    lines.push("Most data in this knowledge base is not reviewed by humans. ");
    lines.push("Items listed below have been flagged by users for manual review. ");
    lines.push("Please review each item and take appropriate action.");
    lines.push("");

    // Summary table
    lines.push("## 📊 Summary");
    lines.push("");
    lines.push("| Content Type | Count | Total Flags |");
    lines.push("|---|---|---|");
    for (const type of orderedTypes) {
      const items = groups.get(type);
      if (!items) continue;
      const label = CONTENT_TYPE_LABELS[type]?.title ?? type;
      const sumFlags = items.reduce((s, r) => s + r.flag_count, 0);
      lines.push(`| ${label} | ${items.length} | ${sumFlags} |`);
    }
    lines.push(`| **Total** | **${flaggedRows.length}** | **${totalFlags}** |`);
    lines.push("");
    lines.push("---");
    lines.push("");

    // Detail sections per content type
    for (const type of orderedTypes) {
      const items = groups.get(type);
      if (!items) continue;
      const config = CONTENT_TYPE_LABELS[type];

      lines.push(`## ${config?.emoji ?? "📋"} ${config?.title ?? type}`);
      lines.push("");
      lines.push(config?.description ?? "");
      lines.push("");
      lines.push(`**${items.length} item${items.length === 1 ? "" : "s"} flagged**`);
      lines.push("");

      for (const item of items) {
        // Fall back to the raw content id when the referenced row no longer exists.
        let label = item.content_id;
        let plantLabel = "";
        let image: { alt: string; url: string } | undefined;

        if (type === "plant_image") {
          const plant = plantMap.get(item.content_id);
          if (plant) {
            label = `${plant.common_name} (_${plant.scientific_name}_)`;
            plantLabel = `${plant.family} family`;
            if (plant.image_url) image = { alt: plant.common_name, url: plant.image_url };
          }
        } else {
          const disease = diseaseMap.get(item.content_id);
          if (disease) {
            const plant = plantMap.get(disease.plant_id);
            const plantName = plant?.common_name ?? disease.plant_id;
            label = `${disease.name} (_${disease.scientific_name}_) on **${plantName}**`;
            plantLabel = `Affects: ${plantName}`;
            // Only image flags embed the picture; text flags show metadata only.
            if (type === "disease_image" && disease.image_url) {
              image = { alt: disease.name, url: disease.image_url };
            }
          }
        }

        const flagWord = item.flag_count === 1 ? "flag" : "flags";

        lines.push(`### ${label}`);
        lines.push("");
        // The footer tells reviewers to delete by id, so the id must be visible.
        lines.push(`- **ID:** \`${item.id}\``);
        lines.push(`- **Field:** \`${item.field_name}\``);
        lines.push(`- **Flags:** ${item.flag_count} ${flagWord}`);
        lines.push(`- **First flagged:** ${formatDate(item.created_at)}`);
        lines.push(`- **Last flagged:** ${formatDate(item.updated_at)}`);
        if (plantLabel) {
          lines.push(`- **${plantLabel}**`);
        }
        // Collapse line breaks so a multi-line note stays inside its list item.
        const notes = (item.notes ?? "").replace(/\s*\r?\n\s*/g, " ").trim();
        if (notes) {
          lines.push(`- **User notes:** ${notes}`);
        }
        if (image) {
          lines.push("");
          lines.push(` ![${image.alt}](${image.url})`);
        }
        lines.push("");
      }

      lines.push("---");
      lines.push("");
    }

    // Footer
    lines.push("## How This Works");
    lines.push("");
    lines.push("1. **Users** click the 🚩 Flag button on any content they believe needs review.");
    lines.push("2. **The system** stores the flag in the database with a counter.");
    lines.push(
      "3. **This report** is generated by querying the database and formatting the results.",
    );
    lines.push("4. **Reviewers** go through each item and take action (fix, update, or dismiss).");
    lines.push("");
    lines.push("### Taking Action");
    lines.push("");
    lines.push("After reviewing an item, you can clear its flags by running:");
    lines.push("");
    lines.push("```sql");
    lines.push("DELETE FROM flagged_content WHERE id = '<item-id>';");
    lines.push("```");
    lines.push("");
    lines.push("Or clear all flags for a specific item by running:");
    lines.push("");
    lines.push("```sql");
    lines.push(
      "UPDATE flagged_content SET flag_count = 0 WHERE content_id = '<id>' AND field_name = '<field>';",
    );
    lines.push("```");
    lines.push("");
    lines.push("---");
    lines.push("");
    lines.push(`_Report generated with min-flags=${MIN_FLAGS}_`);

    // Write report
    fs.writeFileSync(OUTPUT_PATH, lines.join("\n"), "utf-8");
    console.log(`✅ Report written to ${OUTPUT_PATH}`);
    console.log(` ${flaggedRows.length} items, ${totalFlags} total flags`);
  } finally {
    // Close the client on success, early return, and failure alike.
    db.close();
  }
}

main().catch((err: unknown) => {
  console.error("❌ Failed to generate report:", err);
  process.exit(1);
});

View File

@@ -2,59 +2,113 @@
/**
* scrape-training-dataset.ts
*
* Collects a training dataset for fine-tuning by scraping DuckDuckGo image search.
* Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
*
* Targets:
* - 200 images per disease class (93 diseases)
* - 400 images for the "healthy" class
* - Full resolution images stored in data/dataset/{class_id}/
* Targets (tiered by plant type):
* - Core plants (houseplants + common garden): 100 images per disease
* - Full set (all 11,498 DB diseases): 10 images per disease
* - Healthy: 400 images
*
* DuckDuckGo approach (no API key needed):
* 1. Fetch the main search page to extract a vqd (query) token
* 2. Use the vqd token to paginate through image results
* 3. Download each image to the dataset directory
* Sources (all free, no API keys):
* 1. DB image_url — existing images already found
* 2. DuckDuckGo — general web image search
* 3. iNaturalist — real-world plant observation photos
* 4. Wikimedia Commons — curated scientific/educational images
*
* Usage: cd apps/web && npx tsx scripts/scrape-training-dataset.ts
*
* Progress is tracked in data/dataset/.progress.json — interrupt and resume safely.
* Progress: data/dataset/.progress.json — interrupt and resume safely.
*/
import "dotenv/config";
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
import { resolve, extname, join } from "path";
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, extname } from "path";
/**
 * Parses one line of a dotenv-style file.
 *
 * @returns `[key, value]`, or `null` for blank lines, `#` comments, and lines
 *   without a `KEY=` prefix. One pair of matching surrounding quotes is
 *   removed from the value, so `KEY="abc"` yields `abc` rather than `"abc"`.
 */
function parseEnvLine(line: string): [key: string, value: string] | null {
  const trimmed = line.trim();
  if (!trimmed || trimmed.startsWith("#")) return null;

  const eqIdx = trimmed.indexOf("=");
  if (eqIdx <= 0) return null;

  const key = trimmed.slice(0, eqIdx).trim();
  let value = trimmed.slice(eqIdx + 1).trim();
  const quote = value.charAt(0);
  if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
    value = value.slice(1, -1);
  }
  return [key, value];
}

// Load .env.development for DB creds
const envPath = resolve(__dirname, "../.env.development");
try {
  for (const line of readFileSync(envPath, "utf-8").split("\n")) {
    const entry = parseEnvLine(line);
    if (!entry) continue;
    const [key, val] = entry;
    // Variables already present in the environment take precedence over the file.
    if (!process.env[key]) process.env[key] = val;
  }
} catch {
  // Deliberately best-effort: if the file cannot be read, the script relies on
  // whatever is already in process.env.
}
import { getDb, closeDb } from "@/lib/db/index";
import { diseases } from "@/lib/db/schema";
// ─── Config ─────────────────────────────────────────────────────────────────
const DISEASES_JSON = resolve(__dirname, "../src/data/diseases.json");
const PLANTS_JSON = resolve(__dirname, "../src/data/plants.json");
const DATASET_DIR = resolve(__dirname, "../data/dataset");
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");
/** Target images per disease class */
const TARGET_PER_DISEASE = 200;
/** Target images per disease for CORE plants */
const TARGET_CORE = 100;
/** Target images for the "healthy" class (2× normal) */
/** Target images per disease for the FULL set */
const TARGET_FULL = 10;
/** Target images for the "healthy" class */
const TARGET_HEALTHY = 400;
/** Houseplant ids that belong to the core set. */
const CORE_HOUSEPLANTS = [
  "monstera",
  "pothos",
  "snake-plant",
  "peace-lily",
  "orchid",
  "succulent",
  "fiddle-leaf-fig",
  "aloe-vera",
  "cactus",
  "fern",
];

/** Garden plant ids that belong to the core set. */
const CORE_GARDEN_PLANTS = [
  "tomato",
  "basil",
  "rose",
  "pepper",
  "strawberry",
  "cucumber",
  "squash",
  "lettuce",
  "spinach",
  "cabbage",
  "lavender",
  "mint",
  "jasmine",
  "sunflower",
  "daisy",
  "zucchini",
  "bean",
  "eggplant",
  "chili",
];

/**
 * Plant ids that receive the higher per-disease image target: houseplants,
 * garden plants, and the "general" bucket for general disease patterns.
 */
const CORE_PLANTS = new Set([...CORE_HOUSEPLANTS, ...CORE_GARDEN_PLANTS, "general"]);
/** Delay between DuckDuckGo search API calls (ms) */
const SEARCH_DELAY = 1500;
/** Delay between image downloads (ms) */
const DOWNLOAD_DELAY = 300;
const DOWNLOAD_DELAY = 100;
/** Max concurrent downloads */
const CONCURRENT_DOWNLOADS = 5;
const CONCURRENT_DOWNLOADS = 10;
/** Minimum image size in bytes to accept (reject tiny placeholders) */
/** Minimum image size in bytes to accept */
const MIN_IMAGE_SIZE = 10_000; // 10KB
/** Maximum image size in bytes */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB
/** Allowed image content types */
const ALLOWED_CONTENT_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/webp", "image/gif"];
/** Allowed file extensions */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
@@ -62,22 +116,16 @@ const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];
const UA =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";
// ─── Types ──────────────────────────────────────────────────────────────────
interface DiseaseSeed {
interface DbDisease {
id: string;
plantId: string;
name: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[key: string]: any;
}
interface PlantSeed {
id: string;
commonName: string;
scientificName: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[key: string]: any;
imageUrl: string | null;
}
interface DuckDuckGoImageResult {
@@ -93,10 +141,7 @@ interface ClassProgress {
count: number;
downloaded: number;
failed: number;
skipped: number;
/** URLs we've already seen (to avoid duplicates) */
seenUrls: string[];
/** Whether we've exhausted search results */
exhausted: boolean;
}
@@ -105,15 +150,27 @@ interface Progress {
classes: Record<string, ClassProgress>;
}
/** Class ID for healthy plants */
const HEALTHY_CLASS = "healthy";
// ─── DB Loading ──────────────────────────────────────────────────────────────
/**
 * Reads the id, plant id, name and image URL of every row in the `diseases`
 * table, ordered by disease id.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  // Only the columns listed here are selected.
  const selectedColumns = {
    id: diseases.id,
    plantId: diseases.plantId,
    name: diseases.name,
    imageUrl: diseases.imageUrl,
  };
  const allDiseases = await getDb().select(selectedColumns).from(diseases).orderBy(diseases.id);
  return allDiseases;
}
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
/**
* Extract the vqd token from DuckDuckGo's search page.
* Required for paginating image results.
*/
async function getVqdToken(query: string): Promise<string> {
const url = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;
@@ -122,25 +179,15 @@ async function getVqdToken(query: string): Promise<string> {
signal: AbortSignal.timeout(15_000),
});
if (!res.ok) {
throw new Error(`Failed to get vqd token: ${res.status}`);
}
if (!res.ok) throw new Error(`Failed to get vqd token: ${res.status}`);
const html = await res.text();
// Extract vqd token from the HTML
// Format: vqd='<token>' or vqd="<token>"
const match = html.match(/vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/);
if (!match) {
throw new Error(`Could not extract vqd token from DuckDuckGo response for "${query}"`);
}
if (!match) throw new Error(`Could not extract vqd token for "${query}"`);
return match[1];
}
/**
* Fetch a page of DuckDuckGo image results.
*/
async function searchImagesDuckDuckGo(
query: string,
vqd: string,
@@ -161,12 +208,9 @@ async function searchImagesDuckDuckGo(
if (res.status === 429) {
console.warn(" ⚠ Rate limited (429). Waiting 10s...");
await sleep(10_000);
return searchImagesDuckDuckGo(query, vqd, page); // Retry
}
if (res.status === 403) {
console.warn(" ⚠ Forbidden (403). Token may have expired.");
return []; // Token expired — no more pages
return searchImagesDuckDuckGo(query, vqd, page);
}
if (res.status === 403) return [];
throw new Error(`DuckDuckGo search failed: ${res.status}`);
}
@@ -174,11 +218,7 @@ async function searchImagesDuckDuckGo(
return data.results ?? [];
}
/**
* Search DuckDuckGo images, automatically paginating to collect up to `target` results.
* Returns unique image URLs.
*/
async function collectImages(
async function collectImagesDuckDuckGo(
query: string,
target: number,
seenUrls: Set<string>,
@@ -188,27 +228,29 @@ async function collectImages(
let exhausted = false;
let consecutiveEmpty = 0;
// Get vqd token
let vqd: string;
try {
vqd = await getVqdToken(query);
} catch (err) {
console.warn(`Failed to get vqd token: ${err instanceof Error ? err.message : "unknown"}`);
console.warn(`DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
return { urls: [], exhausted: true };
}
while (results.length < target) {
const MAX_PAGES = 5;
let lowNoveltyCount = 0;
while (results.length < target && page <= MAX_PAGES) {
await sleep(SEARCH_DELAY);
let pageResults: DuckDuckGoImageResult[];
try {
pageResults = await searchImagesDuckDuckGo(query, vqd, page);
} catch (err) {
console.warn(`Search error: ${err instanceof Error ? err.message : "unknown"}`);
console.warn(`DDG error: ${err instanceof Error ? err.message : "unknown"}`);
break;
}
if (pageResults.length === 0) {
if (!pageResults || pageResults.length === 0) {
consecutiveEmpty++;
if (consecutiveEmpty >= 3) {
exhausted = true;
@@ -223,78 +265,160 @@ async function collectImages(
for (const r of pageResults) {
if (results.length >= target) break;
const imgUrl = r.image || r.url;
// Skip if we've already seen this URL
if (!imgUrl || typeof imgUrl !== "string") continue;
if (seenUrls.has(imgUrl)) continue;
// Validate URL looks like an image
const ext = extname(new URL(imgUrl).pathname).toLowerCase();
if (!ALLOWED_EXTENSIONS.includes(ext) && !ext) {
// No extension - still try, could be a CDN URL
try {
new URL(imgUrl);
} catch {
continue;
}
seenUrls.add(imgUrl);
results.push(imgUrl);
newCount++;
}
if (newCount === 0 && pageResults.every((r) => seenUrls.has(r.image || r.url))) {
// All results on this page were already seen
page++;
continue;
const newRatio = newCount / pageResults.length;
if (newRatio < 0.05) {
lowNoveltyCount++;
if (lowNoveltyCount >= 2) break;
} else {
lowNoveltyCount = 0;
}
if (results.length < target) {
page++;
}
if (results.length < target) page++;
}
return { urls: results.slice(0, target), exhausted };
}
// ─── iNaturalist API ─────────────────────────────────────────────────────────
/**
 * Rewrites an iNaturalist photo URL to its full-resolution variant.
 *
 * Photo URLs end in a size-specific file name such as `square.jpg` or
 * `medium.jpeg`. Every size variant is mapped to `original`; a URL that does
 * not end in a known size name is returned unchanged.
 */
function toInaturalistOriginalUrl(url: string): string {
  return url.replace(
    /\/(?:square|thumb|small|medium|large)(\.[A-Za-z0-9]+)(\?.*)?$/,
    (_match, ext: string, query: string | undefined) => `/original${ext}${query ?? ""}`,
  );
}

/**
 * Searches iNaturalist research-grade observations for photos matching `query`.
 *
 * A single request is made (at most 200 observations). Each photo URL is
 * upgraded to the original-size variant, and de-duplication is done on that
 * upgraded URL so the membership check and the stored value always agree.
 *
 * @param query Free-text search string.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs already collected; newly returned URLs are added to it.
 * @returns The new URLs, plus `exhausted: true` when fewer than `target` were
 *   found. HTTP and network failures yield `exhausted: false`.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  const perPage = Math.min(target, 200);
  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;
        const fullUrl = toInaturalistOriginalUrl(photo.url);
        if (seenUrls.has(fullUrl)) continue;
        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }
    return { urls: results, exhausted: results.length < target };
  } catch {
    // Best-effort source: keep whatever was collected and let the caller move on.
    return { urls: results, exhausted: false };
  }
}
// ─── Wikimedia Commons API ──────────────────────────────────────────────────
/**
 * Converts a Commons search hit title (e.g. `File:Leaf spot.jpg`) into a
 * `Special:FilePath` download URL.
 *
 * @returns The URL, or `null` when the file extension is not an accepted image
 *   type. The File namespace also holds PDFs, SVGs, audio and video.
 */
function commonsFileUrl(
  title: string,
  allowedExtensions: readonly string[] = ALLOWED_EXTENSIONS,
): string | null {
  const filename = title.replace(/^File:/, "");
  if (!allowedExtensions.includes(extname(filename).toLowerCase())) return null;
  return `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
}

/**
 * Searches the Wikimedia Commons File namespace for images matching `query`.
 *
 * Pages through the search API 50 hits at a time until `target` URLs are
 * collected, the API reports no further results, or the page cap is reached.
 * Hits that are not accepted image types are skipped so they do not use up
 * the target.
 *
 * @param query Free-text search string.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs already collected; newly returned URLs are added to it.
 * @returns The new URLs, plus `exhausted: true` when fewer than `target` were found.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  // Bounds the number of requests when most hits are filtered out.
  const maxPages = 10;
  const results: string[] = [];
  let sroffset = 0;

  for (let page = 0; page < maxPages && results.length < target; page++) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(sroffset),
      format: "json",
      origin: "*", // server-side API call
    });
    const url = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const res = await fetch(url, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!res.ok) break;

      const data = (await res.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };
      const hits = data.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (results.length >= target) break;
        const imgUrl = commonsFileUrl(hit.title);
        if (imgUrl === null || seenUrls.has(imgUrl)) continue;
        seenUrls.add(imgUrl);
        results.push(imgUrl);
      }

      // No continuation offset means this was the last page; skip the extra request.
      const nextOffset = data.continue?.sroffset;
      if (nextOffset === undefined) break;
      sroffset = nextOffset;
    } catch {
      // Best-effort source: stop paging and return what was collected.
      break;
    }
  }

  return { urls: results, exhausted: results.length < target };
}
// ─── Image Download ─────────────────────────────────────────────────────────
/**
* Download a single image from a URL to the target path.
* Returns true if successful, false otherwise.
*/
async function downloadImage(url: string, destPath: string): Promise<boolean> {
try {
const res = await fetch(url, {
headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg" },
headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
signal: AbortSignal.timeout(15_000),
});
if (!res.ok) return false;
const contentType = res.headers.get("content-type") || "";
const contentLength = parseInt(res.headers.get("content-length") || "0", 10);
// Validate content type
if (!ALLOWED_CONTENT_TYPES.some((t) => contentType.includes(t))) {
return false;
}
// Validate size
if (contentLength > 0 && contentLength < MIN_IMAGE_SIZE) return false;
if (contentLength > MAX_IMAGE_SIZE) return false;
if (contentType.includes("text/html")) return false;
const buffer = Buffer.from(await res.arrayBuffer());
// Double-check actual buffer size
if (buffer.length < MIN_IMAGE_SIZE) return false;
if (buffer.length > MAX_IMAGE_SIZE) return false;
// Determine correct extension from content type or URL
let ext = extname(new URL(url).pathname).toLowerCase();
if (!ALLOWED_EXTENSIONS.includes(ext)) {
// Map from content type
if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
else if (contentType.includes("png")) ext = ".png";
else if (contentType.includes("webp")) ext = ".webp";
else ext = ".jpg"; // Default
else ext = ".jpg";
}
const filePath = destPath.replace(/\.\w+$/, ext);
@@ -305,9 +429,6 @@ async function downloadImage(url: string, destPath: string): Promise<boolean> {
}
}
/**
* Download multiple images concurrently, respecting a per-download delay.
*/
async function downloadBatch(
urls: string[],
classDir: string,
@@ -317,7 +438,6 @@ async function downloadBatch(
let failed = 0;
let index = startIndex;
// Process in chunks to control concurrency
for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);
@@ -325,16 +445,23 @@ async function downloadBatch(
chunk.map(async (url) => {
const paddedIndex = String(index).padStart(4, "0");
const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
const success = await downloadImage(url, destPath);
await sleep(DOWNLOAD_DELAY);
return { success, index: index++ };
return { success, index: index++, url: url.substring(0, 50) };
}),
);
for (const r of results) {
if (r.success) downloaded++;
else failed++;
else {
failed++;
if (failed % 20 === 1) console.log(` ⚠ Failed: ${r.url}...`);
}
}
const total = downloaded + failed;
if (total % 30 === 0 || total === urls.length) {
console.log(` Progress: ${downloaded}/${urls.length} (${failed} failed)`);
}
}
@@ -361,7 +488,6 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
count: 0,
downloaded: 0,
failed: 0,
skipped: 0,
seenUrls: [],
exhausted: false,
};
@@ -369,26 +495,22 @@ function getClassProgress(progress: Progress, classId: string): ClassProgress {
return progress.classes[classId];
}
// ─── Search Query Building ──────────────────────────────────────────────────
// ─── Query Building ─────────────────────────────────────────────────────────
function buildSearchQueries(disease: DiseaseSeed, plant: PlantSeed | null): string[] {
const name = disease.name;
const plantName = plant?.commonName || disease.plantId;
return [
`${name} ${plantName} leaf disease`,
`${plantName} ${name} symptoms`,
`${name} plant disease`,
`${plantName} diseased leaf`,
];
/**
 * Builds the image-search phrases for one disease.
 *
 * The disease label is its name, or its id with dashes turned into spaces when
 * the name is empty. The plant label is the plant id with dashes turned into
 * spaces. Every phrase contains the disease label to avoid noisy labels.
 */
function buildSearchQueries(disease: DbDisease): string[] {
  const unslug = (slug: string): string => slug.split("-").join(" ");

  const diseaseLabel = disease.name ? disease.name : unslug(disease.id);
  const hostLabel = unslug(disease.plantId);

  const phrases: string[] = [];
  phrases.push(`${diseaseLabel} ${hostLabel} leaf disease`);
  phrases.push(`${hostLabel} ${diseaseLabel} symptoms`);
  phrases.push(`${diseaseLabel} ${hostLabel}`);
  return phrases;
}
function buildHealthyQueries(plant: PlantSeed): string[] {
function buildHealthyQueries(plant: string): string[] {
const name = plant.replace(/-/g, " ");
return [
`healthy ${plant.commonName} leaf`,
`${plant.commonName} leaf closeup`,
`healthy ${plant.commonName} plant`,
`${plant.commonName} foliage`,
`healthy ${name} leaf`,
`${name} leaf closeup`,
`healthy ${name} plant`,
`${name} foliage`,
];
}
/**
 * Gathers image URLs for one class and hands them to `downloadBatch`.
 *
 * NOTE(review): this span reads like a diff with the +/- markers stripped. The
 * opening of the signature is elided by the hunk header below, and superseded
 * lines (the `totalUrls` / `collectImages` flow) sit next to their replacements
 * (the `allUrls` / per-source flow). Confirm which lines are live before
 * relying on the notes here.
 *
 * Visible flow: return early when the class already has `target` images or is
 * marked exhausted; otherwise take URLs from `existingUrls`, then DuckDuckGo
 * (skipped when `fastMode` is true), then iNaturalist, then Wikimedia Commons,
 * each source consulted only while fewer than `target` URLs are held; persist
 * progress; call `downloadBatch`; update the class counters and persist again.
 *
 * @param target - Number of images wanted for this class.
 * @param progress - Progress record; the per-class entry is read, updated and
 *   written back with `saveProgress`.
 * @param classDir - Directory created if missing and passed to `downloadBatch`.
 * @param existingUrls - URLs consumed before any search source; entries already
 *   in the class's seen set are skipped.
 * @param fastMode - When true, the DuckDuckGo source is skipped.
 */
@@ -400,64 +522,97 @@ async function collectClassImages(
target: number,
progress: Progress,
classDir: string,
existingUrls: string[] = [],
fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
): Promise<void> {
const cp = getClassProgress(progress, classId);
const seenUrls = new Set(cp.seenUrls);
if (cp.count >= target) {
console.log(` ✓ Already have ${cp.count}/${target} images`);
console.log(` ✓ Already have ${cp.count}/${target}`);
return;
}
if (cp.exhausted) {
console.log(`Already exhausted search results (${cp.count}/${target} images)`);
console.log(`Exhausted (${cp.count}/${target})`);
return;
}
mkdirSync(classDir, { recursive: true });
// NOTE(review): `totalUrls` and the `collectImages` loop below look like the
// superseded flow; `allUrls` is the accumulator the per-source sections fill.
const totalUrls: string[] = [];
const allUrls: string[] = [];
let exhausted = false;
// Search with each query until we hit the target
for (const query of queries) {
if (totalUrls.length >= target) break;
console.log(` Searching: "${query}"...`);
const result = await collectImages(query, target - totalUrls.length, seenUrls);
totalUrls.push(...result.urls);
cp.seenUrls = Array.from(seenUrls);
if (result.exhausted) {
exhausted = true;
// ── Source 0: Existing DB URLs ──────────────────────────────────────────
const freshDbUrls = existingUrls.filter((u) => !seenUrls.has(u));
if (freshDbUrls.length > 0) {
console.log(` DB: ${freshDbUrls.length} existing URLs`);
for (const url of freshDbUrls) {
if (allUrls.length >= target) break;
seenUrls.add(url);
allUrls.push(url);
}
if (totalUrls.length >= target) break;
}
if (totalUrls.length === 0) {
// ── Source 1: DuckDuckGo ──────────────────────────────────────────────
// Skip DDG in fast mode (full set — DDG is slowest source)
if (!fastMode && allUrls.length < target) {
for (const query of queries) {
if (allUrls.length >= target) break;
process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
const result = await collectImagesDuckDuckGo(query, target - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
console.log(`${result.urls.length} new`);
if (allUrls.length >= target) break;
}
}
// ── Source 2: iNaturalist ──────────────────────────────────────────────
if (allUrls.length < target) {
// Only the first query is sent to this source.
const primaryQuery = queries[0];
console.log(` iNat: Searching...`);
const result = await searchImagesInaturalist(primaryQuery, target - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
console.log(` iNat: ${result.urls.length} images`);
}
// ── Source 3: Wikimedia Commons ────────────────────────────────────────
if (allUrls.length < target) {
// Only the first query is sent to this source.
const primaryQuery = queries[0];
console.log(` Commons: Searching...`);
const result = await searchImagesCommons(primaryQuery, target - allUrls.length, seenUrls);
allUrls.push(...result.urls);
if (result.exhausted) exhausted = true;
console.log(` Commons: ${result.urls.length} images`);
}
if (allUrls.length === 0) {
// Nothing to download: record whether any source reported exhaustion, then stop.
cp.exhausted = exhausted;
saveProgress(progress);
console.log(` ✗ No images found for "${classId}"`);
console.log(` ✗ No images found`);
return;
}
console.log(` Found ${totalUrls.length} unique image URLs. Downloading...`);
// Save progress with seen URLs BEFORE downloading
cp.seenUrls = Array.from(seenUrls);
cp.exhausted = exhausted;
saveProgress(progress);
// Download the images
// NOTE(review): two `downloadBatch` calls destructure the same names; the one
// taking `totalUrls` appears to be the superseded line — confirm.
const { downloaded, failed } = await downloadBatch(totalUrls, classDir, cp.count);
console.log(` Downloading ${allUrls.length} images...`);
const { downloaded, failed } = await downloadBatch(allUrls, classDir, cp.count);
cp.count += downloaded;
cp.downloaded += downloaded;
cp.failed += failed;
cp.exhausted = exhausted;
saveProgress(progress);
const pct = Math.round((cp.count / target) * 100);
console.log(
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
);
}
@@ -465,25 +620,18 @@ async function collectClassImages(
/**
 * Script entry point: collects the image dataset in phases and prints a summary.
 *
 * NOTE(review): this span reads like a diff with the +/- markers stripped.
 * Superseded lines (JSON seed loading, the single "Disease Images" phase, the
 * `collectImages` healthy loop, the dataset-size report) sit next to their
 * replacements, and each `@@ … @@` line marks a place where unchanged lines are
 * elided (e.g. the definitions of `progress` and `healthyCp` are not visible).
 * Confirm which lines are live before relying on the notes here.
 *
 * Visible flow of the newer lines: load diseases with `loadDiseasesFromDb`;
 * split them by whether `plantId` is in `CORE_PLANTS`; Phase 1 collects
 * `TARGET_CORE` images per core disease; Phase 2 collects `TARGET_FULL` images
 * per remaining disease with fast mode on; Phase 3 queries each healthy-image
 * source in turn until `TARGET_HEALTHY` URLs are held, then downloads them;
 * finally totals are printed and `closeDb` is awaited.
 */
async function main() {
console.log("=".repeat(60));
console.log("PLANT DISEASE DATASET COLLECTOR");
console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
console.log("=".repeat(60));
// Load knowledge base
const diseases = JSON.parse(readFileSync(DISEASES_JSON, "utf-8")) as DiseaseSeed[];
const plants = JSON.parse(readFileSync(PLANTS_JSON, "utf-8")) as PlantSeed[];
// Load diseases from DB
console.log("\nLoading diseases from database...");
const dbDiseases = await loadDiseasesFromDb();
console.log(` ${dbDiseases.length} diseases loaded`);
const plantMap = new Map<string, PlantSeed>();
for (const p of plants) {
plantMap.set(p.id, p);
}
console.log(`\nLoaded ${diseases.length} diseases, ${plants.length} plants`);
console.log(
`Target: ${TARGET_PER_DISEASE} images/disease (×${diseases.length} = ${diseases.length * TARGET_PER_DISEASE})`,
);
console.log(`Target: ${TARGET_HEALTHY} images for "healthy" class`);
console.log(`Output: ${DATASET_DIR}/`);
console.log("");
// Diseases are partitioned by whether their plant id is in CORE_PLANTS.
const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
console.log(` Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
console.log(` Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);
// Load progress
mkdirSync(DATASET_DIR, { recursive: true });
@@ -491,28 +639,46 @@ async function main() {
const startTime = Date.now();
// ── Phase 1: Disease classes ──────────────────────────────────────────────
console.log("─".repeat(60));
console.log("PHASE 1: Disease Images");
console.log("─".repeat(60));
for (let i = 0; i < diseases.length; i++) {
const disease = diseases[i];
const plant = plantMap.get(disease.plantId) ?? null;
const classDir = resolve(DATASET_DIR, disease.id);
const queries = buildSearchQueries(disease, plant);
const pct = Math.round((i / diseases.length) * 100);
console.log(`\n[${i + 1}/${diseases.length}] (${pct}%) ${disease.name} (${disease.id})`);
await collectClassImages(disease.id, queries, TARGET_PER_DISEASE, progress, classDir);
}
// ── Phase 2: Healthy class ────────────────────────────────────────────────
// ── Phase 1: Core set ──────────────────────────────────────────────────
console.log("\n" + "─".repeat(60));
console.log("PHASE 2: Healthy Plant Images");
console.log("PHASE 1: Core Diseases (100 images each)");
console.log("─".repeat(60));
// NOTE(review): the banner hard-codes "100 images each" while the target
// actually passed is TARGET_CORE (value not visible here) — confirm they agree.
for (let i = 0; i < coreDiseases.length; i++) {
const d = coreDiseases[i];
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d);
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
const pct = Math.round((i / coreDiseases.length) * 100);
console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
}
// ── Phase 2: Full set ──────────────────────────────────────────────────
console.log("\n" + "─".repeat(60));
console.log("PHASE 2: Full Disease Set (10 images each)");
console.log("─".repeat(60));
// NOTE(review): the banner hard-codes "10 images each" while the target
// actually passed is TARGET_FULL (value not visible here) — confirm they agree.
for (let i = 0; i < fullDiseases.length; i++) {
const d = fullDiseases[i];
const classDir = resolve(DATASET_DIR, d.id);
const queries = buildSearchQueries(d);
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
const pct = Math.round((i / fullDiseases.length) * 100);
console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
// The trailing `true` is the fastMode argument.
await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
}
// ── Phase 3: Healthy class ──────────────────────────────────────────────
console.log("\n" + "─".repeat(60));
console.log("PHASE 3: Healthy Plant Images");
console.log("─".repeat(60));
const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
@@ -520,39 +686,50 @@ async function main() {
const healthySeen = new Set(healthyCp.seenUrls);
if (healthyCp.count >= TARGET_HEALTHY) {
console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY} healthy images`);
console.log(`\n ✓ Already have ${healthyCp.count}/${TARGET_HEALTHY}`);
} else {
// Build a pool of healthy plant queries
// Collect all unique plants
const allPlants = [...new Set(dbDiseases.map((d) => d.plantId))];
const allHealthyQueries: string[] = [];
for (const plant of plants) {
for (const plant of allPlants) {
allHealthyQueries.push(...buildHealthyQueries(plant));
}
const healthySources = [
{ name: "DDG", collector: collectImagesDuckDuckGo },
{ name: "iNat", collector: searchImagesInaturalist },
{ name: "Commons", collector: searchImagesCommons },
] as const;
const totalHealthyUrls: string[] = [];
let healthyExhausted = false;
let anyRemaining = false;
for (const query of allHealthyQueries) {
for (const source of healthySources) {
if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
if (healthyExhausted) break;
console.log(`\n Source: ${source.name}`);
console.log(`\n Searching: "${query}"...`);
const result = await collectImages(
query,
TARGET_HEALTHY - totalHealthyUrls.length,
healthySeen,
);
// Only the first 20 healthy queries are tried per source.
for (const query of allHealthyQueries.slice(0, 20)) {
if (totalHealthyUrls.length >= TARGET_HEALTHY) break;
totalHealthyUrls.push(...result.urls);
if (result.exhausted) {
healthyExhausted = true;
process.stdout.write(` "${query}"... `);
const result = await source.collector(
query,
TARGET_HEALTHY - totalHealthyUrls.length,
healthySeen,
);
totalHealthyUrls.push(...result.urls);
if (!result.exhausted) anyRemaining = true;
console.log(`${result.urls.length} new`);
}
}
healthyCp.seenUrls = Array.from(healthySeen);
if (totalHealthyUrls.length > 0) {
console.log(`\n Found ${totalHealthyUrls.length} healthy image URLs. Downloading...`);
// Marked exhausted only when no query on any source reported remaining results.
healthyCp.exhausted = !anyRemaining;
saveProgress(progress);
console.log(`\n Downloading ${totalHealthyUrls.length} healthy images...`);
const { downloaded, failed } = await downloadBatch(
totalHealthyUrls,
healthyDir,
@@ -562,14 +739,12 @@ async function main() {
healthyCp.count += downloaded;
healthyCp.downloaded += downloaded;
healthyCp.failed += failed;
healthyCp.exhausted = healthyExhausted;
const pct = Math.round((healthyCp.count / TARGET_HEALTHY) * 100);
console.log(
` Got ${downloaded} images (${failed} failed). Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
` Got ${downloaded} images. Total: ${healthyCp.count}/${TARGET_HEALTHY} (${pct}%)`,
);
} else {
healthyCp.exhausted = true;
console.log(` ✗ No healthy images found`);
}
@@ -580,76 +755,27 @@ async function main() {
const elapsed = Math.round((Date.now() - startTime) / 1000);
const mins = Math.floor(elapsed / 60);
const secs = elapsed % 60;
const hrs = Math.floor(mins / 60);
let totalDownloaded = 0;
let totalFailed = 0;
let totalTarget = 0;
for (const [classId, cp] of Object.entries(progress.classes)) {
for (const [, cp] of Object.entries(progress.classes)) {
totalDownloaded += cp.downloaded || 0;
totalFailed += cp.failed || 0;
totalTarget += classId === HEALTHY_CLASS ? TARGET_HEALTHY : TARGET_PER_DISEASE;
}
const totalSize = await getDatasetSize();
const sizeGb = (totalSize / (1024 * 1024 * 1024)).toFixed(2);
console.log("\n" + "=".repeat(60));
console.log("COMPLETE");
console.log("=".repeat(60));
// NOTE(review): two "Time:" lines are present; the hours/minutes form does not
// use `secs` — confirm which one is live and whether `secs` is still needed.
console.log(` Time: ${mins}m ${secs}s`);
console.log(` Time: ${hrs}h ${mins % 60}m`);
console.log(` Downloaded: ${totalDownloaded} images`);
console.log(` Failed: ${totalFailed} images`);
console.log(` Target: ${totalTarget} images`);
console.log(` Dataset size: ${sizeGb} GB`);
console.log(` Dataset location: ${DATASET_DIR}/`);
console.log("");
console.log("Next steps:");
console.log(" 1. Run the fine-tuning script to train on this dataset");
console.log(" 2. The fine-tuning script will resize to 160×160 and augment");
console.log(` Dataset: ${DATASET_DIR}/`);
await closeDb();
console.log("=".repeat(60));
}
/**
 * Sum the on-disk size, in bytes, of every class directory under DATASET_DIR.
 *
 * Only immediate sub-directories whose name does not start with "." are
 * counted; loose files directly inside DATASET_DIR are ignored. Resolves to 0
 * when DATASET_DIR does not exist.
 */
async function getDatasetSize(): Promise<number> {
  if (!existsSync(DATASET_DIR)) return 0;

  return readdirSync(DATASET_DIR, { withFileTypes: true })
    .filter((item) => !item.name.startsWith(".") && item.isDirectory())
    .reduce((sum, item) => sum + dirSize(resolve(DATASET_DIR, item.name)), 0);
}
/**
 * Recursively total the byte size of all regular files beneath `dirPath`.
 *
 * @param dirPath - Directory to measure.
 * @returns Sum of file sizes in bytes; 0 when the directory cannot be read.
 */
function dirSize(dirPath: string): number {
  let bytes = 0;
  try {
    for (const item of readdirSync(dirPath, { withFileTypes: true })) {
      const itemPath = join(dirPath, item.name);
      if (item.isDirectory()) {
        bytes += dirSize(itemPath);
        continue;
      }
      if (item.isFile()) {
        bytes += statSync(itemPath).size;
      }
    }
  } catch {
    // Best effort: a failed read or stat ends the scan of this directory and
    // whatever was summed up to that point is returned.
  }
  return bytes;
}
/**
 * Return a promise that resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}