Prevalence data added
This commit is contained in:
1
apps/web/drizzle/0005_add-prevalence-score.sql
Normal file
1
apps/web/drizzle/0005_add-prevalence-score.sql
Normal file
@@ -0,0 +1 @@
|
||||
ALTER TABLE `diseases` ADD COLUMN `prevalence_score` integer DEFAULT 0 NOT NULL;
|
||||
@@ -36,6 +36,13 @@
|
||||
"when": 1751846400000,
|
||||
"tag": "0004_add-flagged-content",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 5,
|
||||
"version": "6",
|
||||
"when": 1751846400000,
|
||||
"tag": "0005_add-prevalence-score",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -51,7 +51,7 @@ interface DiseaseRow {
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
const POLITE_DELAY = 1100; // ms between calls
|
||||
const POLITE_DELAY = 800; // ms between calls
|
||||
const DB_FLUSH_BATCH = 50;
|
||||
const STATE_FILE = resolve(__dirname, ".ddg-progress.json");
|
||||
|
||||
@@ -163,6 +163,8 @@ async function main() {
|
||||
const query1 = `${d.name} on ${plantName} plant disease`;
|
||||
const query2 = `${d.scientificName || d.name} on ${plantName} disease`;
|
||||
const query3 = `${d.name} plant disease ${plantName}`;
|
||||
const query4 = `${d.name} plant`;
|
||||
const query5 = `${d.name} symptom`;
|
||||
|
||||
process.stdout.write(
|
||||
` [${String(i + 1).padStart(4)}/${pending.length}] [${sev}] ${d.name.substring(0, 42).padEnd(44)} `,
|
||||
@@ -170,7 +172,7 @@ async function main() {
|
||||
|
||||
// Try queries in order until we get a result
|
||||
let url: string | null = null;
|
||||
for (const q of [query1, query2, query3]) {
|
||||
for (const q of [query1, query2, query3, query4, query5]) {
|
||||
url = await searchImage(q);
|
||||
if (url) break;
|
||||
}
|
||||
|
||||
768
apps/web/scripts/fill-training-dataset.ts
Normal file
768
apps/web/scripts/fill-training-dataset.ts
Normal file
@@ -0,0 +1,768 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* fill-training-dataset.ts
|
||||
*
|
||||
* Scans the existing dataset directory and downloads any missing images
|
||||
* to reach the target counts (200 per disease, 400 for healthy).
|
||||
*
|
||||
* Does NOT re-run prevalence queries — just fills gaps from image sources.
|
||||
* Each run scans the directory, reports deficits, then fills them.
|
||||
* Interrupt-safe: re-run to pick up where you left off.
|
||||
*
|
||||
* Usage: cd apps/web && npx tsx scripts/fill-training-dataset.ts
|
||||
*/
|
||||
|
||||
import "dotenv/config";
|
||||
import { readFileSync, readdirSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||||
import { resolve, extname } from "path";
|
||||
|
||||
// Seed process.env from .env.development (DB credentials live there).
// Variables that already hold a value are left alone, and a missing or
// unreadable file is deliberately ignored (best effort).
const envPath = resolve(__dirname, "../.env.development");
try {
  const rawEnv = readFileSync(envPath, "utf-8");
  rawEnv.split("\n").forEach((rawLine) => {
    const entry = rawLine.trim();
    // Skip blank lines and comments.
    if (!entry || entry.startsWith("#")) return;

    // A usable entry needs a non-empty key before the first "=".
    const separatorAt = entry.indexOf("=");
    if (separatorAt <= 0) return;

    const name = entry.slice(0, separatorAt).trim();
    const value = entry.slice(separatorAt + 1).trim();
    if (!process.env[name]) process.env[name] = value;
  });
} catch {}
|
||||
|
||||
import { getDb, closeDb } from "@/lib/db/index";
|
||||
import { diseases } from "@/lib/db/schema";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Root folder of the dataset; each class gets its own sub-directory here. */
const DATASET_DIR = resolve(__dirname, "../data/dataset");
/** JSON file inside the dataset folder that remembers URLs already collected. */
const SEEN_CACHE_FILE = resolve(DATASET_DIR, ".fill-seen-urls.json");

/** How many images every disease class should end up with. */
const TARGET_PER_DISEASE = 200;

/** How many images the "healthy" class should end up with. */
const TARGET_HEALTHY = 400;

/** Pause before each DuckDuckGo search request, in milliseconds. */
const SEARCH_DELAY = 1500;

/** Size of one parallel download chunk within a single class. */
const CONCURRENT_DOWNLOADS = 30;

/** How many diseases are filled side by side. */
const DISEASE_CONCURRENCY = 5;

/** Downloads smaller than this many bytes are rejected. */
const MIN_IMAGE_SIZE = 10_000; // 10KB

/** Downloads larger than this many bytes are rejected. */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB

/** File extensions accepted as-is from the image URL. */
const ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp"];

/** Browser-like User-Agent header sent with every outgoing request. */
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Sub-directory name (and cache key) of the healthy class. */
const HEALTHY_CLASS = "healthy";
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One entry of the `results` array returned by the DuckDuckGo image endpoint. */
interface DuckDuckGoImageResult {
  /** Preferred image URL; the collector falls back to `url` when it is empty. */
  image: string;
  /** Fallback URL used when `image` is empty. */
  url: string;
  thumbnail: string;
  title: string;
  width: number;
  height: number;
}
|
||||
|
||||
/** A disease whose class directory is still short of its image target. */
interface DiseaseInfo {
  /** Disease id; doubles as the class directory name and the seen-URL cache key. */
  id: string;
  /** Disease name used when building search queries. */
  name: string;
  /** Plant identifier used when building search queries. */
  plantId: string;
  /** Images currently on disk for this disease. */
  have: number;
  /** Images still missing to reach the per-disease target. */
  needed: number;
}
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Returns a promise that resolves once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Counts the entries of `dir` whose name starts with `img_`.
 *
 * @param dir Directory to inspect.
 * @returns The number of matching entries, or 0 when the directory does not
 *          exist or cannot be read.
 */
function countImagesInDir(dir: string): number {
  if (!existsSync(dir)) return 0;

  let total = 0;
  try {
    for (const entry of readdirSync(dir)) {
      if (entry.startsWith("img_")) total += 1;
    }
  } catch {
    return 0;
  }
  return total;
}
|
||||
|
||||
/**
 * Renders a byte count for display as `B`, `KB` or `MB`.
 * KB and MB values are shown with one decimal place.
 */
function formatBytes(bytes: number): string {
  const KIB = 1024;
  const MIB = KIB * 1024;

  if (bytes >= MIB) {
    return `${(bytes / MIB).toFixed(1)} MB`;
  }
  if (bytes >= KIB) {
    return `${(bytes / KIB).toFixed(1)} KB`;
  }
  return `${bytes} B`;
}
|
||||
|
||||
// ─── Seen-URLs Cache ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load the per-class seen-URLs cache from disk.
 * This prevents re-fetching the same URLs across runs.
 *
 * The file content is validated before use: anything that is not a plain
 * object of string arrays (for example a truncated or hand-edited file) is
 * discarded so callers can index the result safely.
 *
 * @returns Map of class id → previously seen URLs; empty when the cache file
 *          is missing, unreadable or malformed.
 */
function loadSeenUrlsCache(): Record<string, string[]> {
  if (!existsSync(SEEN_CACHE_FILE)) return {};

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(SEEN_CACHE_FILE, "utf-8"));
  } catch (err) {
    // A corrupt cache only costs duplicate lookups, so warn and start fresh.
    console.warn(
      ` ⚠ Ignoring unreadable seen-URLs cache: ${err instanceof Error ? err.message : "unknown"}`,
    );
    return {};
  }

  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    return {};
  }

  const cache: Record<string, string[]> = {};
  for (const [classId, urls] of Object.entries(parsed)) {
    if (Array.isArray(urls)) {
      cache[classId] = urls.filter((u): u is string => typeof u === "string");
    }
  }
  return cache;
}
|
||||
|
||||
/**
 * Persist the seen-URLs cache as pretty-printed JSON (2-space indent),
 * overwriting the previous cache file.
 */
function saveSeenUrlsCache(cache: Record<string, string[]>): void {
  const serialized = JSON.stringify(cache, null, 2);
  writeFileSync(SEEN_CACHE_FILE, serialized);
}
|
||||
|
||||
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetches the DuckDuckGo image-search page for `query` and extracts the
 * `vqd` token embedded in its HTML.
 *
 * @param query Search terms.
 * @returns The extracted token string.
 * @throws Error when the page request is not OK or no token is found.
 */
async function getVqdToken(query: string): Promise<string> {
  const searchPage = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&t=h_&iax=images&ia=images`;

  const response = await fetch(searchPage, {
    headers: { "User-Agent": UA, Accept: "text/html" },
    signal: AbortSignal.timeout(15_000),
  });
  if (!response.ok) {
    throw new Error(`Failed to get vqd token: ${response.status}`);
  }

  const body = await response.text();
  const found = /vqd['"]?\s*[:=]\s*['"]([a-f0-9-]+)['"]/.exec(body);
  if (found === null) {
    throw new Error(`Could not extract vqd token for "${query}"`);
  }
  return found[1];
}
|
||||
|
||||
/**
 * Requests one page of DuckDuckGo image results.
 *
 * @param query Search terms.
 * @param vqd Token obtained from `getVqdToken` for the same query.
 * @param page Value sent as the `p` query parameter.
 * @param maxRateLimitRetries How many times a 429 response is retried (after a
 *        10 s pause each) before giving up. Previously the retry was unbounded,
 *        so persistent rate limiting would stall the run forever.
 * @returns The `results` array of the response; empty on HTTP 403 or when the
 *          payload carries no results array.
 * @throws Error on any other non-OK status, or when still rate limited after
 *         the allowed retries.
 */
async function searchImagesDuckDuckGo(
  query: string,
  vqd: string,
  page: number,
  maxRateLimitRetries = 3,
): Promise<DuckDuckGoImageResult[]> {
  const encodedQuery = encodeURIComponent(query);
  const url = `https://duckduckgo.com/i.js?q=${encodedQuery}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
  const referer = `https://duckduckgo.com/?q=${encodedQuery}&t=h_&iax=images&ia=images`;

  for (let attempt = 0; ; attempt++) {
    const res = await fetch(url, {
      headers: {
        "User-Agent": UA,
        Accept: "application/json",
        Referer: referer,
      },
      // A fresh timeout per attempt so retries are not cut short.
      signal: AbortSignal.timeout(15_000),
    });

    if (res.ok) {
      const data = (await res.json()) as { results?: DuckDuckGoImageResult[] } | null;
      return Array.isArray(data?.results) ? data.results : [];
    }

    if (res.status === 403) return [];
    if (res.status !== 429) {
      throw new Error(`DuckDuckGo search failed: ${res.status}`);
    }
    if (attempt >= maxRateLimitRetries) {
      throw new Error(
        `DuckDuckGo search failed: still rate limited (429) after ${maxRateLimitRetries} retries`,
      );
    }

    console.warn(" ⚠ DDG rate limited (429). Waiting 10s...");
    await sleep(10_000);
  }
}
|
||||
|
||||
/**
 * Gathers up to `target` previously unseen image URLs from DuckDuckGo.
 *
 * Walks at most five result pages, pausing `SEARCH_DELAY` ms before each
 * request. Stops early on a request error, after three empty pages in a row
 * (reported as exhausted), or after two pages in a row where under 5% of the
 * results were new.
 *
 * @param query Search terms.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs to skip; every accepted URL is added to this set.
 * @returns The accepted URLs and whether the source was judged exhausted
 *          (also true when the vqd token could not be obtained).
 */
async function collectImagesDuckDuckGo(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const MAX_PAGES = 5;
  const collected: string[] = [];

  const isParsableUrl = (candidate: string): boolean => {
    try {
      new URL(candidate);
      return true;
    } catch {
      return false;
    }
  };

  let token: string;
  try {
    token = await getVqdToken(query);
  } catch (err) {
    console.warn(` ⚠ DDG token failed: ${err instanceof Error ? err.message : "unknown"}`);
    return { urls: [], exhausted: true };
  }

  let exhausted = false;
  let emptyStreak = 0;
  let staleStreak = 0;

  for (let page = 1; collected.length < target && page <= MAX_PAGES; page++) {
    await sleep(SEARCH_DELAY);

    let hits: DuckDuckGoImageResult[];
    try {
      hits = await searchImagesDuckDuckGo(query, token, page);
    } catch (err) {
      console.warn(` ⚠ DDG error: ${err instanceof Error ? err.message : "unknown"}`);
      break;
    }

    if (!hits || hits.length === 0) {
      emptyStreak += 1;
      if (emptyStreak >= 3) {
        exhausted = true;
        break;
      }
      continue;
    }
    emptyStreak = 0;

    const countBefore = collected.length;
    for (const hit of hits) {
      if (collected.length >= target) break;
      const candidate = hit.image || hit.url;
      if (!candidate || typeof candidate !== "string") continue;
      if (seenUrls.has(candidate)) continue;
      if (!isParsableUrl(candidate)) continue;
      seenUrls.add(candidate);
      collected.push(candidate);
    }

    // Share of this page's results that were actually new.
    const freshShare = (collected.length - countBefore) / hits.length;
    if (freshShare >= 0.05) {
      staleStreak = 0;
      continue;
    }
    staleStreak += 1;
    if (staleStreak >= 2) break;
  }

  return { urls: collected.slice(0, target), exhausted };
}
|
||||
|
||||
// ─── iNaturalist API ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collects up to `target` photo URLs from research-grade iNaturalist
 * observations matching `query` (single request, newest observations first).
 *
 * Each photo URL is rewritten to its `original` size variant *before* the
 * seen-check, so the URL that is tested is the same one that gets stored.
 * Previously the raw URL was tested while the rewritten one was stored, which
 * meant the dedupe check could never match on later runs.
 *
 * @param query Free-text search terms.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs to skip; every accepted URL is added to this set.
 * @returns The accepted URLs; `exhausted` is true when a successful response
 *          yielded fewer than `target` URLs, false on HTTP or network failure.
 */
async function searchImagesInaturalist(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const results: string[] = [];
  if (target <= 0) return { urls: results, exhausted: false };

  // The request asks for at most 200 observations, and never fewer than one.
  const perPage = Math.max(1, Math.min(target, 200));

  const apiUrl =
    `https://api.inaturalist.org/v1/observations` +
    `?q=${encodeURIComponent(query)}` +
    `&photos_only=true` +
    `&quality_grade=research` +
    `&per_page=${perPage}` +
    `&order_by=observed_on&order=desc`;

  try {
    const res = await fetch(apiUrl, {
      headers: { "User-Agent": UA, Accept: "application/json" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return { urls: [], exhausted: false };

    const data = (await res.json()) as {
      results?: Array<{ photos?: Array<{ url?: string }> }>;
    };

    for (const obs of data.results ?? []) {
      if (results.length >= target) break;
      for (const photo of obs.photos ?? []) {
        if (results.length >= target) break;
        if (!photo.url) continue;

        // NOTE(review): the API is understood to return thumbnail-sized URLs
        // (typically ".../square.jpg"); upgrade any known size token, not just
        // "medium", so full-size photos are requested — confirm against API docs.
        const fullUrl = photo.url.replace(/\/(?:square|small|medium|large)\./, "/original.");
        if (seenUrls.has(fullUrl)) continue;

        seenUrls.add(fullUrl);
        results.push(fullUrl);
      }
    }

    return { urls: results, exhausted: results.length < target };
  } catch {
    // Network/parse failure: report what was gathered, without claiming exhaustion.
    return { urls: results, exhausted: false };
  }
}
|
||||
|
||||
// ─── Wikimedia Commons API ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collects up to `target` file URLs from Wikimedia Commons search results
 * (namespace 6, 50 hits per request), paging via `sroffset`.
 *
 * Paging stops on a non-OK response, an empty page, or any thrown error.
 *
 * @param query Search terms passed as `srsearch`.
 * @param target Maximum number of URLs to return.
 * @param seenUrls URLs to skip; every accepted URL is added to this set.
 * @returns `Special:FilePath` URLs for the matching titles; `exhausted` is true
 *          when fewer than `target` URLs were gathered.
 */
async function searchImagesCommons(
  query: string,
  target: number,
  seenUrls: Set<string>,
): Promise<{ urls: string[]; exhausted: boolean }> {
  const found: string[] = [];
  let offset = 0;

  while (found.length < target) {
    const params = new URLSearchParams({
      action: "query",
      list: "search",
      srsearch: query,
      srnamespace: "6",
      srlimit: "50",
      sroffset: String(offset),
      format: "json",
    });
    const endpoint = `https://commons.wikimedia.org/w/api.php?${params}`;

    try {
      const response = await fetch(endpoint, {
        headers: { "User-Agent": UA },
        signal: AbortSignal.timeout(10_000),
      });
      if (!response.ok) break;

      const payload = (await response.json()) as {
        query?: { search?: Array<{ title: string }> };
        continue?: { sroffset?: number };
      };

      const hits = payload.query?.search ?? [];
      if (hits.length === 0) break;

      for (const hit of hits) {
        if (found.length >= target) break;

        const bareName = hit.title.replace(/^File:/, "");
        const fileUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
          bareName,
        )}`;
        if (seenUrls.has(fileUrl)) continue;

        seenUrls.add(fileUrl);
        found.push(fileUrl);
      }

      // Prefer the server-provided continuation offset; otherwise step past this page.
      offset = payload.continue?.sroffset ?? offset + hits.length;
    } catch {
      break;
    }
  }

  return { urls: found, exhausted: found.length < target };
}
|
||||
|
||||
// ─── Image Download ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Downloads `url` and writes it next to `destPath`, replacing the extension of
 * `destPath` with one that matches the downloaded image.
 *
 * Rejections (returns false, writes nothing):
 * - non-OK response or any thrown error (timeout, network, invalid URL);
 * - a declared content type that is neither `image/*` nor a generic binary
 *   type — previously only `text/html` was rejected, so JSON or other
 *   payloads could be stored as `.jpg`;
 * - an image type other than JPEG/PNG/WebP when the URL extension is not an
 *   allowed one (e.g. GIF/SVG would otherwise be mislabeled as `.jpg`);
 * - body size outside `MIN_IMAGE_SIZE`..`MAX_IMAGE_SIZE`.
 *
 * @param url Image URL to fetch.
 * @param destPath Target path; its trailing `.ext` is swapped for the detected one.
 * @returns true when a file was written.
 */
async function downloadImage(url: string, destPath: string): Promise<boolean> {
  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA, Accept: "image/webp,image/png,image/jpeg,*/*" },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) return false;

    const contentType = (res.headers.get("content-type") || "").toLowerCase();
    const mime = contentType.split(";")[0].trim();
    const isImage = mime.startsWith("image/");
    const isGenericBinary =
      mime === "" || mime === "application/octet-stream" || mime === "binary/octet-stream";
    if (!isImage && !isGenericBinary) return false;

    // Skip buffering bodies that already declare themselves too large.
    const declaredLength = Number(res.headers.get("content-length"));
    if (Number.isFinite(declaredLength) && declaredLength > MAX_IMAGE_SIZE) return false;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < MIN_IMAGE_SIZE) return false;
    if (buffer.length > MAX_IMAGE_SIZE) return false;

    let ext = extname(new URL(url).pathname).toLowerCase();
    if (!ALLOWED_EXTENSIONS.includes(ext)) {
      if (mime.includes("jpeg") || mime.includes("jpg")) ext = ".jpg";
      else if (mime.includes("png")) ext = ".png";
      else if (mime.includes("webp")) ext = ".webp";
      else if (isImage) return false; // unsupported image format; don't mislabel it
      else ext = ".jpg"; // type unknown: keep the historical default
    }

    const filePath = destPath.replace(/\.\w+$/, ext);
    writeFileSync(filePath, buffer);
    return true;
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Returns the first image index above every `img_<n>.*` file already present
 * in `classDir` (0 when there are none or the directory cannot be read).
 */
function nextFreeImageIndex(classDir: string): number {
  let highest = -1;
  try {
    for (const file of readdirSync(classDir)) {
      const match = /^img_(\d+)\./.exec(file);
      if (match) highest = Math.max(highest, Number(match[1]));
    }
  } catch {
    // Unreadable/missing directory: nothing to collide with.
  }
  return highest + 1;
}

/**
 * Downloads `urls` into `classDir` in chunks of `CONCURRENT_DOWNLOADS`,
 * naming files `img_<4-digit index>.<ext>`.
 *
 * Every download reserves its own index synchronously before any `await`.
 * Previously the index was read before the await and incremented after it, so
 * all downloads of a chunk targeted the same file name and overwrote each
 * other. Numbering also starts above the highest index already on disk, so
 * gaps left by earlier failed downloads cannot cause existing files to be
 * overwritten.
 *
 * @param urls Image URLs to fetch.
 * @param classDir Destination directory.
 * @param startIndex Lowest index the caller wants used.
 * @returns Counts of successful/failed downloads and the next unused index.
 */
async function downloadBatch(
  urls: string[],
  classDir: string,
  startIndex: number,
): Promise<{ downloaded: number; failed: number; lastIndex: number }> {
  let downloaded = 0;
  let failed = 0;
  let index = Math.max(startIndex, nextFreeImageIndex(classDir));

  for (let i = 0; i < urls.length; i += CONCURRENT_DOWNLOADS) {
    const chunk = urls.slice(i, i + CONCURRENT_DOWNLOADS);

    const outcomes = await Promise.all(
      chunk.map((url) => {
        // Reserve the slot now, while still running synchronously.
        const slot = index++;
        const destPath = resolve(classDir, `img_${String(slot).padStart(4, "0")}.jpg`);
        return downloadImage(url, destPath);
      }),
    );

    for (const success of outcomes) {
      if (success) downloaded++;
      else failed++;
    }

    const total = downloaded + failed;
    if (total % 30 === 0 || total === urls.length) {
      process.stdout.write(`\r Progress: ${downloaded}/${urls.length} (${failed} failed)`);
    }
  }
  console.log();

  return { downloaded, failed, lastIndex: index };
}
|
||||
|
||||
// ─── Query Building ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds the image-search queries for one disease, most specific first.
 *
 * Hyphens in the plant identifier are turned into spaces, matching
 * `buildHealthyQueries`, so an id like `snake-plant` is searched as
 * "snake plant" rather than as a literal hyphenated token.
 *
 * @param name Disease name.
 * @param plant Plant identifier (may be hyphenated).
 * @returns Three query strings.
 */
function buildSearchQueries(name: string, plant: string): string[] {
  const plantLabel = plant.replace(/-/g, " ");
  return [
    `${name} ${plantLabel} leaf disease`,
    `${plantLabel} ${name} symptoms`,
    `${name} ${plantLabel}`,
  ];
}
|
||||
|
||||
/**
 * Builds the four "healthy plant" search queries for a plant identifier.
 * Hyphens in the identifier are replaced by spaces first.
 */
function buildHealthyQueries(plant: string): string[] {
  const label = plant.split("-").join(" ");
  const queries: string[] = [];
  queries.push(`healthy ${label} leaf`);
  queries.push(`${label} leaf closeup`);
  queries.push(`healthy ${label} plant`);
  queries.push(`${label} foliage`);
  return queries;
}
|
||||
|
||||
// ─── Fill Logic ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Gathers candidate URLs for one class and downloads them into `classDir`.
 *
 * Sources are consulted in order until `needed` candidates are found:
 * DuckDuckGo (each query in turn, stopping once a query reports exhaustion),
 * then iNaturalist and Wikimedia Commons (first query only).
 *
 * @param diseaseId Class identifier (not used inside this function).
 * @param queries Search queries, most specific first.
 * @param needed Number of additional images wanted; nothing happens when <= 0.
 * @param classDir Destination directory (created if missing).
 * @param seenUrls URLs to skip; the collectors add every accepted URL to it.
 * @returns How many more `img_*` files the directory holds afterwards.
 */
async function fillClass(
  diseaseId: string,
  queries: string[],
  needed: number,
  classDir: string,
  seenUrls: Set<string>,
): Promise<number> {
  if (needed <= 0) return 0;

  mkdirSync(classDir, { recursive: true });

  const candidates: string[] = [];
  const stillMissing = (): number => needed - candidates.length;

  // ── Source 1: DuckDuckGo ───────────────────────────────────────────────
  for (const query of queries) {
    if (stillMissing() <= 0) break;
    process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
    const { urls, exhausted } = await collectImagesDuckDuckGo(query, stillMissing(), seenUrls);
    candidates.push(...urls);
    console.log(`${urls.length} new`);
    if (exhausted) break;
  }

  // ── Source 2: iNaturalist ──────────────────────────────────────────────
  if (stillMissing() > 0) {
    process.stdout.write(` iNat: Searching... `);
    const { urls } = await searchImagesInaturalist(queries[0], stillMissing(), seenUrls);
    candidates.push(...urls);
    console.log(`${urls.length} new`);
  }

  // ── Source 3: Wikimedia Commons ────────────────────────────────────────
  if (stillMissing() > 0) {
    process.stdout.write(` Commons: Searching... `);
    const { urls } = await searchImagesCommons(queries[0], stillMissing(), seenUrls);
    candidates.push(...urls);
    console.log(`${urls.length} new`);
  }

  if (candidates.length === 0) {
    console.log(` ✗ No new images found from any source`);
    return 0;
  }

  console.log(` Downloading ${candidates.length} images...`);
  const countBefore = countImagesInDir(classDir);
  const { downloaded, failed } = await downloadBatch(candidates, classDir, countBefore);

  // Measure the gain from what is actually on disk.
  const gained = countImagesInDir(classDir) - countBefore;
  const mark = downloaded > 0 ? "✓" : "✗";
  console.log(
    ` ${mark} Downloaded ${downloaded}/${candidates.length}` +
      ` (${failed} failed, ${gained} new files)`,
  );

  return gained;
}
|
||||
|
||||
// ─── Directory Scanner ─────────────────────────────────────────────────────
|
||||
|
||||
/** Snapshot of what the dataset directory currently contains. */
interface ScanResult {
  /** Image count per disease directory name; directories without images are omitted. */
  diseaseCounts: Map<string, number>;
  /** Image count of the healthy class directory. */
  healthyCount: number;
}
|
||||
|
||||
/**
 * Walks the top level of `DATASET_DIR` and counts images per class directory.
 *
 * Non-directories and dot-prefixed entries are skipped. The directory named
 * `HEALTHY_CLASS` feeds `healthyCount`; every other directory with at least
 * one image is recorded in `diseaseCounts`.
 *
 * @returns Empty counts when the dataset directory does not exist.
 */
function scanDataset(): ScanResult {
  const diseaseCounts = new Map<string, number>();
  if (!existsSync(DATASET_DIR)) {
    return { diseaseCounts, healthyCount: 0 };
  }

  let healthyCount = 0;
  const classDirs = readdirSync(DATASET_DIR, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && !entry.name.startsWith("."),
  );

  for (const { name } of classDirs) {
    const imageCount = countImagesInDir(resolve(DATASET_DIR, name));
    if (name === HEALTHY_CLASS) {
      healthyCount = imageCount;
      continue;
    }
    if (imageCount > 0) diseaseCounts.set(name, imageCount);
  }

  return { diseaseCounts, healthyCount };
}
|
||||
|
||||
// ─── Main ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: scans the dataset directory, loads the disease list from
 * the database, reports per-class deficits, fills them (diseases in parallel
 * batches of `DISEASE_CONCURRENCY`, then the healthy class) and prints a
 * summary.
 *
 * The seen-URLs cache is saved after every disease and every batch so an
 * interrupted run loses as little progress as possible.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("TRAINING DATASET FILL — Gap-filling download");
  console.log("=".repeat(60));

  // Ensure dataset directory exists
  mkdirSync(DATASET_DIR, { recursive: true });

  // ── Step 1: Scan what we already have ────────────────────────────────────
  console.log("\nScanning existing dataset...");
  const { diseaseCounts, healthyCount } = scanDataset();
  console.log(` Found ${diseaseCounts.size} disease directories, ${healthyCount} healthy images`);

  // ── Step 2: Load disease info from DB ────────────────────────────────────
  console.log("\nLoading disease info from database...");
  const db = getDb();

  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
    })
    .from(diseases);

  // Build a deduplicated map: disease id → first disease info found
  const diseaseInfo = new Map<string, { name: string; plantId: string }>();
  for (const d of allDiseases) {
    if (!diseaseInfo.has(d.id)) {
      diseaseInfo.set(d.id, { name: d.name, plantId: d.plantId });
    }
  }
  console.log(` Loaded ${diseaseInfo.size} unique diseases from DB`);

  // ── Step 3: Build deficit list ──────────────────────────────────────────
  const deficits: DiseaseInfo[] = [];

  for (const [id, info] of diseaseInfo) {
    const have = diseaseCounts.get(id) ?? 0;
    const needed = TARGET_PER_DISEASE - have;
    if (needed > 0) {
      deficits.push({ id, name: info.name, plantId: info.plantId, have, needed });
    }
  }

  // Sort by deficit size (largest first) so we prioritize the neediest diseases
  deficits.sort((a, b) => b.needed - a.needed);

  const healthyDeficit = TARGET_HEALTHY - healthyCount;

  console.log(`\n${"=".repeat(60)}`);
  console.log("DEFICIT REPORT");
  console.log(`${"=".repeat(60)}`);
  console.log(` Diseases needing images: ${deficits.length}/${diseaseInfo.size}`);
  console.log(` Total images missing: ${deficits.reduce((s, d) => s + d.needed, 0)}`);
  console.log(` Healthy deficit: ${Math.max(0, healthyDeficit)}`);
  console.log(`${"=".repeat(60)}`);

  if (deficits.length === 0 && healthyDeficit <= 0) {
    console.log("\n ✓ Nothing to do — all targets met!\n");
    await closeDb();
    return;
  }

  // ── Step 4: Load seen-URLs cache ────────────────────────────────────────
  const seenUrlsCache = loadSeenUrlsCache();
  let totalDownloaded = 0;
  const startTime = Date.now();

  // ── Step 5: Fill disease deficits ───────────────────────────────────────
  if (deficits.length > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING ${deficits.length} DISEASES (target: ${TARGET_PER_DISEASE} each)`);
    console.log("─".repeat(60));

    const totalBatches = Math.ceil(deficits.length / DISEASE_CONCURRENCY);

    // Process in parallel batches
    for (let i = 0; i < deficits.length; i += DISEASE_CONCURRENCY) {
      const batch = deficits.slice(i, i + DISEASE_CONCURRENCY);
      const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;

      console.log(`\n[Batch ${batchNum}/${totalBatches}] Processing ${batch.length} diseases...`);

      await Promise.all(
        batch.map(async (d) => {
          const classDir = resolve(DATASET_DIR, d.id);
          const queries = buildSearchQueries(d.name, d.plantId);
          const seen = new Set<string>(seenUrlsCache[d.id] ?? []);

          console.log(
            ` [${d.id}] have ${d.have}, need ${d.needed} more` + ` (${d.name} / ${d.plantId})`,
          );

          const gained = await fillClass(d.id, queries, d.needed, classDir, seen);

          // Update seen-URLs cache for this disease
          seenUrlsCache[d.id] = Array.from(seen);
          saveSeenUrlsCache(seenUrlsCache);

          totalDownloaded += gained;
        }),
      );

      // Save seen cache after every batch
      saveSeenUrlsCache(seenUrlsCache);

      const elapsed = Math.round((Date.now() - startTime) / 1000);
      console.log(
        ` [Batch ${batchNum}/${totalBatches}] checkpoint — ` +
          `${totalDownloaded} downloaded so far (${elapsed}s elapsed)`,
      );
    }
  }

  // ── Step 6: Fill healthy deficit ────────────────────────────────────────
  if (healthyDeficit > 0) {
    console.log("\n" + "─".repeat(60));
    console.log(`FILLING HEALTHY CLASS (target: ${TARGET_HEALTHY})`);
    console.log("─".repeat(60));

    const healthyDir = resolve(DATASET_DIR, HEALTHY_CLASS);
    mkdirSync(healthyDir, { recursive: true });

    // Collect all unique plants from the disease info. Deduplicate on the
    // plant id itself: a Set of the info objects compares by identity and
    // would keep one entry per disease, repeating the same plant's queries.
    const allPlants = [...new Set([...diseaseInfo.values()].map((d) => d.plantId))];
    const allHealthyQueries: string[] = [];
    for (const plant of allPlants) {
      allHealthyQueries.push(...buildHealthyQueries(plant));
    }

    const healthySeen = new Set<string>(seenUrlsCache[HEALTHY_CLASS] ?? []);
    const healthyNeeded = TARGET_HEALTHY - countImagesInDir(healthyDir);
    const allUrls: string[] = [];

    // Try each source with up to 20 healthy queries
    const sources = [
      { name: "DDG", collector: collectImagesDuckDuckGo },
      { name: "iNat", collector: searchImagesInaturalist },
      { name: "Commons", collector: searchImagesCommons },
    ] as const;

    for (const source of sources) {
      if (allUrls.length >= healthyNeeded) break;
      console.log(`\n Source: ${source.name}`);

      for (const query of allHealthyQueries.slice(0, 20)) {
        if (allUrls.length >= healthyNeeded) break;

        process.stdout.write(` "${query}"... `);
        const result = await source.collector(query, healthyNeeded - allUrls.length, healthySeen);
        allUrls.push(...result.urls);
        console.log(`${result.urls.length} new`);
      }
    }

    if (allUrls.length > 0) {
      console.log(`\n Downloading ${allUrls.length} healthy images...`);
      const startIdx = countImagesInDir(healthyDir);
      const { downloaded } = await downloadBatch(allUrls, healthyDir, startIdx);

      const newTotal = countImagesInDir(healthyDir);
      const gained = newTotal - healthyCount;
      totalDownloaded += gained;

      console.log(
        ` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded} images.` +
          ` Total healthy: ${newTotal}/${TARGET_HEALTHY} (${gained} new)`,
      );
    } else {
      console.log(`\n ✗ No healthy images found`);
    }

    // Update seen-URLs cache
    seenUrlsCache[HEALTHY_CLASS] = Array.from(healthySeen);
    saveSeenUrlsCache(seenUrlsCache);
  }

  // ── Summary ──────────────────────────────────────────────────────────────
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  const mins = Math.floor(elapsed / 60);
  const hrs = Math.floor(mins / 60);

  // Final scan
  const finalScan = scanDataset();
  const finalCounts = [...finalScan.diseaseCounts.values()];
  const totalHave = finalCounts.reduce((s, c) => s + c, 0);
  const atTarget = finalCounts.filter((c) => c >= TARGET_PER_DISEASE).length;

  console.log("\n" + "=".repeat(60));
  console.log(" ✅ FILL COMPLETE");
  console.log("=".repeat(60));
  console.log(` Time: ${hrs}h ${mins % 60}m`);
  console.log(` Diseases at target: ${atTarget}/${diseaseInfo.size}`);
  console.log(` Total images: ${totalHave}`);
  console.log(` Healthy images: ${finalScan.healthyCount}/${TARGET_HEALTHY}`);
  console.log(` New downloads: ${totalDownloaded}`);
  console.log(` Dataset dir: ${DATASET_DIR}/`);

  await closeDb();
  console.log("=".repeat(60));
}
|
||||
|
||||
// Run the script; any uncaught failure is logged and ends the process with exit code 1.
main().catch((error: unknown) => {
  console.error("\nFatal error:", error);
  process.exit(1);
});
|
||||
@@ -4,10 +4,10 @@
|
||||
*
|
||||
* Collects a training dataset from DuckDuckGo, iNaturalist, and Wikimedia Commons.
|
||||
*
|
||||
* Targets (tiered by plant type):
|
||||
* - Core plants (houseplants + common garden): 100 images per disease
|
||||
* - Full set (all 11,498 DB diseases): 10 images per disease
|
||||
* - Healthy: 400 images
|
||||
* Target: Top 200 most common plant diseases (ranked by iNaturalist observation counts)
|
||||
* - 200 images per disease
|
||||
* - 200 healthy plant images
|
||||
* - Processes 5 diseases in parallel with 30 concurrent downloads each
|
||||
*
|
||||
* Sources (all free, no API keys):
|
||||
* 1. DB image_url — existing images already found
|
||||
@@ -42,66 +42,30 @@ try {
|
||||
|
||||
import { getDb, closeDb } from "@/lib/db/index";
|
||||
import { diseases } from "@/lib/db/schema";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// ─── Config ─────────────────────────────────────────────────────────────────
|
||||
|
||||
const DATASET_DIR = resolve(__dirname, "../data/dataset");
|
||||
const PROGRESS_FILE = resolve(DATASET_DIR, ".progress.json");
|
||||
|
||||
/** Target images per disease for CORE plants */
|
||||
const TARGET_CORE = 100;
|
||||
/** Target images per disease */
|
||||
const TARGET_PER_DISEASE = 200;
|
||||
|
||||
/** Target images per disease for the FULL set */
|
||||
const TARGET_FULL = 10;
|
||||
/** Number of diseases to target (most common first) */
|
||||
const TARGET_DISEASE_COUNT = 200;
|
||||
|
||||
/** Target images for the "healthy" class */
|
||||
const TARGET_HEALTHY = 400;
|
||||
|
||||
/** Core plants that get higher image targets */
|
||||
const CORE_PLANTS = new Set([
|
||||
// Houseplants
|
||||
"monstera",
|
||||
"pothos",
|
||||
"snake-plant",
|
||||
"peace-lily",
|
||||
"orchid",
|
||||
"succulent",
|
||||
"fiddle-leaf-fig",
|
||||
"aloe-vera",
|
||||
"cactus",
|
||||
"fern",
|
||||
// Garden plants
|
||||
"tomato",
|
||||
"basil",
|
||||
"rose",
|
||||
"pepper",
|
||||
"strawberry",
|
||||
"cucumber",
|
||||
"squash",
|
||||
"lettuce",
|
||||
"spinach",
|
||||
"cabbage",
|
||||
"lavender",
|
||||
"mint",
|
||||
"jasmine",
|
||||
"sunflower",
|
||||
"daisy",
|
||||
"zucchini",
|
||||
"bean",
|
||||
"eggplant",
|
||||
"chili",
|
||||
// General disease patterns
|
||||
"general",
|
||||
]);
|
||||
|
||||
/** Delay between DuckDuckGo search API calls (ms) */
|
||||
const SEARCH_DELAY = 1500;
|
||||
|
||||
/** Delay between image downloads (ms) */
|
||||
const DOWNLOAD_DELAY = 100;
|
||||
/** Max concurrent image downloads per disease */
|
||||
const CONCURRENT_DOWNLOADS = 30;
|
||||
|
||||
/** Max concurrent downloads */
|
||||
const CONCURRENT_DOWNLOADS = 10;
|
||||
/** Number of diseases to process in parallel */
|
||||
const DISEASE_CONCURRENCY = 5;
|
||||
|
||||
/** Minimum image size in bytes to accept */
|
||||
const MIN_IMAGE_SIZE = 10_000; // 10KB
|
||||
@@ -167,21 +131,246 @@ interface Progress {
|
||||
|
||||
// ─── DB Loading ──────────────────────────────────────────────────────────────
|
||||
|
||||
const INAT_CACHE_FILE = resolve(DATASET_DIR, ".inat-prevalence-cache.json");
|
||||
|
||||
/**
 * Query iNaturalist for real-world prevalence of a disease.
 *
 * Two searches are attempted, in order:
 *  1. `diseaseName + plantName`, research-grade observations only, restricted to
 *     the Plantae / Fungi / Chromista iconic taxa (only when `plantName` is given).
 *  2. `diseaseName` alone (lower-cased), all quality grades.
 *
 * @param diseaseName - Disease name used as the free-text search query.
 * @param plantName - Optional host-plant term appended to the first, more specific query.
 * @returns The `total_results` observation count (higher = more common). Returns 0
 *   when nothing is found, when the fallback response is not OK, or when any
 *   request throws (network error, timeout, invalid JSON).
 */
async function getInatPrevalence(diseaseName: string, plantName?: string): Promise<number> {
  const baseUrl = "https://api.inaturalist.org/v1/observations";

  try {
    const headers = { "User-Agent": UA, Accept: "application/json" };

    // Returns the observation count, or null when the HTTP response is not OK.
    // Every request gets its own 10s timeout: an AbortSignal.timeout() starts
    // counting when it is created, so sharing one signal would let the first
    // request consume the time budget of the second.
    const fetchTotal = async (url: string): Promise<number | null> => {
      const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) });
      if (!res.ok) return null;
      const data = (await res.json()) as { total_results?: number | null };
      return data.total_results ?? 0;
    };

    // Tier 1: disease + plant name, research-grade, Plantae/Fungi/Chromista.
    // The iconic taxa are selected by name; the numeric IDs used previously
    // (47126,47158,47686) appear to be Plantae, Insecta and Protozoa, which
    // does not match the intended kingdoms.
    if (plantName) {
      const q = `${diseaseName} ${plantName}`;
      const url =
        `${baseUrl}?q=${encodeURIComponent(q)}` +
        `&quality_grade=research` +
        `&iconic_taxa=Plantae,Fungi,Chromista` +
        `&photos_only=true&per_page=1`;
      const total = await fetchTotal(url);
      if (total !== null && total > 0) return total;
    }

    // Fallback: disease name only, all quality grades (original behavior)
    const url = `${baseUrl}?q=${encodeURIComponent(diseaseName.toLowerCase())}&photos_only=true&per_page=1`;
    return (await fetchTotal(url)) ?? 0;
  } catch {
    // Best-effort lookup: any failure is reported as "no observations".
    return 0;
  }
}
|
||||
|
||||
/**
 * Load prevalence data from cache or build it by querying iNaturalist.
 * Caches results to avoid re-querying on every run.
 *
 * The cache file (INAT_CACHE_FILE) is a JSON object keyed by lower-cased disease
 * name. Names missing from the cache are queried one at a time via
 * getInatPrevalence, and the cache is rewritten every 10 queries and once at the end.
 *
 * @param uniqueNames - Disease names to resolve; the returned map is keyed by these exact strings.
 * @param plantMap - Optional disease name -> host plant term, passed through to getInatPrevalence.
 * @returns Map of disease name -> observation count.
 */
async function loadPrevalenceData(
  uniqueNames: string[],
  plantMap?: Map<string, string>,
): Promise<Map<string, number>> {
  // Prototype-less object so names such as "constructor" or "__proto__" cannot
  // collide with inherited properties during the `in` lookup below.
  const cache: Record<string, number> = Object.create(null);

  if (existsSync(INAT_CACHE_FILE)) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(INAT_CACHE_FILE, "utf-8"));
      // Accept only a plain object with finite numeric values; anything else
      // (null, arrays, strings, hand-edited junk) is dropped and re-queried.
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        for (const [key, value] of Object.entries(parsed)) {
          if (typeof value === "number" && Number.isFinite(value)) cache[key] = value;
        }
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` ! Ignoring unreadable prevalence cache ${INAT_CACHE_FILE}: ${reason}`);
    }
  }

  const prevalenceMap = new Map<string, number>();
  const toQuery: string[] = [];

  // Check which names need querying
  for (const name of uniqueNames) {
    const key = name.toLowerCase();
    if (key in cache) {
      prevalenceMap.set(name, cache[key]);
    } else {
      toQuery.push(name);
    }
  }

  if (toQuery.length > 0) {
    console.log(`\n Querying iNaturalist for ${toQuery.length} disease prevalence scores...`);
    let queried = 0;

    for (const name of toQuery) {
      const count = await getInatPrevalence(name, plantMap?.get(name));
      cache[name.toLowerCase()] = count;
      prevalenceMap.set(name, count);
      queried++;

      // Save cache every 10 queries so an interrupted run keeps most of its work
      if (queried % 10 === 0) {
        writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
        console.log(` Queried ${queried}/${toQuery.length}...`);
      }

      // Rate limit: pause 600 ms between diseases (~100 iterations/min)
      await sleep(600);
    }

    // Final cache save
    writeFileSync(INAT_CACHE_FILE, JSON.stringify(cache, null, 2));
    console.log(` ✓ Queried ${queried} diseases, cached to ${INAT_CACHE_FILE}`);
  }

  return prevalenceMap;
}
|
||||
|
||||
/**
 * Write prevalence scores and the derived prevalence category to every disease row.
 *
 * Categories are derived from the distribution of the non-zero scores:
 *  - score 0                          -> "very_rare"
 *  - score >= 75th-percentile value   -> "common"
 *  - score >  25th-percentile value   -> "uncommon"
 *  - otherwise                        -> "rare"
 * When there are no non-zero scores the thresholds fall back to 1000 and 10.
 *
 * @param db - Database handle returned by getDb().
 * @param prevalenceMap - Disease name -> observation count; names missing from the map count as 0.
 */
async function persistPrevalenceData(
  db: ReturnType<typeof getDb>,
  prevalenceMap: Map<string, number>,
): Promise<void> {
  // Every disease row gets updated, so fetch just the id and the lookup key.
  const allDiseases = await db.select({ id: diseases.id, name: diseases.name }).from(diseases);

  // Ascending list of non-zero scores, used to read off the percentile values.
  const ascending = [...prevalenceMap.values()].filter((value) => value > 0);
  ascending.sort((lo, hi) => lo - hi);
  const n = ascending.length;

  let commonThreshold = 1000;
  let rareThreshold = 10;
  if (n > 0) {
    commonThreshold = ascending[Math.floor(n * 0.75)];
    rareThreshold = ascending[Math.floor(n * 0.25)];
  }

  // A score of 0 means no iNaturalist observations were found.
  const classify = (score: number): "common" | "uncommon" | "rare" | "very_rare" => {
    if (score === 0) return "very_rare";
    if (score >= commonThreshold) return "common";
    return score > rareThreshold ? "uncommon" : "rare";
  };

  console.log(
    `\n Prevalence distribution: ${n} non-zero scores` +
      `, p25=${rareThreshold.toLocaleString()}` +
      `, p75=${commonThreshold.toLocaleString()}`,
  );
  console.log(` Persisting prevalence data for ${allDiseases.length} diseases...`);

  let updated = 0;
  for (const { id, name } of allDiseases) {
    const score = prevalenceMap.get(name) ?? 0;
    const prevalence = classify(score);

    await db
      .update(diseases)
      .set({
        prevalenceScore: score,
        prevalence,
        updatedAt: sql`(datetime('now'))`,
      })
      .where(sql`${diseases.id} = ${id}`);

    updated += 1;
    // Progress line every 100 rows
    if (updated % 100 === 0) {
      console.log(` Updated ${updated}/${allDiseases.length}...`);
    }
  }

  console.log(` ✓ Updated ${updated} diseases with prevalence data`);
}
|
||||
|
||||
/**
 * Load the TARGET_DISEASE_COUNT most common diseases from the database.
 * Ranks by iNaturalist observation counts (real-world prevalence data), using
 * how often a disease name occurs in the database as the tiebreaker.
 *
 * Side effects: fetches/caches prevalence via loadPrevalenceData and writes the
 * scores back to the database via persistPrevalenceData before ranking.
 *
 * @returns Up to TARGET_DISEASE_COUNT disease rows (id, plantId, name, imageUrl),
 *   most prevalent first.
 */
async function loadDiseasesFromDb(): Promise<DbDisease[]> {
  const db = getDb();

  // Get unique disease names and their most common host plant for better iNaturalist queries.
  const nameStats = await db
    .select({
      name: diseases.name,
      plantId: diseases.plantId,
      count: sql<number>`COUNT(*)`.mapWith(Number),
    })
    .from(diseases)
    .groupBy(diseases.name, diseases.plantId);

  // Aggregate: name frequency (across all plants) and per-plant counts per name.
  // The keys of nameFrequency double as the list of unique names (first-seen order).
  const nameFrequency = new Map<string, number>();
  const plantFreq = new Map<string, Map<string, number>>();
  let totalDiseases = 0;

  for (const row of nameStats) {
    nameFrequency.set(row.name, (nameFrequency.get(row.name) ?? 0) + row.count);
    totalDiseases += row.count;

    let perPlant = plantFreq.get(row.name);
    if (!perPlant) {
      perPlant = new Map();
      plantFreq.set(row.name, perPlant);
    }
    perPlant.set(row.plantId, row.count);
  }

  const uniqueNames = [...nameFrequency.keys()];

  // For each disease name, pick the most frequent host plant for more specific iNaturalist queries.
  // Only a strictly larger count replaces the current pick, so ties keep the first plant seen.
  const plantMap = new Map<string, string>();
  for (const [name, perPlant] of plantFreq) {
    let topPlant: string | undefined;
    let topCount = -Infinity;
    for (const [plantId, count] of perPlant) {
      if (count > topCount) {
        topPlant = plantId;
        topCount = count;
      }
    }
    if (topPlant !== undefined) plantMap.set(name, topPlant);
  }

  console.log(
    ` Found ${uniqueNames.length} unique disease names across ${totalDiseases} diseases`,
  );

  // Load or build prevalence data from iNaturalist (with plant context for better queries)
  const prevalenceMap = await loadPrevalenceData(uniqueNames, plantMap);

  // Persist prevalence scores to database
  await persistPrevalenceData(db, prevalenceMap);

  // Load all diseases
  const allDiseases = await db
    .select({
      id: diseases.id,
      plantId: diseases.plantId,
      name: diseases.name,
      imageUrl: diseases.imageUrl,
    })
    .from(diseases);

  // Sort by iNaturalist prevalence (descending), then by name frequency as tiebreaker
  allDiseases.sort((a, b) => {
    const prevA = prevalenceMap.get(a.name) ?? 0;
    const prevB = prevalenceMap.get(b.name) ?? 0;
    if (prevA !== prevB) return prevB - prevA;
    return (nameFrequency.get(b.name) ?? 0) - (nameFrequency.get(a.name) ?? 0);
  });

  // Return top TARGET_DISEASE_COUNT
  return allDiseases.slice(0, TARGET_DISEASE_COUNT);
}
|
||||
|
||||
// ─── DuckDuckGo API ─────────────────────────────────────────────────────────
|
||||
@@ -208,7 +397,9 @@ async function searchImagesDuckDuckGo(
|
||||
vqd: string,
|
||||
page: number,
|
||||
): Promise<DuckDuckGoImageResult[]> {
|
||||
const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(query)}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
|
||||
const url = `https://duckduckgo.com/i.js?q=${encodeURIComponent(
|
||||
query,
|
||||
)}&vqd=${vqd}&o=json&p=${page}&f=,,,`;
|
||||
|
||||
const res = await fetch(url, {
|
||||
headers: {
|
||||
@@ -396,7 +587,9 @@ async function searchImagesCommons(
|
||||
for (const hit of hits) {
|
||||
if (results.length >= target) break;
|
||||
const filename = hit.title.replace(/^File:/, "");
|
||||
const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(filename)}`;
|
||||
const imgUrl = `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(
|
||||
filename,
|
||||
)}`;
|
||||
if (seenUrls.has(imgUrl)) continue;
|
||||
seenUrls.add(imgUrl);
|
||||
results.push(imgUrl);
|
||||
@@ -461,7 +654,6 @@ async function downloadBatch(
|
||||
const paddedIndex = String(index).padStart(4, "0");
|
||||
const destPath = resolve(classDir, `img_${paddedIndex}.jpg`);
|
||||
const success = await downloadImage(url, destPath);
|
||||
await sleep(DOWNLOAD_DELAY);
|
||||
return { success, index: index++, url: url.substring(0, 50) };
|
||||
}),
|
||||
);
|
||||
@@ -496,19 +688,36 @@ function loadProgress(): Progress {
|
||||
}
|
||||
try {
|
||||
const raw = JSON.parse(readFileSync(PROGRESS_FILE, "utf-8")) as Partial<Progress>;
|
||||
// Backward compat: ensure new fields exist
|
||||
raw.phase ??= 0;
|
||||
raw.phaseIndex ??= 0;
|
||||
raw.classes ??= {};
|
||||
|
||||
// Migration: detect old tiered system (phaseIndex > 200 means it's from old core/full system)
|
||||
const isOldFormat = (raw.phaseIndex ?? 0) > 200 || !raw.phase;
|
||||
if (isOldFormat) {
|
||||
console.warn(" ↻ Migrating progress file from old tiered system to new format");
|
||||
console.warn(" Phase checkpoint reset to 0 (will re-scan all 200 diseases)");
|
||||
console.warn(" Per-class progress (seenUrls, counts) preserved");
|
||||
raw.phase = 0;
|
||||
raw.phaseIndex = 0;
|
||||
} else {
|
||||
raw.phase ??= 0;
|
||||
raw.phaseIndex ??= 0;
|
||||
}
|
||||
|
||||
// Ensure each class has the sources field
|
||||
for (const key of Object.keys(raw.classes)) {
|
||||
const cp = raw.classes[key] as Partial<ClassProgress>;
|
||||
cp.sources ??= {
|
||||
db: { exhausted: false },
|
||||
duckduckgo: { exhausted: false },
|
||||
inaturalist: { exhausted: false },
|
||||
wikimedia: { exhausted: false },
|
||||
};
|
||||
|
||||
// Migrate class-level exhausted to per-source exhausted if needed
|
||||
if (!cp.sources) {
|
||||
const classExhausted = cp.exhausted ?? false;
|
||||
cp.sources = {
|
||||
db: { exhausted: classExhausted },
|
||||
duckduckgo: { exhausted: classExhausted },
|
||||
inaturalist: { exhausted: classExhausted },
|
||||
wikimedia: { exhausted: classExhausted },
|
||||
};
|
||||
}
|
||||
|
||||
cp.seenUrls ??= [];
|
||||
}
|
||||
return raw as Progress;
|
||||
@@ -608,7 +817,6 @@ async function collectClassImages(
|
||||
progress: Progress,
|
||||
classDir: string,
|
||||
existingUrls: string[] = [],
|
||||
fastMode = false, // Skip slow DuckDuckGo, use iNat + Commons only
|
||||
): Promise<void> {
|
||||
const cp = getClassProgress(progress, classId);
|
||||
|
||||
@@ -664,7 +872,7 @@ async function collectClassImages(
|
||||
}
|
||||
|
||||
// ── Source 1: DuckDuckGo ──────────────────────────────────────────────
|
||||
if (!fastMode && !sources.duckduckgo.exhausted && allUrls.length < needed) {
|
||||
if (!sources.duckduckgo.exhausted && allUrls.length < needed) {
|
||||
for (const query of queries) {
|
||||
if (allUrls.length >= needed) break;
|
||||
process.stdout.write(` DDG: "${query.substring(0, 40)}"... `);
|
||||
@@ -753,7 +961,9 @@ async function collectClassImages(
|
||||
|
||||
const pct = Math.round((cp.count / target) * 100);
|
||||
console.log(
|
||||
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${allUrls.length} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
|
||||
` ${downloaded > 0 ? "✓" : "✗"} Got ${downloaded}/${
|
||||
allUrls.length
|
||||
} (${failed} failed). Total: ${cp.count}/${target} (${pct}%)`,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -761,21 +971,18 @@ async function collectClassImages(
|
||||
|
||||
async function main() {
|
||||
console.log("=".repeat(60));
|
||||
console.log("PLANT DISEASE DATASET COLLECTOR — FULL DB");
|
||||
console.log("PLANT DISEASE DATASET COLLECTOR — TOP 200 COMMON DISEASES");
|
||||
console.log("=".repeat(60));
|
||||
|
||||
// Ensure dataset directory exists before any cache writes
|
||||
mkdirSync(DATASET_DIR, { recursive: true });
|
||||
|
||||
// Load diseases from DB
|
||||
console.log("\nLoading diseases from database...");
|
||||
console.log("\nLoading top 200 most common diseases from database...");
|
||||
const dbDiseases = await loadDiseasesFromDb();
|
||||
console.log(` ${dbDiseases.length} diseases loaded`);
|
||||
|
||||
const coreDiseases = dbDiseases.filter((d) => CORE_PLANTS.has(d.plantId));
|
||||
const fullDiseases = dbDiseases.filter((d) => !CORE_PLANTS.has(d.plantId));
|
||||
console.log(` Core plants: ${coreDiseases.length} diseases (target: ${TARGET_CORE})`);
|
||||
console.log(` Full set: ${fullDiseases.length} diseases (target: ${TARGET_FULL})`);
|
||||
|
||||
// Load progress
|
||||
mkdirSync(DATASET_DIR, { recursive: true });
|
||||
const progress = loadProgress();
|
||||
|
||||
// If all phases complete, exit early
|
||||
@@ -787,63 +994,57 @@ async function main() {
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// ── Phase 1: Core set ──────────────────────────────────────────────────
|
||||
// ── Phase 1: Common diseases (200 images each) ──────────────────────────
|
||||
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log("PHASE 1: Core Diseases (100 images each)");
|
||||
console.log("PHASE 1: Common Diseases (200 images each)");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
const coreStart = progress.phase === 0 ? progress.phaseIndex : 0;
|
||||
if (coreStart > 0) {
|
||||
const diseaseStart = progress.phase === 0 ? progress.phaseIndex : 0;
|
||||
if (diseaseStart > 0) {
|
||||
console.log(
|
||||
` Resuming from disease #${coreStart + 1} (${((coreStart / coreDiseases.length) * 100).toFixed(0)}% done)`,
|
||||
` Resuming from disease #${diseaseStart + 1} (${(
|
||||
(diseaseStart / dbDiseases.length) *
|
||||
100
|
||||
).toFixed(0)}% done)`,
|
||||
);
|
||||
}
|
||||
|
||||
for (let i = coreStart; i < coreDiseases.length; i++) {
|
||||
const d = coreDiseases[i];
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
|
||||
// Process diseases in parallel batches
|
||||
for (let i = diseaseStart; i < dbDiseases.length; i += DISEASE_CONCURRENCY) {
|
||||
const batch = dbDiseases.slice(i, i + DISEASE_CONCURRENCY);
|
||||
const batchNum = Math.floor(i / DISEASE_CONCURRENCY) + 1;
|
||||
const totalBatches = Math.ceil(dbDiseases.length / DISEASE_CONCURRENCY);
|
||||
const pct = Math.round((i / dbDiseases.length) * 100);
|
||||
|
||||
const pct = Math.round((i / coreDiseases.length) * 100);
|
||||
console.log(`\n[${i + 1}/${coreDiseases.length}] (${pct}%) ${d.name || d.id} (${d.plantId})`);
|
||||
console.log(
|
||||
`\n[Batch ${batchNum}/${totalBatches}] (${pct}%) Processing ${batch.length} diseases in parallel...`,
|
||||
);
|
||||
|
||||
await collectClassImages(d.id, queries, TARGET_CORE, progress, classDir, existingUrls);
|
||||
// Process all diseases in this batch concurrently
|
||||
await Promise.all(
|
||||
batch.map(async (d, batchIdx) => {
|
||||
const diseaseIdx = i + batchIdx;
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
|
||||
|
||||
// Save checkpoint: phase 0, at index i
|
||||
console.log(` [${diseaseIdx + 1}/${dbDiseases.length}] ${d.name || d.id} (${d.plantId})`);
|
||||
|
||||
await collectClassImages(
|
||||
d.id,
|
||||
queries,
|
||||
TARGET_PER_DISEASE,
|
||||
progress,
|
||||
classDir,
|
||||
existingUrls,
|
||||
);
|
||||
}),
|
||||
);
|
||||
|
||||
// Save checkpoint: phase 0, at index i + batch.length
|
||||
progress.phase = 0;
|
||||
progress.phaseIndex = i + 1;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
// ── Phase 2: Full set ──────────────────────────────────────────────────
|
||||
|
||||
console.log("\n" + "─".repeat(60));
|
||||
console.log("PHASE 2: Full Disease Set (10 images each)");
|
||||
console.log("─".repeat(60));
|
||||
|
||||
const fullStart = progress.phase === 1 ? progress.phaseIndex : 0;
|
||||
if (fullStart > 0) {
|
||||
console.log(
|
||||
` Resuming from disease #${fullStart + 1} (${((fullStart / fullDiseases.length) * 100).toFixed(0)}% done)`,
|
||||
);
|
||||
}
|
||||
|
||||
for (let i = fullStart; i < fullDiseases.length; i++) {
|
||||
const d = fullDiseases[i];
|
||||
const classDir = resolve(DATASET_DIR, d.id);
|
||||
const queries = buildSearchQueries(d);
|
||||
const existingUrls = d.imageUrl ? [d.imageUrl] : [];
|
||||
|
||||
const pct = Math.round((i / fullDiseases.length) * 100);
|
||||
console.log(`\n[${i + 1}/${fullDiseases.length}] (${pct}%) ${d.id} (${d.plantId})`);
|
||||
|
||||
await collectClassImages(d.id, queries, TARGET_FULL, progress, classDir, existingUrls, true);
|
||||
|
||||
// Save checkpoint: phase 1, at index i
|
||||
progress.phase = 1;
|
||||
progress.phaseIndex = i + 1;
|
||||
progress.phaseIndex = i + batch.length;
|
||||
saveProgress(progress);
|
||||
}
|
||||
|
||||
|
||||
@@ -272,18 +272,22 @@ function PrevalenceBadge({ prevalence }: { prevalence: Prevalence }) {
|
||||
common: "📊",
|
||||
uncommon: "📋",
|
||||
rare: "📌",
|
||||
very_rare: "🔍",
|
||||
};
|
||||
const colors: Record<Prevalence, string> = {
|
||||
common: "bg-emerald-100 text-emerald-800 dark:bg-emerald-900/40 dark:text-emerald-300",
|
||||
uncommon: "bg-zinc-100 text-zinc-700 dark:bg-zinc-800/60 dark:text-zinc-300",
|
||||
rare: "bg-amber-100 text-amber-800 dark:bg-amber-900/40 dark:text-amber-300",
|
||||
very_rare: "bg-red-100 text-red-800 dark:bg-red-900/40 dark:text-red-300",
|
||||
};
|
||||
|
||||
const label = prevalence.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
|
||||
|
||||
return (
|
||||
<span
|
||||
className={`inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium ${colors[prevalence]}`}
|
||||
>
|
||||
{icons[prevalence]} {prevalence.charAt(0).toUpperCase() + prevalence.slice(1)}
|
||||
{icons[prevalence]} {label}
|
||||
</span>
|
||||
);
|
||||
}
|
||||
@@ -298,9 +302,10 @@ const SEVERITY_RANK: Record<Severity, number> = {
|
||||
};
|
||||
|
||||
const PREVALENCE_RANK: Record<Prevalence, number> = {
|
||||
common: 3,
|
||||
uncommon: 2,
|
||||
rare: 1,
|
||||
common: 4,
|
||||
uncommon: 3,
|
||||
rare: 2,
|
||||
very_rare: 1,
|
||||
};
|
||||
|
||||
type SortField = "prevalence" | "danger";
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
* for the browse page. Runs server-side only.
|
||||
*/
|
||||
|
||||
import { sql, eq } from "drizzle-orm";
|
||||
import { sql, eq, inArray, notInArray } from "drizzle-orm";
|
||||
import { getDb } from "@/lib/db/index";
|
||||
import { plants, diseases, plantViews } from "@/lib/db/schema";
|
||||
import type { PlantCardData } from "@/components/PlantCard";
|
||||
@@ -12,11 +12,13 @@ export type { PlantCardData };
|
||||
|
||||
/**
|
||||
* Get all plants with their disease counts for the browse page.
|
||||
*
|
||||
* Uses scalar subqueries for COUNT to avoid expensive LEFT JOIN + GROUP BY
|
||||
* on the large diseases table (11,498 rows).
|
||||
*/
|
||||
export async function getBrowsePlants(): Promise<PlantCardData[]> {
|
||||
const db = getDb();
|
||||
|
||||
// LEFT JOIN to include plants with zero diseases
|
||||
const rows = await db
|
||||
.select({
|
||||
id: plants.id,
|
||||
@@ -27,12 +29,10 @@ export async function getBrowsePlants(): Promise<PlantCardData[]> {
|
||||
imageUrl: plants.imageUrl,
|
||||
updatedAt: plants.updatedAt,
|
||||
viewCount: sql<number>`COALESCE(${plantViews.viewCount}, 0)`,
|
||||
diseaseCount: sql<number>`COUNT(${diseases.id})`,
|
||||
diseaseCount: sql<number>`(SELECT COUNT(*) FROM ${diseases} WHERE ${diseases.plantId} = ${plants.id})`,
|
||||
})
|
||||
.from(plants)
|
||||
.leftJoin(diseases, eq(diseases.plantId, plants.id))
|
||||
.leftJoin(plantViews, eq(plantViews.plantId, plants.id))
|
||||
.groupBy(plants.id)
|
||||
.orderBy(plants.commonName);
|
||||
|
||||
return rows.map((r) => ({
|
||||
@@ -61,12 +61,10 @@ export async function getBrowsePlant(id: string): Promise<PlantCardData | null>
|
||||
family: plants.family,
|
||||
category: plants.category,
|
||||
imageUrl: plants.imageUrl,
|
||||
diseaseCount: sql<number>`COUNT(${diseases.id})`,
|
||||
diseaseCount: sql<number>`(SELECT COUNT(*) FROM ${diseases} WHERE ${diseases.plantId} = ${plants.id})`,
|
||||
})
|
||||
.from(plants)
|
||||
.leftJoin(diseases, eq(diseases.plantId, plants.id))
|
||||
.where(eq(plants.id, id))
|
||||
.groupBy(plants.id)
|
||||
.limit(1);
|
||||
|
||||
return rows[0] ?? null;
|
||||
@@ -91,12 +89,47 @@ const FEATURED_IDS = [
|
||||
];
|
||||
|
||||
export async function getFeaturedPlants(): Promise<PlantCardData[]> {
|
||||
const all = await getBrowsePlants();
|
||||
const featured = all.filter((p) => FEATURED_IDS.includes(p.id));
|
||||
// If fewer than expected are found, pad with first available plants
|
||||
if (featured.length < 6) {
|
||||
const rest = all.filter((p) => !FEATURED_IDS.includes(p.id));
|
||||
return [...featured, ...rest].slice(0, 12);
|
||||
const db = getDb();
|
||||
|
||||
const selectFeatured = db
|
||||
.select({
|
||||
id: plants.id,
|
||||
commonName: plants.commonName,
|
||||
scientificName: plants.scientificName,
|
||||
family: plants.family,
|
||||
category: plants.category,
|
||||
imageUrl: plants.imageUrl,
|
||||
updatedAt: plants.updatedAt,
|
||||
viewCount: sql<number>`COALESCE(${plantViews.viewCount}, 0)`,
|
||||
diseaseCount: sql<number>`(SELECT COUNT(*) FROM ${diseases} WHERE ${diseases.plantId} = ${plants.id})`,
|
||||
})
|
||||
.from(plants)
|
||||
.leftJoin(plantViews, eq(plantViews.plantId, plants.id));
|
||||
|
||||
const rows = await selectFeatured
|
||||
.where(inArray(plants.id, FEATURED_IDS))
|
||||
.orderBy(plants.commonName);
|
||||
|
||||
if (rows.length < 6) {
|
||||
const padRows = await selectFeatured
|
||||
.where(notInArray(plants.id, FEATURED_IDS))
|
||||
.orderBy(plants.commonName)
|
||||
.limit(12 - rows.length);
|
||||
return [...rows, ...padRows].map(mapRow);
|
||||
}
|
||||
return featured.slice(0, 12);
|
||||
return rows.slice(0, 12).map(mapRow);
|
||||
}
|
||||
|
||||
/**
 * Convert a loosely-typed query result row into PlantCardData.
 *
 * Fields are copied by name and cast to the card's field types; no runtime
 * validation or conversion is performed, and properties not listed here are dropped.
 */
function mapRow(r: Record<string, unknown>): PlantCardData {
  const {
    id,
    commonName,
    scientificName,
    family,
    category,
    imageUrl,
    updatedAt,
    viewCount,
    diseaseCount,
  } = r;

  return {
    id: id as string,
    commonName: commonName as string,
    scientificName: scientificName as string,
    family: family as string,
    category: category as string,
    imageUrl: imageUrl as string,
    updatedAt: updatedAt as string | undefined,
    viewCount: viewCount as number,
    diseaseCount: diseaseCount as number,
  };
}
|
||||
|
||||
@@ -280,7 +280,7 @@ export async function validateKnowledgeBase(): Promise<string[]> {
|
||||
"environmental",
|
||||
];
|
||||
const validSeverities: Severity[] = ["low", "moderate", "high", "critical"];
|
||||
const validPrevalences: Prevalence[] = ["common", "uncommon", "rare"];
|
||||
const validPrevalences: Prevalence[] = ["common", "uncommon", "rare", "very_rare"];
|
||||
|
||||
const db = getDb();
|
||||
|
||||
|
||||
@@ -55,10 +55,11 @@ export const diseases = sqliteTable(
|
||||
prevention: text("prevention", { mode: "json" }).notNull().default([]).$type<string[]>(),
|
||||
lookalikeIds: text("lookalike_ids", { mode: "json" }).notNull().default([]).$type<string[]>(),
|
||||
prevalence: text("prevalence", {
|
||||
enum: ["common", "uncommon", "rare"],
|
||||
enum: ["common", "uncommon", "rare", "very_rare"],
|
||||
})
|
||||
.notNull()
|
||||
.default("uncommon"),
|
||||
prevalenceScore: integer("prevalence_score").notNull().default(0),
|
||||
severity: text("severity", {
|
||||
enum: ["low", "moderate", "high", "critical"],
|
||||
}).notNull(),
|
||||
|
||||
@@ -10,7 +10,7 @@ export type CausalAgentType = "fungal" | "bacterial" | "viral" | "environmental"
|
||||
export type Severity = "low" | "moderate" | "high" | "critical";
|
||||
|
||||
/** How common/prevalent a disease is in the field */
|
||||
export type Prevalence = "common" | "uncommon" | "rare";
|
||||
export type Prevalence = "common" | "uncommon" | "rare" | "very_rare";
|
||||
|
||||
/** Plant category for grouping and filtering */
|
||||
export type PlantCategory =
|
||||
|
||||
Reference in New Issue
Block a user