Files
plant-disease-id/scripts/audit-dataset.py

179 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""Audit the dataset: parse dir names, cross-ref KB, gather stats."""
import json, os, sys
from collections import Counter
# Project root: two directory levels above this script's own location.
BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Load the knowledge base (KB). The files are decoded explicitly as UTF-8 so
# the result does not depend on the platform's default locale encoding.
with open(os.path.join(BASE, 'src', 'data', 'plants.json'), encoding='utf-8') as f:
    plants = json.load(f)
with open(os.path.join(BASE, 'src', 'data', 'diseases.json'), encoding='utf-8') as f:
    diseases = json.load(f)

# Every KB entry is identified by its 'id' field.
plant_ids = {p['id'] for p in plants}
disease_ids = {d['id'] for d in diseases}

# Longest IDs first, so that when several IDs are suffixes of the same
# directory name the most specific (longest) one is tried first.
sorted_disease_ids = sorted(disease_ids, key=len, reverse=True)
# Collect the class directories of the dataset, in sorted order.
# Entries whose name starts with '.' and entries that are not directories
# are ignored.
dataset_dir = os.path.join(BASE, 'data', 'dataset')
dirs = []
for entry_name in os.listdir(dataset_dir):
    if entry_name.startswith('.'):
        continue
    if os.path.isdir(os.path.join(dataset_dir, entry_name)):
        dirs.append(entry_name)
dirs.sort()
print(f"Total dataset directories: {len(dirs)}")
# Strategy 1: split each directory name into (plant, disease).
#   1. Prefer a known plant ID followed by '-' as prefix (longest IDs first);
#      whatever follows the prefix is taken as the disease label.
#   2. Otherwise fall back to a known KB disease ID preceded by '-' as suffix
#      (longest IDs first); whatever precedes it is taken as the plant.
#   3. Names matching neither are collected in `unmatched_names`.
sorted_plant_ids = sorted(plant_ids, key=len, reverse=True)
parsed = []
unmatched_names = []
for dir_name in dirs:
    plant_hit = next(
        (candidate for candidate in sorted_plant_ids
         if dir_name.startswith(candidate + '-')),
        None,
    )
    if plant_hit is not None:
        # Drop "<plant>-" from the front; the rest is the disease label.
        parsed.append((plant_hit, dir_name[len(plant_hit) + 1:]))
        continue

    disease_hit = next(
        (candidate for candidate in sorted_disease_ids
         if dir_name.endswith('-' + candidate)),
        None,
    )
    if disease_hit is not None:
        # Drop "-<disease>" from the end; the rest is the plant name.
        parsed.append((dir_name[:-(len(disease_hit) + 1)], disease_hit))
        continue

    unmatched_names.append(dir_name)

print(f"Parsed: {len(parsed)}")
print(f"Unmatched: {len(unmatched_names)}")
if unmatched_names:
    print(f"First 20 unmatched: {unmatched_names[:20]}")
# Heuristic for unmatched names: the trailing hyphen-separated words are
# probably the disease, but how many words is unknown. Count every trailing
# suffix of 1 to 6 words so the most frequent candidates can be inspected.
print(f"\n=== Analyzing unmatched dir names ===")
suffix_counts = Counter()
for d in unmatched_names:
    parts = d.split('-')
    # The upper bound len(parts) - 1 always leaves at least one leading word
    # for the plant; at most 6 trailing words are considered.
    for n in range(1, min(7, len(parts))):
        suffix_counts['-'.join(parts[-n:])] += 1

# Rank once, then filter by word count BEFORE truncating. Truncating the
# mixed-length ranking first would let suffixes of other lengths crowd the
# requested ones out of the listing.
ranked_suffixes = suffix_counts.most_common()
one_word_suffixes = [(suf, cnt) for suf, cnt in ranked_suffixes
                     if len(suf.split('-')) == 1]
two_word_suffixes = [(suf, cnt) for suf, cnt in ranked_suffixes
                     if len(suf.split('-')) == 2]

print(f"Most common last-1-word suffixes (potential single-word diseases):")
for suf, cnt in one_word_suffixes[:30]:
    print(f" {suf}: {cnt}")
print(f"\nMost common last-2-word suffixes:")
for suf, cnt in two_word_suffixes[:20]:
    print(f" {suf}: {cnt}")
# Build a disease dictionary from the dataset itself: tally every plant and
# disease label produced by the first parsing pass. `parsed` holds both the
# plant-prefix matches and the KB disease-suffix matches.
dataset_diseases = Counter()
dataset_plants = Counter()
for plant, disease in parsed:
    dataset_diseases[disease] += 1
    dataset_plants[plant] += 1

print(f"\n=== Dataset disease labels (from plant-prefix matches) ===")
# most_common() lists labels by descending count; ties keep first-seen order.
for disease, cnt in dataset_diseases.most_common():
    # Flag whether the label is a disease ID known to the KB. Both branches
    # previously produced an empty string, so the cross-reference was invisible.
    in_kb = "[in KB]" if disease in disease_ids else "[NOT in KB]"
    print(f" {disease}: {cnt}x {in_kb}")
# Second pass over ALL directories, using everything learned so far:
# the candidate disease labels are the KB disease IDs, every disease label
# seen in the first pass, and the literal label 'healthy'.
known_diseases = {*disease_ids, *dataset_diseases, 'healthy'}
sorted_known_diseases = sorted(known_diseases, key=len, reverse=True)

print(f"\n=== Comprehensive parsing ===")
reparsed = []
still_unmatched = []
for dir_name in dirs:
    # Longest candidates are tried first; the first "-<disease>" suffix wins.
    label = next(
        (candidate for candidate in sorted_known_diseases
         if dir_name.endswith('-' + candidate)),
        None,
    )
    if label is None:
        still_unmatched.append(dir_name)
    else:
        # Everything before "-<disease>" is taken as the plant name.
        # NOTE: the plant part is not checked against the known plant IDs.
        reparsed.append((dir_name[:-(len(label) + 1)], label))

print(f"Re-parsed: {len(reparsed)}")
print(f"Still unmatched: {len(still_unmatched)}")
if still_unmatched:
    for leftover in still_unmatched[:20]:
        print(f" {leftover}")
# Summary of the second pass: distinct plants and distinct disease labels.
unique_plants = sorted({plant_name for plant_name, _ in reparsed})
unique_diseases = sorted({disease_name for _, disease_name in reparsed})
print(f"\nUnique plants: {len(unique_plants)}")
print(f"Unique diseases: {len(unique_diseases)}")

# Number of classes (dataset directories) attributed to each plant.
plant_class_counts = Counter()
for plant_name, _ in reparsed:
    plant_class_counts[plant_name] += 1

print(f"\nTop 25 plants by class count:")
for plant_name, n_classes in plant_class_counts.most_common(25):
    print(f" {plant_name}: {n_classes}")
# Count the regular files directly inside every class directory.
# Every regular file is counted; there is no filtering by file extension.
print(f"\n=== Image counts (full scan) ===")
total_images = 0
class_sizes = Counter()
for d in dirs:
    # scandir yields the entry type along with the name, avoiding a separate
    # path join and stat call per file.
    with os.scandir(os.path.join(dataset_dir, d)) as entries:
        file_count = sum(1 for entry in entries if entry.is_file())
    class_sizes[d] = file_count
    total_images += file_count

size_vals = list(class_sizes.values())
print(f"Total images: {total_images:,}")
print(f"Classes: {len(class_sizes)}")
if size_vals:
    ordered = sorted(size_vals)
    mid = len(ordered) // 2
    if len(ordered) % 2:
        median = ordered[mid]
    else:
        # Even number of classes: the median is the mean of the two middle
        # values. Stay with an int whenever that mean is a whole number.
        pair_sum = ordered[mid - 1] + ordered[mid]
        median = pair_sum // 2 if pair_sum % 2 == 0 else pair_sum / 2
    print(f"Min/class: {ordered[0]}, Max/class: {ordered[-1]}")
    print(f"Mean/class: {sum(size_vals)/len(size_vals):.0f}")
    print(f"Median/class: {median}")
else:
    # With no class directories min()/max() and the mean would raise.
    print("No class directories found; per-class statistics skipped.")
# Images per plant: map each directory name back to its (plant, disease)
# split. Each second-pass entry was obtained by cutting "-<disease>" off the
# end of a directory name, so "<plant>-<disease>" rebuilds that name exactly.
dir_to_class = {
    f"{plant}-{disease}": (plant, disease)
    for plant, disease in reparsed
}

# Sum the per-directory file counts by plant. Directories the second pass
# could not split have no entry in the lookup and are left out of the totals.
plant_image_totals = Counter()
for d, size in class_sizes.items():
    split = dir_to_class.get(d)
    if split is not None:
        plant_image_totals[split[0]] += size

print(f"\nTop 15 plants by total images:")
for plant, cnt in plant_image_totals.most_common(15):
    print(f" {plant}: {cnt:,}")