plant-disease-id/scripts/audit-dataset.py

#!/usr/bin/env python3
"""Audit the dataset: parse dir names, cross-ref KB, gather stats."""

import json, os, sys
from collections import Counter

# Project root: two directory levels above this file.
BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Load the knowledge base (KB). JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
plants_path = os.path.join(BASE, 'src', 'data', 'plants.json')
with open(plants_path, encoding='utf-8') as f:
    plants = json.load(f)
diseases_path = os.path.join(BASE, 'src', 'data', 'diseases.json')
with open(diseases_path, encoding='utf-8') as f:
    diseases = json.load(f)

# Every KB entry is indexed by its 'id' key.
plant_ids = {p['id'] for p in plants}
disease_ids = {d['id'] for d in diseases}
# Longest IDs first, so the most specific ID is tried before shorter ones.
sorted_disease_ids = sorted(disease_ids, key=len, reverse=True)

# Collect the entries of data/dataset that are directories, ignoring any
# whose name starts with a dot. The result is in sorted order.
dataset_dir = os.path.join(BASE, 'data', 'dataset')
dirs = []
for entry in sorted(os.listdir(dataset_dir)):
    if entry.startswith('.'):
        continue
    if os.path.isdir(os.path.join(dataset_dir, entry)):
        dirs.append(entry)

print(f"Total dataset directories: {len(dirs)}")

# Strategy 1: split each directory name into (plant, disease) using the KB.
# Candidate IDs are tried longest-first, so a longer ID wins over a shorter
# one that would also match the same name.
sorted_plant_ids = sorted(plant_ids, key=len, reverse=True)

parsed = []
unmatched_names = []
for d in dirs:
    # First choice: the name starts with "<known plant id>-"; whatever
    # follows the hyphen is taken as the disease label.
    for pid in sorted_plant_ids:
        if d.startswith(pid + '-'):
            parsed.append((pid, d[len(pid) + 1:]))
            break
    else:
        # Fallback: the name ends with "-<known disease id>"; whatever
        # precedes the hyphen is taken as the plant label.
        for did in sorted_disease_ids:
            if d.endswith('-' + did):
                parsed.append((d[:-(len(did) + 1)], did))
                break
        else:
            unmatched_names.append(d)

print(f"Parsed: {len(parsed)}")
print(f"Unmatched: {len(unmatched_names)}")
if unmatched_names:
    print(f"First 20 unmatched: {unmatched_names[:20]}")

# For names the KB could not split, assume the last N hyphen-separated words
# form the disease label and tally every such trailing run.
print("\n=== Analyzing unmatched dir names ===")
suffix_counts = Counter()
for d in unmatched_names:
    parts = d.split('-')
    # Try the last 1..6 words as the disease, always leaving at least one
    # leading word for the plant.
    for n in range(1, min(7, len(parts))):
        suffix = '-'.join(parts[-n:])
        suffix_counts[suffix] += 1

# Rank once, then filter by word count BEFORE truncating. Truncating first
# would let suffixes of other lengths crowd entries out of each list.
ranked_suffixes = suffix_counts.most_common()

print("Most common last-1-word suffixes (potential single-word diseases):")
one_word = [(suf, cnt) for suf, cnt in ranked_suffixes if suf.count('-') == 0]
for suf, cnt in one_word[:30]:
    print(f"  {suf}: {cnt}")

print("\nMost common last-2-word suffixes:")
two_word = [(suf, cnt) for suf, cnt in ranked_suffixes if suf.count('-') == 1]
for suf, cnt in two_word[:20]:
    print(f"  {suf}: {cnt}")

# Tally how often each disease label and each plant label occurs among the
# (plant, disease) pairs produced by the first parsing pass.
dataset_diseases = Counter(disease for _, disease in parsed)
dataset_plants = Counter(plant for plant, _ in parsed)

print(f"\n=== Dataset disease labels (from plant-prefix matches) ===")
# most_common() lists labels by descending count; equal counts keep the
# order in which the labels were first seen.
for disease, cnt in dataset_diseases.most_common():
    # Tick the labels that are also disease IDs in the KB.
    if disease in disease_ids:
        mark = "✓"
    else:
        mark = ""
    print(f"  {disease}: {cnt}x {mark}")

# Second pass: re-split EVERY directory name by disease suffix, this time
# using the KB disease IDs, every disease label seen in the first pass, and
# the literal label 'healthy'.
known_diseases = set(disease_ids) | set(dataset_diseases.keys())
known_diseases.add('healthy')
# Longest labels first so the most specific suffix is matched.
sorted_known_diseases = sorted(known_diseases, key=len, reverse=True)

print(f"\n=== Comprehensive parsing ===")
reparsed = []
still_unmatched = []
for d in dirs:
    for did in sorted_known_diseases:
        if d.endswith('-' + did):
            # Everything before "-<disease>" is taken as the plant label;
            # it is not validated against the KB plant IDs.
            reparsed.append((d[:-(len(did) + 1)], did))
            break
    else:
        still_unmatched.append(d)

print(f"Re-parsed: {len(reparsed)}")
print(f"Still unmatched: {len(still_unmatched)}")
# Show at most the first 20 leftovers (prints nothing when there are none).
for u in still_unmatched[:20]:
    print(f"  {u}")

# Summarise the second pass: distinct plant and disease labels.
unique_plants = sorted({plant for plant, _ in reparsed})
unique_diseases = sorted({disease for _, disease in reparsed})
print(f"\nUnique plants: {len(unique_plants)}")
print(f"Unique diseases: {len(unique_diseases)}")

# Number of classes (directories) attributed to each plant label.
plant_class_counts = Counter(plant for plant, _ in reparsed)
print(f"\nTop 25 plants by class count:")
for plant, cnt in plant_class_counts.most_common(25):
    print(f"  {plant}: {cnt}")

# Count the files in every class directory.
# NOTE: every regular file directly inside a class directory is counted,
# whatever its name or extension; subdirectories are not descended into.
print("\n=== Image counts (full scan) ===")
total_images = 0
class_sizes = Counter()
for d in dirs:
    full_path = os.path.join(dataset_dir, d)
    file_count = sum(
        1 for f in os.listdir(full_path)
        if os.path.isfile(os.path.join(full_path, f))
    )
    class_sizes[d] = file_count
    total_images += file_count

size_vals = list(class_sizes.values())
print(f"Total images: {total_images:,}")
print(f"Classes: {len(class_sizes)}")
if size_vals:
    ordered_sizes = sorted(size_vals)
    n_classes = len(ordered_sizes)
    # True median: the middle value for an odd count, the mean of the two
    # middle values for an even count (kept as an int when it is whole).
    middle_sum = ordered_sizes[(n_classes - 1) // 2] + ordered_sizes[n_classes // 2]
    median = middle_sum // 2 if middle_sum % 2 == 0 else middle_sum / 2
    print(f"Min/class: {ordered_sizes[0]}, Max/class: {ordered_sizes[-1]}")
    print(f"Mean/class: {sum(size_vals)/n_classes:.0f}")
    print(f"Median/class: {median}")
else:
    # min()/max() and the mean are undefined with no classes; say so
    # instead of crashing.
    print("No class directories found; skipping per-class statistics.")

# Images per plant (using the second-pass parse).
# Each (plant, disease) pair in `reparsed` was produced by stripping
# "-<disease>" from the end of a directory name, so joining the two parts
# with '-' reproduces that directory name exactly and serves as the key.
dir_to_class = {
    f"{plant}-{disease}": (plant, disease) for plant, disease in reparsed
}

plant_image_totals = Counter()
for d, size in class_sizes.items():
    # Directories the second pass could not split have no entry; skip them.
    entry = dir_to_class.get(d)
    if entry is not None:
        plant_image_totals[entry[0]] += size

print("\nTop 15 plants by total images:")
for plant, cnt in plant_image_totals.most_common(15):
    print(f"  {plant}: {cnt:,}")