#!/usr/bin/env python3
"""Audit the dataset: parse dir names, cross-ref KB, gather stats."""

import json
import os
import statistics
import sys
from collections import Counter

# BASE is the parent of the directory that contains this script.
BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Load the knowledge base (KB).
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding (which is not UTF-8 everywhere).
with open(os.path.join(BASE, 'src', 'data', 'plants.json'), encoding='utf-8') as f:
    plants = json.load(f)
with open(os.path.join(BASE, 'src', 'data', 'diseases.json'), encoding='utf-8') as f:
    diseases = json.load(f)

# Index both KB tables by the 'id' field of each record.
plant_ids = {p['id'] for p in plants}
disease_ids = {d['id'] for d in diseases}
# Longest IDs first, so suffix matching prefers the most specific disease ID.
sorted_disease_ids = sorted(disease_ids, key=len, reverse=True)

# Collect the class directories under data/dataset, ignoring dot-prefixed
# (hidden) entries and anything that is not a directory.
dataset_dir = os.path.join(BASE, 'data', 'dataset')
visible_names = (name for name in os.listdir(dataset_dir) if not name.startswith('.'))
dirs = sorted(
    name for name in visible_names
    if os.path.isdir(os.path.join(dataset_dir, name))
)

print(f"Total dataset directories: {len(dirs)}")

# Strategy 1: split each directory name into (plant, disease).
# Plant IDs are tried longest-first so the most specific prefix wins.
sorted_plant_ids = sorted(plant_ids, key=len, reverse=True)

parsed = []           # (plant, disease) pairs, in directory order
unmatched_names = []  # names that neither rule below could split
for d in dirs:
    # Rule 1: the name starts with "<known plant id>-"; the rest is the disease.
    pid = next((p for p in sorted_plant_ids if d.startswith(p + '-')), None)
    if pid is not None:
        parsed.append((pid, d[len(pid) + 1:]))
        continue

    # Rule 2: the name ends with "-<KB disease id>"; the rest is the plant.
    did = next((k for k in sorted_disease_ids if d.endswith('-' + k)), None)
    if did is not None:
        parsed.append((d[:-(len(did) + 1)], did))
        continue

    unmatched_names.append(d)

print(f"Parsed: {len(parsed)}")
print(f"Unmatched: {len(unmatched_names)}")
if unmatched_names:
    print(f"First 20 unmatched: {unmatched_names[:20]}")

|
# For unmatched, try heuristic: last N words (separated by -) are the disease
|
|
# We need to figure out how many words constitute the disease
|
|
# Let's analyze the unmatched ones
|
|
print(f"\n=== Analyzing unmatched dir names ===")
|
|
# Get all unique "last word" suffixes from unmatched
|
|
suffix_counts = Counter()
|
|
for d in unmatched_names:
|
|
parts = d.split('-')
|
|
# Try last 1, 2, 3, 4 words as disease
|
|
for n in range(1, min(7, len(parts))):
|
|
suffix = '-'.join(parts[-n:])
|
|
suffix_counts[suffix] += 1
|
|
|
|
print(f"Most common last-1-word suffixes (potential single-word diseases):")
|
|
for suf, cnt in suffix_counts.most_common(30):
|
|
if len(suf.split('-')) == 1:
|
|
print(f" {suf}: {cnt}")
|
|
|
|
print(f"\nMost common last-2-word suffixes:")
|
|
for suf, cnt in suffix_counts.most_common(20):
|
|
if len(suf.split('-')) == 2:
|
|
print(f" {suf}: {cnt}")
|
|
|
|
# Derive label frequencies from the dataset itself. `parsed` holds every
# (plant, disease) pair produced by the first parsing pass.
dataset_diseases = Counter(disease for _, disease in parsed)
dataset_plants = Counter(plant for plant, _ in parsed)

print(f"\n=== Dataset disease labels (from plant-prefix matches) ===")
# most_common() without an argument yields every label, highest count first;
# labels that also exist in the KB get a check mark.
for disease, cnt in dataset_diseases.most_common():
    in_kb = "✓" if disease in disease_ids else ""
    print(f" {disease}: {cnt}x {in_kb}")

# Second pass over ALL directories, this time splitting on the disease suffix.
# The candidate diseases are the KB IDs, every disease label seen in the first
# pass, and the literal 'healthy'; longest candidates are tried first.
known_diseases = set(disease_ids) | set(dataset_diseases)
known_diseases.add('healthy')
sorted_known_diseases = sorted(known_diseases, key=len, reverse=True)

print(f"\n=== Comprehensive parsing ===")
reparsed = []         # (plant, disease) pairs
still_unmatched = []  # names that end with no known "-<disease>"
for d in dirs:
    did = next((k for k in sorted_known_diseases if d.endswith('-' + k)), None)
    if did is None:
        still_unmatched.append(d)
    else:
        # Whatever precedes "-<disease>" is taken as the plant name; it is
        # not validated against the KB plant IDs.
        reparsed.append((d[:-(len(did) + 1)], did))

print(f"Re-parsed: {len(reparsed)}")
print(f"Still unmatched: {len(still_unmatched)}")
for u in still_unmatched[:20]:
    print(f" {u}")

|
# Final analysis: unique plants and diseases
|
|
unique_plants = sorted(set(p[0] for p in reparsed))
|
|
unique_diseases = sorted(set(p[1] for p in reparsed))
|
|
print(f"\nUnique plants: {len(unique_plants)}")
|
|
print(f"Unique diseases: {len(unique_diseases)}")
|
|
|
|
# Plant class counts
|
|
plant_class_counts = Counter(p[0] for p in reparsed)
|
|
print(f"\nTop 25 plants by class count:")
|
|
for plant, cnt in plant_class_counts.most_common(25):
|
|
print(f" {plant}: {cnt}")
|
|
|
|
# Count the files in every class directory.
print("\n=== Image counts (full scan) ===")
total_images = 0
class_sizes = Counter()  # directory name -> number of files directly inside it
for d in dirs:
    full_path = os.path.join(dataset_dir, d)
    with os.scandir(full_path) as entries:
        # Regular files only; dot-files (e.g. .DS_Store) are skipped, matching
        # the directory scan, which also ignores dot-prefixed names.
        file_count = sum(
            1 for entry in entries
            if entry.is_file() and not entry.name.startswith('.')
        )
    class_sizes[d] = file_count
    total_images += file_count

size_vals = list(class_sizes.values())
print(f"Total images: {total_images:,}")
print(f"Classes: {len(class_sizes)}")
if size_vals:
    print(f"Min/class: {min(size_vals)}, Max/class: {max(size_vals)}")
    print(f"Mean/class: {sum(size_vals)/len(size_vals):.0f}")
    # statistics.median averages the two middle values for an even number of
    # classes instead of arbitrarily picking the upper one.
    print(f"Median/class: {statistics.median(size_vals)}")
else:
    # min()/max() and the mean are undefined for an empty dataset.
    print("No class directories found; skipping per-class statistics.")

# Images per plant.
# Build a dir_name -> (plant, disease) lookup. Each reparsed pair was obtained
# by cutting "-<disease>" off the end of a directory name, so
# "<plant>-<disease>" reproduces that directory name exactly.
dir_to_class = {
    f"{plant}-{disease}": (plant, disease)
    for plant, disease in reparsed
}

plant_image_totals = Counter()
unattributed_images = 0  # files in directories the second pass could not split
for d, size in class_sizes.items():
    match = dir_to_class.get(d)
    if match is None:
        unattributed_images += size
        continue
    plant_image_totals[match[0]] += size

print("\nTop 15 plants by total images:")
for plant, cnt in plant_image_totals.most_common(15):
    print(f" {plant}: {cnt:,}")
# Make it visible when some images are missing from the per-plant totals.
if unattributed_images:
    print(f"Images in directories with no parsed plant: {unattributed_images:,}")