#!/usr/bin/env python3 """Audit the dataset: parse dir names, cross-ref KB, gather stats.""" import json, os, sys from collections import Counter BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Load KB with open(os.path.join(BASE, 'src', 'data', 'plants.json')) as f: plants = json.load(f) with open(os.path.join(BASE, 'src', 'data', 'diseases.json')) as f: diseases = json.load(f) plant_ids = {p['id'] for p in plants} disease_ids = {d['id'] for d in diseases} sorted_disease_ids = sorted(disease_ids, key=len, reverse=True) # Scan dataset dirs (skip hidden files) dataset_dir = os.path.join(BASE, 'data', 'dataset') dirs = sorted([d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d)) and not d.startswith('.')]) print(f"Total dataset directories: {len(dirs)}") # Strategy 1: Match against known plant IDs (longest first) to split plant/disease sorted_plant_ids = sorted(plant_ids, key=len, reverse=True) parsed = [] unmatched_names = [] for d in dirs: # Try matching plant prefix (longest plant IDs first) found = False for pid in sorted_plant_ids: prefix = pid + '-' if d.startswith(prefix): disease_part = d[len(prefix):] parsed.append((pid, disease_part)) found = True break if not found: # Try matching disease suffix from KB for did in sorted_disease_ids: suffix = '-' + did if d.endswith(suffix): plant_part = d[:-len(suffix)] parsed.append((plant_part, did)) found = True break if not found: unmatched_names.append(d) print(f"Parsed: {len(parsed)}") print(f"Unmatched: {len(unmatched_names)}") if unmatched_names: print(f"First 20 unmatched: {unmatched_names[:20]}") # For unmatched, try heuristic: last N words (separated by -) are the disease # We need to figure out how many words constitute the disease # Let's analyze the unmatched ones print(f"\n=== Analyzing unmatched dir names ===") # Get all unique "last word" suffixes from unmatched suffix_counts = Counter() for d in unmatched_names: parts = d.split('-') # Try last 1, 2, 3, 4 words as disease for n in range(1, min(7, len(parts))): suffix = '-'.join(parts[-n:]) suffix_counts[suffix] += 1 print(f"Most common last-1-word suffixes (potential single-word diseases):") for suf, cnt in suffix_counts.most_common(30): if len(suf.split('-')) == 1: print(f" {suf}: {cnt}") print(f"\nMost common last-2-word suffixes:") for suf, cnt in suffix_counts.most_common(20): if len(suf.split('-')) == 2: print(f" {suf}: {cnt}") # Build a heuristic disease dictionary from the dataset itself # For dirs that matched via plant prefix, extract all unique disease parts dataset_diseases = Counter() dataset_plants = Counter() for plant, disease in parsed: dataset_diseases[disease] += 1 dataset_plants[plant] += 1 print(f"\n=== Dataset disease labels (from plant-prefix matches) ===") for disease, cnt in sorted(dataset_diseases.items(), key=lambda x: -x[1]): in_kb = "✓" if disease in disease_ids else "" print(f" {disease}: {cnt}x {in_kb}") # Now re-parse ALL dirs using our accumulated knowledge # Build a comprehensive disease ID list from both KB and dataset known_diseases = set(disease_ids) | set(dataset_diseases.keys()) known_diseases.add('healthy') sorted_known_diseases = sorted(known_diseases, key=len, reverse=True) print(f"\n=== Comprehensive parsing ===") reparsed = [] still_unmatched = [] for d in dirs: found = False # Match disease suffix first for did in sorted_known_diseases: suffix = '-' + did if d.endswith(suffix): plant_part = d[:-len(suffix)] # Plant part should end with a known plant or be a reasonable plant name reparsed.append((plant_part, did)) found = True break if not found: still_unmatched.append(d) print(f"Re-parsed: {len(reparsed)}") print(f"Still unmatched: {len(still_unmatched)}") if still_unmatched: for u in still_unmatched[:20]: print(f" {u}") # Final analysis: unique plants and diseases unique_plants = sorted(set(p[0] for p in reparsed)) unique_diseases = sorted(set(p[1] for p in reparsed)) print(f"\nUnique plants: {len(unique_plants)}") print(f"Unique diseases: {len(unique_diseases)}") # Plant class counts plant_class_counts = Counter(p[0] for p in reparsed) print(f"\nTop 25 plants by class count:") for plant, cnt in plant_class_counts.most_common(25): print(f" {plant}: {cnt}") # Now count actual image files print(f"\n=== Image counts (full scan) ===") total_images = 0 class_sizes = Counter() for d in dirs: full_path = os.path.join(dataset_dir, d) file_count = len([f for f in os.listdir(full_path) if os.path.isfile(os.path.join(full_path, f))]) class_sizes[d] = file_count total_images += file_count size_vals = list(class_sizes.values()) print(f"Total images: {total_images:,}") print(f"Classes: {len(class_sizes)}") print(f"Min/class: {min(size_vals)}, Max/class: {max(size_vals)}") print(f"Mean/class: {sum(size_vals)/len(size_vals):.0f}") print(f"Median/class: {sorted(size_vals)[len(size_vals)//2]}") # Images per plant (using the parsed data) plant_image_counts = Counter() for d, size in class_sizes.items(): # Find the plant for this dir for plant, disease in reparsed: # Simple matching: does the dir start with plant? pass # Let's do it properly below # Better: build a dir_name -> (plant, disease) lookup dir_to_class = {} for plant, disease in reparsed: # Approximate the original dir name key = f"{plant}-{disease}" dir_to_class[key] = (plant, disease) plant_image_totals = Counter() for d, size in class_sizes.items(): # Find matching entry # The dir name might not exactly match the key if d in dir_to_class: plant, disease = dir_to_class[d] plant_image_totals[plant] += size print(f"\nTop 15 plants by total images:") for plant, cnt in plant_image_totals.most_common(15): print(f" {plant}: {cnt:,}")