pre torch.compile -chkpoint made

2026-06-16 10:40:38 -05:00
parent 34855eff55
commit 6650d3c5ea
19 changed files with 2519 additions and 0 deletions
--- a/scripts/organize-dataset.py
+++ b/scripts/organize-dataset.py
@@ -0,0 +1,471 @@
+#!/usr/bin/env python3
+"""
+Phase 1 — Dataset Reorganization for Hierarchical Model Training.
+
+Reorganizes flat data/dataset/plant-disease-name/ directories into:
+  data/organized/
+    train/{species}/{disease}/
+    val/{species}/{disease}/
+    species_index.json
+    class_hierarchy.json
+    dataset_stats.json
+
+Usage: python3 scripts/organize-dataset.py
+"""
+
+import json
+import os
+import random
+from collections import Counter, defaultdict
+from pathlib import Path
+
+from PIL import Image
+from joblib import Parallel, delayed
+from tqdm import tqdm
+
+# ─── Config ───────────────────────────────────────────────────────────────────
+
+# Repository root: the parent of the directory that contains this script.
+BASE_DIR = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Input: one flat sub-directory per class (plus the special "healthy" dir).
+DATASET_DIR = BASE_DIR / "data" / "dataset"
+# Output root; train/ and val/ each hold {species}/{disease}/ sub-directories.
+ORGANIZED_DIR = BASE_DIR / "data" / "organized"
+TRAIN_DIR = ORGANIZED_DIR / "train"
+VAL_DIR = ORGANIZED_DIR / "val"
+
+RANDOM_SEED = 42
+# Fraction of each class's images that goes to train/; the rest goes to val/.
+TRAIN_RATIO = 0.85
+# Complement of TRAIN_RATIO (subject to float rounding); used for display only.
+VAL_RATIO = 1.0 - TRAIN_RATIO
+
+# Images whose longest side exceeds MAX_DIM pixels are downscaled to fit.
+MAX_DIM = 512
+# Quality setting passed to the JPEG encoder for every written image.
+JPEG_QUALITY = 90
+# Number of joblib workers used per class when converting images.
+N_JOBS = 16
+
+# Seed the module-global `random` generator at import time so that any use of
+# `random` in this script is repeatable from run to run.
+random.seed(RANDOM_SEED)
+
+# Known disease-prefix words — words that start disease names but should NOT
+# be part of a plant name. If a plant part ends with one of these, we know
+# the split point is wrong, and the word is moved from the species onto the
+# disease (see the post-processing pass of
+# build_plant_and_disease_dictionaries, which compares the LAST
+# hyphen-separated token of the species against this set).
+# NOTE(review): entries that themselves contain a hyphen ("bacterial-blight",
+# "cercospora-leaf", "septoria-leaf", "alternaria-leaf") can never equal a
+# single token produced by split("-"), so they look like dead entries.
+DISEASE_PREFIX_WORDS = {
+    "bacterial", "fungal", "viral", "downy", "powdery",
+    "alternaria", "phytophthora", "phoma", "phymatotrichum",
+    "pythium", "rhizoctonia", "sclerotinia", "fusarium",
+    "verticillium", "cercospora", "septoria", "anthracnose",
+    "black", "white", "gray", "brown", "green", "pink", "blue",
+    "soft", "hard", "sour", "bitter",
+    "southern", "northern", "common", "false", "true",
+    "european", "american", "aspen", "bacterial-blight",
+    "cercospora-leaf", "septoria-leaf", "alternaria-leaf",
+}
+
+# Valid multi-word plant suffixes (these CAN follow a hyphen in plant names)
+# NOTE(review): this set is not referenced anywhere in the visible part of
+# this file — confirm whether it is still needed. "fern" is listed twice,
+# which is harmless in a set literal.
+VALID_MULTI_WORD_PLANTS = {
+    "squash", "bean", "berry", "apple", "fern", "tree", "vine",
+    "cactus", "grass", "weed", "mint", "root", "seed", "leaf",
+    "flower", "fruit", "bark", "wood", "nut", "pea", "lily",
+    "rose", "moss", "palm", "fern", "orchid", "fig", "cress",
+    "plant", "sage", "thyme", "leaf-fig", "nest-fern", "tongue",
+    "tail", "ear", "eye", "nut-tree", "bean-tree",
+}
+
+# File extensions treated as images. All lowercase: callers lowercase the
+# file's extension before testing membership.
+IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".tif"}
+
+# ─── Load KB Data ─────────────────────────────────────────────────────────────
+
+def load_kb():
+    """Load the knowledge-base plant and disease records.
+
+    Reads ``plants.json`` and ``diseases.json`` from ``BASE_DIR/src/data``.
+    The files are opened explicitly as UTF-8 so that parsing does not depend
+    on the platform's locale default encoding.
+
+    Returns:
+        tuple: ``(plants, diseases)`` exactly as decoded by ``json.load``.
+
+    Raises:
+        OSError: if either file cannot be opened.
+        json.JSONDecodeError: if either file is not valid JSON.
+    """
+    kb_dir = BASE_DIR / "src" / "data"
+    with open(kb_dir / "plants.json", encoding="utf-8") as f:
+        plants = json.load(f)
+    with open(kb_dir / "diseases.json", encoding="utf-8") as f:
+        diseases = json.load(f)
+    return plants, diseases
+
+# Loaded once at import time, so importing this module requires both KB files.
+PLANTS, DISEASES = load_kb()
+# Set of the "id" values of all KB plant records; used to seed the set of
+# verified plant names when parsing directory names.
+KB_PLANT_IDS = {p["id"] for p in PLANTS}
+
+def get_dataset_dirs():
+    """Return the sorted names of all non-hidden subdirectories of DATASET_DIR."""
+    visible = []
+    for entry in os.listdir(DATASET_DIR):
+        # Dot-prefixed entries are treated as hidden and ignored.
+        if entry.startswith("."):
+            continue
+        if os.path.isdir(DATASET_DIR / entry):
+            visible.append(entry)
+    visible.sort()
+    return visible
+
+def count_images(path):
+    """Return how many regular files directly inside `path` have an image extension.
+
+    A missing `path` counts as zero images. Subdirectories are not descended
+    into; the extension test is case-insensitive.
+    """
+    if not path.exists():
+        return 0
+    total = 0
+    for name in os.listdir(path):
+        _, ext = os.path.splitext(name)
+        if ext.lower() in IMAGE_EXTS and os.path.isfile(path / name):
+            total += 1
+    return total
+
+# ─── Phase 1: Parse directory names ────────────────────────────────────────────
+
+def build_plant_and_disease_dictionaries(dirs):
+    """
+    Split dataset directory names into (species, disease) pairs.
+
+    Directory names look like ``<plant>-<disease>`` where both halves may
+    contain hyphens themselves, so the split point is inferred in passes:
+
+      1. Verified plants = KB plant ids plus every leading prefix (up to
+         5 tokens) that occurs with at least 3 distinct remainders.
+      2. Match each dir against the longest verified plant prefix; dirs
+         ending in ``-healthy`` fall back to (prefix, "healthy").
+      3. Match leftovers against the longest disease suffix seen in pass 2.
+      4. Trailing-hyphen names become (plant, "unlabeled"); the special
+         ``healthy`` dir contributes one "healthy" class per subdirectory.
+      5. A trailing disease-prefix word (DISEASE_PREFIX_WORDS) is moved
+         from the species back onto the disease.
+
+    Args:
+        dirs: list of directory names found directly under DATASET_DIR.
+
+    Returns:
+        (parsed_dict, unmatched_list). ``parsed_dict`` maps a source key to
+        ``(species, disease)``; the key is the directory name, or
+        ``"healthy/<subdir>"`` for per-plant healthy classes.
+        ``unmatched_list`` holds every directory name that no pass could
+        split, including names whose plant part would be empty.
+    """
+    # Phase 1: Verify plant names from prefixes that appear with >=3 diseases
+    plant_candidates = defaultdict(set)
+    for d in dirs:
+        parts = d.split("-")
+        if len(parts) < 2:
+            continue
+        for split in range(1, min(len(parts), 6)):
+            plant = "-".join(parts[:split])
+            disease = "-".join(parts[split:])
+            if plant and disease and len(disease) > 2:
+                plant_candidates[plant].add(disease)
+
+    verified_plants = set(KB_PLANT_IDS)
+    verified_plants.update(
+        plant for plant, diseases in plant_candidates.items() if len(diseases) >= 3
+    )
+
+    print(f"  Verified plants: {len(verified_plants)} ({len(verified_plants & KB_PLANT_IDS)} from KB)")
+
+    # Phase 2: Match dirs by plant prefix (longest plant first)
+    sorted_plants = sorted(verified_plants, key=len, reverse=True)
+    plant_matched = {}
+    not_matched = []
+
+    for d in dirs:
+        matched = False
+        for plant in sorted_plants:
+            prefix = plant + "-"
+            if d.startswith(prefix):
+                disease = d[len(prefix):]
+                if disease:
+                    plant_matched[d] = (plant, disease)
+                    matched = True
+                    break
+        if matched:
+            continue
+        # Fallback for "<plant>-healthy". An empty plant (a dir literally
+        # named "-healthy") is not a usable species, so it is passed on to
+        # the later phases instead of being accepted here.
+        plant = d[:-len("-healthy")] if d.endswith("-healthy") else ""
+        if plant:
+            plant_matched[d] = (plant, "healthy")
+        else:
+            not_matched.append(d)
+
+    # Collect disease suffixes from Phase 2 matches
+    disease_suffixes = set(p[1] for p in plant_matched.values())
+    print(f"  Plant-matched dirs: {len(plant_matched)}, disease suffixes: {len(disease_suffixes)}")
+
+    # Phase 3: Match remaining dirs by disease suffix (longest suffix first)
+    sorted_disease_suffixes = sorted(disease_suffixes, key=len, reverse=True)
+    still_not_matched = []
+
+    for d in not_matched:
+        matched = False
+        for suffix in sorted_disease_suffixes:
+            if d.endswith("-" + suffix):
+                plant_part = d[:-len("-" + suffix)]
+                if plant_part and not plant_part.endswith("-"):
+                    plant_matched[d] = (plant_part, suffix)
+                    matched = True
+                    break
+        if not matched:
+            still_not_matched.append(d)
+
+    print(f"  Phase 3 matched: {len(not_matched) - len(still_not_matched)}")
+    print(f"  Phase 3 remaining: {len(still_not_matched)}")
+
+    # Phase 4: Handle trailing-hyphen dirs and healthy parent dir
+    final_unmatched = []
+    for d in still_not_matched:
+        if d.endswith("-"):
+            plant = d[:-1]
+            if plant:
+                plant_matched[d] = (plant, "unlabeled")
+            else:
+                # Nothing left once the hyphen is stripped: report the dir
+                # for manual review rather than dropping it silently.
+                final_unmatched.append(d)
+        elif d == "healthy":
+            healthy_dir = DATASET_DIR / "healthy"
+            if healthy_dir.exists():
+                plant_subdirs = [
+                    s for s in os.listdir(healthy_dir)
+                    if os.path.isdir(healthy_dir / s) and not s.startswith(".")
+                ]
+                for sub_plant in plant_subdirs:
+                    # Use healthy/{sub_plant} as key so we know where to find the images
+                    plant_matched[f"healthy/{sub_plant}"] = (sub_plant, "healthy")
+                print(f"  Healthy dir: {len(plant_subdirs)} per-plant healthy classes")
+        else:
+            final_unmatched.append(d)
+
+    print(f"  Phase 4 handled {len(still_not_matched) - len(final_unmatched)} edge cases")
+    print(f"  Final unmatched: {len(final_unmatched)}")
+    if final_unmatched:
+        print(f"    E.g.: {final_unmatched[:10]}")
+
+    # Phase 5: Post-processing — fix species names that ate disease-prefix words
+    fix_count = 0
+    for d in list(plant_matched.keys()):
+        if d.startswith("healthy/"):
+            continue  # Skip healthy subdirs — these are correct
+        species, disease = plant_matched[d]
+        parts = species.split("-")
+        if len(parts) >= 2 and parts[-1] in DISEASE_PREFIX_WORDS:
+            # Move the last word from species to disease
+            new_species = "-".join(parts[:-1])
+            new_disease = parts[-1] + "-" + disease
+            plant_matched[d] = (new_species, new_disease)
+            fix_count += 1
+
+    print(f"  Post-process fixes (species ending with disease-prefix): {fix_count}")
+
+    return plant_matched, final_unmatched
+
+# ─── Image Processing ────────────────────────────────────────────────────────
+
+def process_image(args):
+    """Convert one image to RGB JPEG, downscaled so its longest side <= MAX_DIM.
+
+    Args:
+        args: ``(src_path, dst_path)`` pair of string paths. Passed as a
+            single tuple so the function can be mapped over a list of pairs.
+
+    Returns:
+        ``(src_path, ok, error)`` where ``ok`` is True on success and
+        ``error`` is None, or ``ok`` is False and ``error`` is the exception
+        text. The function never raises: a bad image must not abort the
+        whole batch, so every failure is reported through the return value.
+    """
+    src_path, dst_path = args
+    try:
+        # The context manager closes the source file as soon as we are done;
+        # without it the handle stays open until garbage collection, which
+        # can exhaust file descriptors when many workers run at once.
+        with Image.open(src_path) as src_img:
+            img = src_img if src_img.mode == "RGB" else src_img.convert("RGB")
+            w, h = img.size
+            longest = max(w, h)
+            if longest > MAX_DIM:
+                ratio = MAX_DIM / longest
+                # Clamp to 1px so extreme aspect ratios cannot round a side
+                # down to 0, which resize() rejects.
+                new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
+                img = img.resize(new_size, Image.LANCZOS)
+            dst_parent = os.path.dirname(dst_path)
+            if dst_parent:  # a bare filename has no directory to create
+                os.makedirs(dst_parent, exist_ok=True)
+            img.save(dst_path, "JPEG", quality=JPEG_QUALITY, optimize=True)
+        return (src_path, True, None)
+    except Exception as e:
+        return (src_path, False, str(e))
+
+def copy_and_split_class(src_dir, dst_train_dir, dst_val_dir, train_ratio=TRAIN_RATIO):
+    """
+    Convert the images of one class into train/val dirs, splitting at the IMAGE level.
+
+    Files are written as ``img_NNNN.jpg``, numbered independently per split.
+    At least one image always goes to train, so a one-image class has an
+    empty val split.
+
+    The shuffle uses a private generator seeded from RANDOM_SEED and the
+    source directory's name, so a class gets the same split no matter which
+    other classes were processed (or skipped) before it. Drawing from the
+    shared module-level generator would make the split depend on processing
+    order and change on resumed runs.
+
+    Args:
+        src_dir: Path of the source class directory.
+        dst_train_dir: Path that receives the train images.
+        dst_val_dir: Path that receives the val images.
+        train_ratio: fraction of images assigned to train.
+
+    Returns:
+        (train_processed, train_failed, val_processed, val_failed).
+        All zeros when src_dir is missing or contains no images.
+    """
+    if not src_dir.exists():
+        return (0, 0, 0, 0)
+
+    # Sorted first so the shuffle starts from a stable order regardless of
+    # the order in which the filesystem lists entries.
+    src_files = sorted([
+        f for f in os.listdir(src_dir)
+        if os.path.isfile(src_dir / f) and os.path.splitext(f)[1].lower() in IMAGE_EXTS
+    ])
+    if not src_files:
+        return (0, 0, 0, 0)
+
+    # Split files at IMAGE level. A str seed is hashed deterministically by
+    # `random`, unlike the builtin hash() which is salted per process.
+    rng = random.Random(f"{RANDOM_SEED}:{src_dir.parent.name}/{src_dir.name}")
+    rng.shuffle(src_files)
+    split_idx = max(1, int(len(src_files) * train_ratio))
+    train_files = src_files[:split_idx]
+    val_files = src_files[split_idx:]
+
+    train_pairs = [
+        (str(src_dir / f), str(dst_train_dir / f"img_{i:04d}.jpg"))
+        for i, f in enumerate(train_files)
+    ]
+    val_pairs = [
+        (str(src_dir / f), str(dst_val_dir / f"img_{i:04d}.jpg"))
+        for i, f in enumerate(val_files)
+    ]
+
+    # Results come back in submission order, so the first len(train_pairs)
+    # entries belong to train and the remainder to val.
+    results = Parallel(n_jobs=N_JOBS, prefer="threads")(
+        delayed(process_image)(pair) for pair in train_pairs + val_pairs
+    )
+    train_results = results[:len(train_pairs)]
+    val_results = results[len(train_pairs):]
+
+    train_ok = sum(1 for _, ok, _ in train_results if ok)
+    val_ok = sum(1 for _, ok, _ in val_results if ok)
+    train_fail = len(train_results) - train_ok
+    val_fail = len(val_results) - val_ok
+
+    return (train_ok, train_fail, val_ok, val_fail)
+
+# ─── Build Metadata ──────────────────────────────────────────────────────────
+
+def build_metadata(parsed, train_counts, val_counts, unmatched):
+    """Build species_index.json, class_hierarchy.json, dataset_stats.json.
+
+    Args:
+        parsed: mapping of source key -> (species, disease).
+        train_counts: iterable of (species, disease, image_count) for train.
+        val_counts: iterable of (species, disease, image_count) for val.
+        unmatched: list of directory names that could not be parsed.
+
+    Returns:
+        (species_index, class_hierarchy, stats), all JSON-serialisable:
+          * species_index: species -> sorted list of its disease labels.
+          * class_hierarchy: the same mapping plus species/class totals.
+            ``num_classes`` counts unique (species, disease) pairs, the same
+            definition as ``stats["total_classes"]``.
+          * stats: image totals, train/val percentages, per-class summary
+            and per-species counts. ``images_per_class`` summarises the
+            train+val total of each class; "median" is the upper median for
+            an even number of classes.
+    """
+    species_to_diseases = defaultdict(set)
+    for species, disease in parsed.values():
+        species_to_diseases[species].add(disease)
+    species_index = {sp: sorted(ds) for sp, ds in sorted(species_to_diseases.items())}
+
+    # Several source dirs can map to the same (species, disease) pair, so
+    # count unique pairs rather than entries of `parsed`.
+    num_classes = sum(len(ds) for ds in species_index.values())
+
+    class_hierarchy = {
+        "version": "1.0",
+        "description": "Hierarchical plant disease classification dataset",
+        "num_species": len(species_index),
+        "num_classes": num_classes,
+        "species": {species: list(diseases) for species, diseases in species_index.items()}
+    }
+
+    # Aggregate counts
+    total_train = sum(cnt for _, _, cnt in train_counts)
+    total_val = sum(cnt for _, _, cnt in val_counts)
+    total_all = total_train + total_val
+
+    species_disease_counts = defaultdict(lambda: defaultdict(int))
+    for counts in (train_counts, val_counts):
+        for sp, di, cnt in counts:
+            species_disease_counts[sp][di] += cnt
+
+    # One number per class (train + val combined). Summarising the raw
+    # train and val entries separately would report split sizes instead,
+    # e.g. "min" would be the smallest val split rather than the smallest
+    # class.
+    per_class = sorted(
+        cnt
+        for diseases in species_disease_counts.values()
+        for cnt in diseases.values()
+    )
+
+    stats = {
+        "total_images": total_all,
+        "total_species": len(species_index),
+        "total_classes": num_classes,
+        "train_images": total_train,
+        "val_images": total_val,
+        "images_per_class": {
+            "min": per_class[0] if per_class else 0,
+            "max": per_class[-1] if per_class else 0,
+            "mean": round(sum(per_class) / len(per_class)) if per_class else 0,
+            "median": per_class[len(per_class) // 2] if per_class else 0,
+        },
+        "train_pct": round(total_train / total_all * 100, 1) if total_all else 0,
+        "val_pct": round(total_val / total_all * 100, 1) if total_all else 0,
+        "unmatched_dirs": len(unmatched),
+        "unmatched_dir_names": unmatched[:100] if unmatched else [],
+        "species_disease_counts": {
+            species: dict(diseases) for species, diseases in species_disease_counts.items()
+        }
+    }
+
+    return species_index, class_hierarchy, stats
+
+# ─── Main Pipeline ───────────────────────────────────────────────────────────
+
+def main():
+    """Run the full reorganization pipeline and return the stats dict.
+
+    Steps: scan DATASET_DIR, parse directory names into (species, disease)
+    pairs, convert and split every class into TRAIN_DIR / VAL_DIR, then
+    write species_index.json, class_hierarchy.json and dataset_stats.json
+    into ORGANIZED_DIR.
+
+    A class whose output dirs already hold at least as many files as the
+    source has images is not re-processed; its existing output files are
+    still counted, so the statistics describe the whole organized dataset
+    even on a resumed run.
+
+    Returns:
+        dict: the contents written to dataset_stats.json.
+    """
+    print("=" * 60)
+    print("Phase 1 — Dataset Reorganization")
+    print("=" * 60)
+    print(f"Dataset: {DATASET_DIR}")
+    print(f"Output: {ORGANIZED_DIR}")
+    print()
+
+    # Step 1: Scan
+    print("─" * 40)
+    print("Step 1: Scanning dataset directories...")
+    print("─" * 40)
+    dirs = get_dataset_dirs()
+    print(f"  Found {len(dirs)} class directories")
+
+    # Step 2: Parse directory names into (species, disease) pairs
+    print()
+    print("─" * 40)
+    print("Step 2: Parsing directory names...")
+    print("─" * 40)
+    parsed, unmatched = build_plant_and_disease_dictionaries(dirs)
+
+    species_set = set(s for s, _ in parsed.values())
+    disease_set = set(d for _, d in parsed.values())
+    raw_classes = len(parsed)
+    unique_classes = len(set((s, d) for s, d in parsed.values()))
+    print(f"\n  Parsed: {raw_classes} entries")
+    print(f"  Unique species: {len(species_set)}")
+    print(f"  Unique disease labels: {len(disease_set)}")
+    print(f"  Unique (species, disease) pairs: {unique_classes}")
+
+    # Step 3: Process images with image-level train/val split
+    print()
+    print("─" * 40)
+    print("Step 3: Processing images (resize + train/val split)...")
+    print(f"  Max dimension: {MAX_DIM}px, JPEG q{JPEG_QUALITY}")
+    print(f"  Workers: {N_JOBS}")
+    print(f"  Split: {TRAIN_RATIO*100:.0f}/{VAL_RATIO*100:.0f} (image-level)")
+    print("─" * 40)
+
+    train_counts = []  # (species, disease, count)
+    val_counts = []
+    total_skipped = 0
+    total_failed = 0
+
+    # Set membership keeps the filter below linear in the number of dirs.
+    dir_names = set(dirs)
+    regular_items = [(d, sp, di) for d, (sp, di) in parsed.items()
+                     if not d.startswith("healthy/") and d in dir_names]
+    healthy_items = [(d, sp, di) for d, (sp, di) in parsed.items()
+                     if d.startswith("healthy/")]
+
+    # Organize healthy items by plant; d is like "healthy/tomato"
+    healthy_by_plant = {sp: d for d, sp, _ in healthy_items}
+
+    # Output files are named img_NNNN.jpg per target dir, so two source dirs
+    # that map to the same (species, disease) collide. Surface that here
+    # instead of letting it pass unnoticed.
+    target_sources = Counter((sp, di) for _, sp, di in regular_items)
+    target_sources.update((sp, "healthy") for sp in healthy_by_plant)
+    collisions = sorted(t for t, n in target_sources.items() if n > 1)
+    if collisions:
+        print(f"\n  ⚠  {len(collisions)} classes are fed by more than one source dir;")
+        print("     images from later dirs may overwrite earlier ones or be skipped:")
+        for sp, di in collisions[:20]:
+            print(f"     {sp}/{di}")
+
+    def _process_class(src_dir, species, disease):
+        """Convert one class, or count its existing output if already done."""
+        nonlocal total_skipped, total_failed
+        dst_train = TRAIN_DIR / species / disease
+        dst_val = VAL_DIR / species / disease
+        n_src = count_images(src_dir)
+
+        # Skip if already done: output holds at least as many files as the source.
+        already_done = (
+            dst_train.exists() and dst_val.exists()
+            and len(os.listdir(dst_train)) + len(os.listdir(dst_val)) >= n_src
+        )
+        if already_done:
+            total_skipped += n_src
+            # Still record what is on disk so the stats cover skipped classes.
+            train_counts.append((species, disease, count_images(dst_train)))
+            val_counts.append((species, disease, count_images(dst_val)))
+            return
+
+        tr_ok, tr_fail, va_ok, va_fail = copy_and_split_class(src_dir, dst_train, dst_val)
+        train_counts.append((species, disease, tr_ok))
+        val_counts.append((species, disease, va_ok))
+        total_failed += tr_fail + va_fail
+
+    print(f"\n  Processing {len(regular_items)} disease + {len(healthy_items)} healthy classes...")
+
+    for d, species, disease in tqdm(regular_items, desc="  Disease classes"):
+        _process_class(DATASET_DIR / d, species, disease)
+
+    # Process healthy subdirs (src is e.g. data/dataset/healthy/tomato)
+    for sp, hkey in tqdm(healthy_by_plant.items(), desc="  Healthy classes"):
+        _process_class(DATASET_DIR / hkey, sp, "healthy")
+
+    total_train = sum(c for _, _, c in train_counts)
+    total_val = sum(c for _, _, c in val_counts)
+    print(f"\n  Train images: {total_train:,}")
+    print(f"  Val images: {total_val:,}")
+    print(f"  Skipped previously processed: {total_skipped:,}")
+    if total_failed:
+        print(f"  Failed images: {total_failed:,}")
+
+    # Step 4: Build metadata
+    print()
+    print("─" * 40)
+    print("Step 4: Building metadata files...")
+    print("─" * 40)
+    ORGANIZED_DIR.mkdir(parents=True, exist_ok=True)
+
+    species_index, class_hierarchy, stats = build_metadata(
+        parsed, train_counts, val_counts, unmatched
+    )
+
+    with open(ORGANIZED_DIR / "species_index.json", "w", encoding="utf-8") as f:
+        json.dump(species_index, f, indent=2)
+    print(f"  ✓ species_index.json ({len(species_index)} species)")
+
+    with open(ORGANIZED_DIR / "class_hierarchy.json", "w", encoding="utf-8") as f:
+        json.dump(class_hierarchy, f, indent=2)
+    print("  ✓ class_hierarchy.json")
+
+    with open(ORGANIZED_DIR / "dataset_stats.json", "w", encoding="utf-8") as f:
+        json.dump(stats, f, indent=2)
+    print("  ✓ dataset_stats.json")
+
+    # Summary
+    print()
+    print("=" * 60)
+    print("Done!")
+    print("=" * 60)
+    print(f"  Total images: {stats['total_images']:,}")
+    print(f"  Species: {stats['total_species']}")
+    print(f"  Classes: {stats['total_classes']}")
+    print(f"  Train: {stats['train_images']:,} ({stats['train_pct']}%)")
+    print(f"  Val: {stats['val_images']:,} ({stats['val_pct']}%)")
+    print(f"  Unmatched dirs: {stats['unmatched_dirs']}")
+    print(f"  Train dir: {TRAIN_DIR}")
+    print(f"  Val dir: {VAL_DIR}")
+
+    if stats['unmatched_dirs'] > 0:
+        print(f"\n  ⚠  Manual review needed for {stats['unmatched_dirs']} dirs:")
+        for u in stats['unmatched_dir_names'][:20]:
+            print(f"     {u}")
+
+    return stats
+
+if __name__ == "__main__":
+    main()