# Source: Courtvision-AI/dataset.py (main branch of stonedseeker/Courtvision-AI on GitHub)
"""
dataset.py - Dataset preparation and annotation management

Theory:
- Deep learning needs labeled data (images + bounding boxes)
- YOLO format: <class_id> <x_center> <y_center> <width> <height> (normalized 0-1)
- Data split: 70% train, 20% validation, 10% test
"""

import os
import shutil
import random
from pathlib import Path
import cv2
import yaml

class BadmintonDatasetPrep:
    """
    Prepare a dataset for YOLO training.

    The class discovers raw images, partitions them into train/val/test
    splits, creates the directory layout and writes the YAML config file.

    Expected structure:
    dataset/
    ├── images/
    │   ├── train/
    │   ├── val/
    │   └── test/
    └── labels/
        ├── train/
        ├── val/
        └── test/
    """

    # File suffixes (compared case-insensitively) that split_dataset treats
    # as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, raw_data_path, output_path):
        """
        Args:
            raw_data_path: Directory holding the raw, unsplit images.
            output_path: Root directory of the YOLO dataset to produce.
        """
        self.raw_data_path = Path(raw_data_path)
        self.output_path = Path(output_path)

    def create_directory_structure(self) -> None:
        """Create the images/ and labels/ folders for every split.

        Existing directories are left untouched (``exist_ok=True``).
        """
        splits = ['train', 'val', 'test']
        for split in splits:
            (self.output_path / 'images' / split).mkdir(parents=True, exist_ok=True)
            (self.output_path / 'labels' / split).mkdir(parents=True, exist_ok=True)

    def split_dataset(self, train_ratio=0.7, val_ratio=0.2, seed=None):
        """
        Split the raw images into train/val/test.

        Why these ratios?
        - Train (70%): Need enough data to learn patterns
        - Val (20%): Monitor overfitting during training
        - Test (10%): Final unbiased performance evaluation

        Args:
            train_ratio: Fraction of images assigned to the train split.
            val_ratio: Fraction of images assigned to the val split. The
                remainder goes to the test split.
            seed: Optional seed for a reproducible shuffle. When ``None``
                the module-level ``random`` state is used.

        Returns:
            Dict with keys ``'train'``, ``'val'`` and ``'test'``, each mapping
            to a list of image paths. All lists are empty when the raw data
            directory does not exist or contains no images.

        Raises:
            ValueError: If a ratio is negative or the two ratios sum to more
                than 1.
        """
        # Tolerance for binary floating-point noise in the ratio arithmetic.
        eps = 1e-9

        if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + eps:
            raise ValueError(
                "train_ratio and val_ratio must be non-negative and sum to at most 1"
            )

        if self.raw_data_path.is_dir():
            # Sort first: directory listing order is filesystem-dependent, so
            # a fixed seed would otherwise not give a reproducible split.
            all_images = sorted(
                entry for entry in self.raw_data_path.iterdir()
                if entry.is_file() and entry.suffix.lower() in self.IMAGE_EXTENSIONS
            )
        else:
            all_images = []

        if seed is None:
            random.shuffle(all_images)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(all_images)

        total = len(all_images)
        # 0.7 + 0.2 evaluates to 0.8999999999999999, so plain int() truncation
        # would put the val/test boundary one image too early (10 images
        # -> 7/1/2 instead of 7/2/1). The epsilon absorbs that error.
        train_split = int(total * train_ratio + eps)
        val_split = min(total, int(total * (train_ratio + val_ratio) + eps))

        splits = {
            'train': all_images[:train_split],
            'val': all_images[train_split:val_split],
            'test': all_images[val_split:]
        }

        return splits

    def create_yaml_config(self, dataset_type='player'):
        """
        Create YAML config for YOLO training.

        This file tells YOLO:
        - Where to find data
        - How many classes
        - Class names

        Args:
            dataset_type: ``'player'`` selects the player class; any other
                value is treated as the shuttle dataset. The value is also
                used as the prefix of the generated file name.

        Returns:
            Path of the written ``<dataset_type>_data.yaml`` file.
        """
        if dataset_type == 'player':
            class_names = ['player']
        else:  # shuttle
            class_names = ['shuttle']

        config = {
            'path': str(self.output_path.absolute()),
            'train': 'images/train',
            'val': 'images/val',
            'test': 'images/test',
            'nc': len(class_names),  # number of classes
            'names': class_names
        }

        # The config may be written before create_directory_structure() has
        # run, so make sure the dataset root exists.
        self.output_path.mkdir(parents=True, exist_ok=True)

        yaml_path = self.output_path / f'{dataset_type}_data.yaml'
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(config, f, sort_keys=False)

        print(f"Created config: {yaml_path}")
        return yaml_path

# Example usage: build the empty YOLO folder layout for both datasets.
if __name__ == "__main__":
    # One preparer per detection target (players, then shuttles).
    player_prep = BadmintonDatasetPrep('raw_data/players', 'datasets/player_dataset')
    shuttle_prep = BadmintonDatasetPrep('raw_data/shuttles', 'datasets/shuttle_dataset')

    # Create images/ and labels/ split folders for each dataset in turn.
    for prep in (player_prep, shuttle_prep):
        prep.create_directory_structure()