-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataset.py
More file actions
113 lines (94 loc) · 3.34 KB
/
dataset.py
File metadata and controls
113 lines (94 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
dataset.py - Dataset preparation and annotation management
Theory:
- Deep learning needs labeled data (images + bounding boxes)
- YOLO format: <class_id> <x_center> <y_center> <width> <height> (normalized 0-1)
- Data split: 70% train, 20% validation, 10% test
"""
import os
import shutil
import random
from pathlib import Path
import cv2
import yaml
class BadmintonDatasetPrep:
    """
    Prepare a dataset directory for YOLO training.

    Expected structure under ``output_path``:
        dataset/
        ├── images/
        │   ├── train/
        │   ├── val/
        │   └── test/
        └── labels/
            ├── train/
            ├── val/
            └── test/
    """

    # Sub-directories created under both images/ and labels/.
    _SPLIT_NAMES = ('train', 'val', 'test')

    # Absorbs binary floating-point noise before truncating to an index,
    # e.g. 10 * (0.7 + 0.2) evaluates to 8.999999999999998, not 9.
    _RATIO_EPSILON = 1e-9

    def __init__(self, raw_data_path, output_path):
        """
        Args:
            raw_data_path: Directory that holds the raw ``.jpg`` / ``.png`` images.
            output_path: Root directory of the YOLO dataset to create.
        """
        self.raw_data_path = Path(raw_data_path)
        self.output_path = Path(output_path)

    def create_directory_structure(self):
        """Create the images/ and labels/ folders for every split.

        Safe to call repeatedly: existing directories are left untouched.
        """
        for kind in ('images', 'labels'):
            for split in self._SPLIT_NAMES:
                (self.output_path / kind / split).mkdir(parents=True, exist_ok=True)

    @classmethod
    def _cut_index(cls, total, ratio):
        """Return ``floor(total * ratio)``, tolerant of float rounding noise."""
        return int(total * ratio + cls._RATIO_EPSILON)

    def split_dataset(self, train_ratio=0.7, val_ratio=0.2, seed=None):
        """
        Split the raw images into train/val/test lists.

        Why these ratios?
        - Train (70%): Need enough data to learn patterns
        - Val (20%): Monitor overfitting during training
        - Test (10%): Final unbiased performance evaluation

        Only the file lists are computed; no files are copied or moved.

        Args:
            train_ratio: Fraction of images assigned to the training split.
            val_ratio: Fraction of images assigned to the validation split.
                Whatever remains after train + val goes to the test split.
            seed: Optional seed for a reproducible shuffle. When ``None`` the
                module-level ``random`` generator is used.

        Returns:
            dict mapping ``'train'``, ``'val'`` and ``'test'`` to lists of
            ``Path`` objects. The three lists are disjoint and together
            contain every ``*.jpg`` / ``*.png`` file found directly inside
            ``raw_data_path``.

        Raises:
            ValueError: If a ratio is negative or the ratios sum to more than 1.
        """
        if train_ratio < 0 or val_ratio < 0:
            raise ValueError("train_ratio and val_ratio must be non-negative")
        if train_ratio + val_ratio > 1 + self._RATIO_EPSILON:
            raise ValueError("train_ratio + val_ratio must not exceed 1")

        # glob() order depends on the filesystem; sort first so that a given
        # seed always yields the same split.
        all_images = sorted(
            list(self.raw_data_path.glob('*.jpg'))
            + list(self.raw_data_path.glob('*.png'))
        )
        if seed is None:
            random.shuffle(all_images)
        else:
            random.Random(seed).shuffle(all_images)

        total = len(all_images)
        train_end = self._cut_index(total, train_ratio)
        # Cumulative boundary; without the epsilon the default 0.7 + 0.2
        # truncates one image short and moves it from val to test.
        val_end = self._cut_index(total, train_ratio + val_ratio)

        return {
            'train': all_images[:train_end],
            'val': all_images[train_end:val_end],
            'test': all_images[val_end:],
        }

    def create_yaml_config(self, dataset_type='player'):
        """
        Create the YAML config for YOLO training.

        This file tells YOLO:
        - Where to find data
        - How many classes
        - Class names

        Args:
            dataset_type: ``'player'`` selects the single class ``player``;
                any other value selects the single class ``shuttle``. The
                value is also used as the prefix of the file name.

        Returns:
            Path of the written file, ``<output_path>/<dataset_type>_data.yaml``.
        """
        if dataset_type == 'player':
            class_names = ['player']
        else:  # shuttle
            class_names = ['shuttle']

        config = {
            'path': str(self.output_path.absolute()),
            'train': 'images/train',
            'val': 'images/val',
            'test': 'images/test',
            'nc': len(class_names),  # number of classes
            'names': class_names,
        }

        # The config may be requested before create_directory_structure() ran.
        self.output_path.mkdir(parents=True, exist_ok=True)
        yaml_path = self.output_path / f'{dataset_type}_data.yaml'
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(config, f, sort_keys=False)

        print(f"Created config: {yaml_path}")
        return yaml_path
# Example usage
if __name__ == "__main__":
    # (raw images directory, YOLO dataset output directory), one pair per
    # detector: players first, then shuttles.
    dataset_locations = (
        ('raw_data/players', 'datasets/player_dataset'),
        ('raw_data/shuttles', 'datasets/shuttle_dataset'),
    )
    for raw_dir, dataset_dir in dataset_locations:
        prep = BadmintonDatasetPrep(raw_data_path=raw_dir, output_path=dataset_dir)
        prep.create_directory_structure()