Add:
1. New classes txt file;
2. New api_example.ipynb;

Fix:
1. Fixed errors when using the val folder instead of valid;
2. Fixed errors when creating a config for the SSD model that caused the gradient to go to zero;
3. Added automatic resizing of images and annotations for SSD 300;
4. Optimized the code for obtaining dataset paths: you can now specify either the dataset name or a path to it, and if the dataset is not in user_datasets it will be copied there (see the sketch after this message);
5. Fixed errors during dataset splitting: the test sample size no longer has to be specified; it is assembled from the remaining images and annotations;
6. Fixed some bugs with paths.

Delete:
1. Folder yolo in user_datasets;
2. File prepare_train.
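
The path handling described in Fix item 4 can be sketched as follows. This is not the repository's actual helper: the function name resolve_dataset and the exact lookup rules are assumptions made for illustration only; the behaviour (accept a dataset name or a full path, and copy anything living outside user_datasets into it) follows the description above.

import shutil
from pathlib import Path

def resolve_dataset(dataset: str, project_root: Path) -> Path:
    # Hypothetical sketch: return the dataset folder inside user_datasets,
    # copying it there first if it currently lives somewhere else.
    user_datasets = project_root / "user_datasets"
    user_datasets.mkdir(exist_ok=True)

    candidate = Path(dataset)
    if not candidate.is_absolute():
        # A bare name is interpreted as a folder already inside user_datasets.
        candidate = user_datasets / dataset

    if candidate.parent == user_datasets:
        return candidate

    # The dataset lives elsewhere: copy it into user_datasets under its own name.
    target = user_datasets / candidate.name
    if not target.exists():
        shutil.copytree(candidate, target)
    return target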
orekhovskiy committed Nov 28, 2023
1 parent 23d06c0 commit 91a8e06
Showing 14 changed files with 1,384 additions and 2,042 deletions.
16 changes: 8 additions & 8 deletions ODRS/data_utils/convert_yolo_to_voc.py
@@ -2,19 +2,20 @@
import re
from pathlib import Path
import shutil
from loguru import logger
from PIL import Image
from tqdm import tqdm
from ODRS.data_utils.prepare_ssd import create_ssd_json


def convert_voc(data_path, txt_path):
print("Creating VOC format for dataset")
for i in ['train', 'test', 'valid']:
convert_yolo_to_voc(f'{data_path}/{i}', txt_path, 'annotations')
shutil.rmtree(f'{data_path}/{i}/labels')
create_ssd_json(f'{data_path}/{i}', txt_path)
# except:
# continue
logger.info("Creating VOC format for dataset")
path = Path(data_path)
folder_names = [folder.name for folder in path.iterdir() if folder.is_dir()]
for name in folder_names:
convert_yolo_to_voc(Path(data_path) / name, txt_path, 'annotations')
shutil.rmtree(Path(data_path) / name / 'labels')
create_ssd_json(Path(data_path) / name, txt_path)


def copy_files_to_jpeg_images_folder(data_path):
@@ -26,7 +27,6 @@ def copy_files_to_jpeg_images_folder(data_path):
file_path = os.path.join(subfolder_path, file_name)
if os.path.isfile(file_path):
shutil.copy(file_path, jpeg_images_folder)

return jpeg_images_folder


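For reference, the coordinate conversion behind convert_yolo_to_voc.py can be reduced to the following sketch: YOLO labels store class, x_center, y_center, width, height normalised to [0, 1], while VOC annotations store absolute xmin, ymin, xmax, ymax pixel corners. The helper name yolo_box_to_voc is hypothetical and only illustrates the box arithmetic, not the repository's XML output.

def yolo_box_to_voc(line: str, img_w: int, img_h: int):
    # Convert one YOLO label line ("class xc yc w h", normalised) into
    # VOC-style absolute pixel corners. Illustrative sketch only.
    cls, xc, yc, w, h = line.split()
    xc, w = float(xc) * img_w, float(w) * img_w
    yc, h = float(yc) * img_h, float(h) * img_h
    xmin, xmax = round(xc - w / 2), round(xc + w / 2)
    ymin, ymax = round(yc - h / 2), round(yc + h / 2)
    return int(cls), xmin, ymin, xmax, ymax

# Example: a centred box covering half of a 640x480 image in each dimension:
# yolo_box_to_voc("0 0.5 0.5 0.5 0.5", 640, 480) -> (0, 160, 120, 480, 360)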
32 changes: 14 additions & 18 deletions ODRS/data_utils/create_config.py
@@ -22,52 +22,48 @@ def delete_cache(data_path):


def create_config_data(train_path, val_path, classname_file, config_path, arch, batch_size, epochs, model):
# Get current file path
current_file_path = Path(__file__).resolve()

# Create runs directory if it does not exist
runs_directory = f"{current_file_path.parents[2]}/runs"
runs_directory = Path(current_file_path.parents[2]) / 'runs'
if not os.path.exists(runs_directory):
os.makedirs(runs_directory, exist_ok=True)

# Create runs path
runs_path = f"{runs_directory}/{str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))}_{model}"
os.makedirs(f"{runs_path}", exist_ok=True)
class_file_path = f"{current_file_path.parents[2]}/{classname_file}"
runs_path = runs_directory / f"{str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))}_{model}"
os.makedirs(runs_path, exist_ok=True)
class_file_path = Path(current_file_path.parents[2]) / classname_file

# Create config path
config_path = f"{runs_path}/{config_path}"
config_path = runs_path / config_path
if arch == 'ssd':
class_names = read_names_from_txt(class_file_path)
dataset_yaml = '''\
# Data
train_json: {}
val_json: {}
class_names: {}
recall_steps: 101
recall_steps: 11
image_mean: [123., 117., 104.]
image_stddev: [1., 1, 1.]
# Model
model: SSD
backbone:
name: VGG16
num_stages: 7
input_size: 512
anchor_scales: [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9]
anchor_aspect_ratios: [[1, 2], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2], [1, 2]]
num_stages: 6
input_size: 300
anchor_scales: [0.1, 0.2, 0.375, 0.55, 0.725, 0.9]
anchor_aspect_ratios: [[1, 2], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2], [1, 2]]
# Training
batch_size: {}
epochs: {}
optim:
name: SGD
lr: 0.001
lr: 0.0001
momentum: 0.9
weight_decay: 0.0005
scheduler:
name: MultiStepLR
milestones: [90, 110]
milestones: [155, 195]
gamma: 0.1
'''.format(train_path, val_path, class_names, batch_size, epochs)
logger.info("Create config file")
@@ -98,8 +94,8 @@ def create_config_data(train_path, val_path, classname_file, config_path, arch,
# Whether to save the predictions of the validation set while training.
SAVE_VALID_PREDICTION_IMAGES: True
'''.format(f'{train_path}/images', f'{train_path}/annotations', f'{val_path}/images',
f'{val_path}/annotations', class_names, len(class_names))
'''.format(train_path / 'images', train_path / 'annotations', val_path / 'images',
val_path / 'annotations', class_names, len(class_names))
logger.info("Create config file")
with open(config_path, 'w') as file:
file.write(dataset_yaml)
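A note on the SSD 300 values in the config above: apart from the fixed 0.1 scale of the first feature map, the new anchor_scales appear to follow the linear scale rule from the SSD paper, s_k = s_min + (s_max - s_min) * (k - 1) / (m - 1) with s_min = 0.2, s_max = 0.9 and m = 5 additional feature maps. A quick stand-alone check (plain Python, not repository code):

# Reproduce the anchor scales from the SSD 300 config above.
# k runs from 0 here, i.e. the five feature maps after the first one.
s_min, s_max, m = 0.2, 0.9, 5
scales = [0.1] + [round(s_min + (s_max - s_min) * k / (m - 1), 3) for k in range(m)]
print(scales)  # [0.1, 0.2, 0.375, 0.55, 0.725, 0.9]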
84 changes: 70 additions & 14 deletions ODRS/data_utils/prepare_ssd.py
@@ -1,5 +1,7 @@
import os
import json
import glob
from PIL import Image
import xml.etree.ElementTree as ET
from tqdm import tqdm
from pathlib import Path
@@ -62,28 +64,82 @@ def get_image_names(folder_path):

def create_ssd_json(path_folder, txt_path):
current_file_path = Path(__file__).resolve()
txt_path = f"{current_file_path.parents[2]}/{txt_path}"
txt_path = Path(current_file_path.parents[2]) / txt_path
class_names = read_names_from_txt(txt_path)

paths = {
2007: os.path.join(os.path.dirname(path_folder), path_folder.split('/')[-1])
2007: os.path.join(os.path.dirname(path_folder), path_folder)
}

dataset = []
for year, path in paths.items():
ids = get_image_names(f'{path_folder}/images')
ids = get_image_names(Path(path_folder) / 'images')
for id in tqdm(ids):
image_path = os.path.join(path, 'images', id + '.jpg')
annotation_path = os.path.join(path, 'annotations', id + '.xml')
if check_filename(annotation_path):
boxes, classes, difficulties = parse_annotation(annotation_path)
classes = [class_names.index(c) for c in classes]
dataset.append(
{
'image': os.path.abspath(image_path),
'boxes': boxes,
'classes': classes,
'difficulties': difficulties
}
)
save_as_json(f'{os.path.dirname(path_folder)}/{path_folder.split("/")[-1]}.json', dataset)
try:
boxes, classes, difficulties = parse_annotation(annotation_path)
classes = [class_names.index(c) for c in classes]
dataset.append(
{
'image': os.path.abspath(image_path),
'boxes': boxes,
'classes': classes,
'difficulties': difficulties
}
)
except Exception as e:
print(e)

save_as_json(Path(os.path.dirname(path_folder)) / f'{path_folder.name}.json', dataset)



def resize_images_and_annotations(data_path, img_size):
size = img_size if img_size <= 300 else 300
path = Path(data_path)
folder_names = [folder.name for folder in path.iterdir() if folder.is_dir()]
for name in folder_names:
folder_path = path / name
images_path = os.path.join(folder_path, 'images')
labels_path = os.path.join(folder_path, 'labels')

for image_name in tqdm(os.listdir(images_path), desc=f'Resize {name} images'):
image_path = os.path.join(images_path, image_name)
label_path = os.path.join(labels_path, image_name.replace('.jpg', '.txt'))

with Image.open(image_path) as img:
original_width, original_height = img.size

if original_width > size or original_height > size:
img = img.resize((size, size))

if os.path.exists(label_path):
with open(label_path, 'r') as file:
lines = file.readlines()

with open(label_path, 'w') as file:
for line in lines:
parts = line.split()
if len(parts) == 5:
x_center = float(parts[1]) * original_width
y_center = float(parts[2]) * original_height
width = float(parts[3]) * original_width
height = float(parts[4]) * original_height

x_center *= size / original_width
y_center *= size / original_height
width *= size / original_width
height *= size / original_height

x_center /= size
y_center /= size
width /= size
height /= size

file.write(f"{parts[0]} {x_center} {y_center} {width} {height}\n")

img.save(image_path)

# resize_images_and_annotations('/media/space/ssd_1_tb_evo_sumsung/ITMO/ODRS/user_datasets/Warp-D_voc/test')
34 changes: 0 additions & 34 deletions ODRS/data_utils/prepare_train.py

This file was deleted.

73 changes: 43 additions & 30 deletions ODRS/data_utils/split_dataset.py
@@ -1,19 +1,30 @@
import os
import shutil
import glob
import sys
from tqdm import tqdm
from loguru import logger


def split_data(datapath, split_train_value, split_val_value, split_test_value):
def split_data(datapath, split_train_value, split_valid_value):
selected_folders = ['test', 'train', 'valid']
selected_files = ['classes.txt']

train_path = os.path.join(datapath, 'train')
test_path = os.path.join(datapath, 'test')
val_path = os.path.join(datapath, 'valid')

if os.path.exists(train_path) and os.path.exists(test_path) and (os.path.exists(val_path)
or os.path.exists(os.path.join(datapath, 'valid'))):
return "Dataset is ready"
if os.path.exists(train_path) and (os.path.exists(val_path)
or os.path.exists(os.path.join(datapath, 'val'))):
logger.info("Dataset is ready")
return train_path, val_path if os.path.exists(val_path) else os.path.join(datapath, 'val')
if os.path.exists(train_path) and not (os.path.exists(val_path)
or os.path.exists(os.path.join(datapath, 'val'))):
logger.error("Dataset has no validation sample")
sys.exit()
if not os.path.exists(train_path) and (os.path.exists(val_path)
or os.path.exists(os.path.join(datapath, 'val'))):
logger.error("Dataset has no training sample")
sys.exit()


images_path = os.path.join(datapath, 'images')
labels_path = os.path.join(datapath, 'labels')
@@ -29,28 +40,34 @@ def split_data(datapath, split_train_value, split_val_value, split_test_value):
glob.glob(os.path.join(datapath, '*.png'))
label_files = glob.glob(os.path.join(datapath, '*.txt'))

image_files.sort()
label_files.sort()

total_files = len(image_files) + len(label_files)

if total_files == 0:
print("Error: No image or label files found in the datapath.")
return
logger.error("Error: No image or label files found in the datapath.")

train_split = int(len(image_files) * split_train_value)
val_split = int(len(image_files) * split_val_value)
val_split = int(len(image_files) * split_valid_value)

print(f'Len_images_files:{len(image_files)}')
logger.info(f'Total number of images:{len(image_files)}')
logger.info(f'Total number of labels:{len(label_files)}')

train_images = image_files[:train_split]
train_labels = label_files[:train_split]
print(f'train_images:{len(train_images)}')
logger.info(f'Number train images:{len(train_images)}')
logger.info(f'Number train labels:{len(train_labels)}')

val_images = image_files[train_split:train_split+val_split]
val_labels = label_files[train_split:train_split+val_split]
print(f'val_labels:{len(val_labels)}')
logger.info(f'Number valid images:{len(val_images)}')
logger.info(f'Number valid labels:{len(val_labels)}')

test_images = image_files[train_split+val_split:]
test_labels = label_files[train_split+val_split:]
print(f'test_labels:{len(test_labels)}')
logger.info(f'Number test images:{len(test_images)}')
logger.info(f'Number test labels:{len(test_labels)}')

for path in [train_path, test_path, val_path]:
if not os.path.exists(path):
@@ -60,44 +77,40 @@ def split_data(datapath, split_train_value, split_val_value, split_test_value):
os.makedirs(images_subpath)
os.makedirs(labels_subpath)

for image_file in train_images:
for image_file in tqdm(train_images, desc="Train images"):
shutil.copy(image_file, os.path.join(train_path, 'images', os.path.basename(image_file)))
for image_file in val_images:
for image_file in tqdm(val_images, desc="Valid images"):
shutil.copy(image_file, os.path.join(val_path, 'images', os.path.basename(image_file)))
for image_file in test_images:
for image_file in tqdm(test_images, desc="Test images"):
shutil.copy(image_file, os.path.join(test_path, 'images', os.path.basename(image_file)))

for label_file in train_labels:
for label_file in tqdm(train_labels, desc="Train labels"):
shutil.copy(label_file, os.path.join(train_path, 'labels', os.path.basename(label_file)))
for label_file in val_labels:
for label_file in tqdm(val_labels, desc="Valid labels"):
shutil.copy(label_file, os.path.join(val_path, 'labels', os.path.basename(label_file)))
for label_file in test_labels:
for label_file in tqdm(test_labels, desc="Test labels"):
shutil.copy(label_file, os.path.join(test_path, 'labels', os.path.basename(label_file)))

for root, dirs, files in os.walk(datapath, topdown=False):
for name in files:
file_path = os.path.join(root, name)
if name not in selected_files and file_path.split('/')[-3] not in selected_folders:
if file_path.split('/')[-3] not in selected_folders:
os.remove(file_path)

for name in dirs:
dir_path = os.path.join(root, name)
if name not in selected_folders and dir_path.split('/')[-2] not in selected_folders:
shutil.rmtree(dir_path)

return "Dataset was split"
logger.info("Dataset was split")
return train_path, val_path


def remove_folder(path):
shutil.rmtree(path)


def copy_arch_folder(dataset_path):
folder_name = dataset_path.split('/')[-1]
dataset_path = os.path.dirname(dataset_path)
voc_path = os.path.join(os.path.dirname(dataset_path), "voc")
dataset_folder = dataset_path.parent
dataset_name = f'{dataset_path.name}_voc'
voc_path = os.path.join(dataset_folder, dataset_name)
yolo_path = os.path.join(dataset_path)
if os.path.exists(voc_path):
remove_folder(voc_path)
shutil.copytree(yolo_path, voc_path)
return f'{voc_path}/{folder_name}'
return voc_path
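
With the new signature split_data(datapath, split_train_value, split_valid_value), the test sample is simply whatever remains after the train and validation slices are taken, as described in Fix item 5 of the commit message. A small stand-alone illustration of that arithmetic (the 0.8/0.1 fractions are example values, not repository defaults):

# 1000 placeholder file names; 80% train, 10% valid, test takes the remainder.
image_files = [f"img_{i:04d}.jpg" for i in range(1000)]
split_train_value, split_valid_value = 0.8, 0.1

train_split = int(len(image_files) * split_train_value)   # 800
val_split = int(len(image_files) * split_valid_value)     # 100

train_images = image_files[:train_split]
val_images = image_files[train_split:train_split + val_split]
test_images = image_files[train_split + val_split:]       # remaining 100

print(len(train_images), len(val_images), len(test_images))  # 800 100 100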