data_pipeline.py
# This script creates training, validation and test datasets as TFRecord files.
# Thus, a continuous connection to the location of the images and labels is not
# necessary. The data is scaled between 0 and 1 by fitting a normalizer on the
# training data and then applying it to the validation and test data.
#
# The variable "path" has to be specified before using this script.
#
# author: Emilio Rivera
import os
import json
import sys
import joblib
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
# Replace with the path to the directory that contains the folders of cropped
# and augmented images
path = "C:\\Users\\uffie\\bwSyncAndShare\\Bachelorarbeit-master"
img_type = "CLAHE"
rnd_seed = 42
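# Optional sketch: the path could also be passed on the command line instead of
# being edited in place (assumes invocation as "python data_pipeline.py <path>").
if len(sys.argv) > 1:
    path = sys.argv[1]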
labels = []
images = []
"""
Data from augmented imgs
"""
# Open the labels of the augmented images
with open(os.path.join(path, "Augmented180", "labels_augmented_imgs.json")) as f:
    json_file = json.load(f)
for folder in json_file:
    for img in json_file[folder]:
        if json_file[folder][img] is not None:
            labels.append(json_file[folder][img][-1])
# Only load images that have a label
for subdir, dirs, files in os.walk(os.path.join(path, "Augmented180", img_type)):
    for file in files:
        folder = os.path.basename(subdir)
        if file.endswith(".png") and file in json_file[folder] and json_file[folder][file] is not None:
            images.append(os.path.join(subdir, file))
"""
Data from normal imgs
"""
# Open the labels of the normal images
with open(os.path.join(path, "Cropped", "combined_labels.json")) as f:
    json_file = json.load(f)
for folder in json_file:
    if "gerissen" in folder or "licht" in folder:
        continue
    for img in json_file[folder]:
        if json_file[folder][img] is not None:
            labels.append(json_file[folder][img][-1])
# Only load images that have a label
for subdir, dirs, files in os.walk(os.path.join(path, "Cropped", img_type)):
    if "gerissen" in subdir or "licht" in subdir or "weiß" in subdir or "papier" in subdir:
        continue
    for file in files:
        folder = os.path.basename(subdir)
        if file.endswith(".png") and file in json_file[folder] and json_file[folder][file] is not None:
            images.append(os.path.join(subdir, file))
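# Sanity check: labels come from the JSON files and image paths from os.walk,
# collected independently, so zip() below would silently truncate on a length
# mismatch. This assumes every collected image has exactly one label.
assert len(images) == len(labels), f"{len(images)} images vs. {len(labels)} labels"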
def _bytes_feature(value):
    # Wraps a byte string (e.g. an encoded image) as a tf.train feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _float_feature(value):
    # Wraps a single float (the label) as a tf.train feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
def create_data_record(out_filename, paths, pick_lengths):
    """
    Creates and stores a TFRecord file.

    Args:
        out_filename: The specified name for the file.
        paths: Paths to the images that will be stored in the file.
        pick_lengths: Labels belonging to the images that will be stored in the file.

    Writes a TFRecord file containing the images and labels as a sequence of
    binary records; the function itself returns nothing.
    """
    writer = tf.io.TFRecordWriter(out_filename)
    for i in range(len(paths)):
        # Report progress every 500 images
        if not i % 500:
            print(f"{out_filename}: {i} / {len(paths)}")
            sys.stdout.flush()
        with open(paths[i], "rb") as img_file:
            image_string = img_file.read()
        label = pick_lengths[i]
        feature = {
            "img": _bytes_feature(image_string),
            "label": _float_feature(label)
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
    writer.close()
    sys.stdout.flush()
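# A minimal sketch of how the records written above could be read back with
# tf.data. The feature spec mirrors the "img" and "label" keys used in
# create_data_record; decoding as a single-channel PNG is an assumption about
# the image format.
def _parse_example(serialized):
    feature_spec = {
        "img": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.float32),
    }
    parsed = tf.io.parse_single_example(serialized, feature_spec)
    image = tf.io.decode_png(parsed["img"], channels=1)  # assumed grayscale
    return image, parsed["label"]
# Example usage: tf.data.TFRecordDataset("train.tfrecords").map(_parse_example)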
# Zip and shuffle the data
all_data = list(zip(images, labels))
np.random.seed(rnd_seed)
np.random.shuffle(all_data)
images, labels = zip(*all_data)
normalizer = MinMaxScaler()
# Training dataset containing 70% of the data
train_imgs = images[:int(0.7 * len(images))]
train_labels = np.array(labels[:int(0.7 * len(labels))]).reshape(-1, 1)
# Fit the scaler on the training labels only; val/test reuse the same fit
train_labels = normalizer.fit_transform(train_labels).ravel().tolist()
print("train", len(train_imgs), len(train_labels))
# Validation dataset containing 15% of the data
val_imgs = images[int(0.7 * len(images)):int(0.85 * len(images))]
val_labels = np.array(labels[int(0.7 * len(labels)):int(0.85 * len(labels))]).reshape(-1, 1)
val_labels = normalizer.transform(val_labels).ravel().tolist()
print("val", len(val_imgs), len(val_labels))
# Test dataset containing the remaining 15% of the data
test_imgs = images[int(0.85 * len(images)):]
test_labels = np.array(labels[int(0.85 * len(labels)):]).reshape(-1, 1)
test_labels = normalizer.transform(test_labels).ravel().tolist()
print("test", len(test_imgs), len(test_labels))
create_data_record("train.tfrecords", train_imgs, train_labels)
create_data_record("val.tfrecords", val_imgs, val_labels)
create_data_record("test.tfrecords", test_imgs, test_labels)
# Save normalizer
joblib.dump(normalizer, "normalizer.save")
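# Usage sketch for later inference (hypothetical array "preds_scaled" of model
# outputs): the saved scaler maps normalized predictions back to the original
# label units.
#
#   normalizer = joblib.load("normalizer.save")
#   preds = normalizer.inverse_transform(np.asarray(preds_scaled).reshape(-1, 1))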