hdf5datasetgenerator.py
"""
" License:
" -----------------------------------------------------------------------------
" Copyright (c) 2018, Ratnajit Mukherjee.
" All rights reserved.
"
" Redistribution and use in source and binary forms, with or without
" modification, are permitted provided that the following conditions are met:
"
" 1. Redistributions of source code must retain the above copyright notice,
" this list of conditions and the following disclaimer.
"
" 2. Redistributions in binary form must reproduce the above copyright notice,
" this list of conditions and the following disclaimer in the documentation
" and/or other materials provided with the distribution.
"
" 3. Neither the name of the copyright holder nor the names of its contributors
" may be used to endorse or promote products derived from this software
" without specific prior written permission.
"
" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
" ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
" POSSIBILITY OF SUCH DAMAGE.
" -----------------------------------------------------------------------------
"
" Description: Originally written to convert Kaggle datasets in CSV format to HDF5 files
" for training, validation and test sets by Arian Rosebrock
" Creation date: June 2017
" URL: https://www.pyimagesearch.com/
"
" Integrated into an emotion detection application by:
" Author: Ratnajit Mukherjee, [email protected]
" Date: July 2018
"""
# import the necessary packages
from keras.utils import np_utils
import numpy as np
import h5py


class HDF5DatasetGenerator:
    def __init__(self, dbPath, batchSize, preprocessors=None, aug=None, binarize=True, classes=2):
        # store the batch size, preprocessors, and data augmentor,
        # whether or not the labels should be binarized, along with
        # the total number of classes
        self.batchSize = batchSize
        self.preprocessors = preprocessors
        self.aug = aug
        self.binarize = binarize
        self.classes = classes

        # open the HDF5 database for reading and determine the total
        # number of entries in the database
        self.db = h5py.File(dbPath, "r")
        self.numImages = self.db["labels"].shape[0]

    def generator(self, passes=np.inf):
        # initialize the epoch count
        epochs = 0

        # keep looping infinitely -- the model will stop once we have
        # reached the desired number of epochs
        while epochs < passes:
            # loop over the HDF5 dataset in batch-sized chunks
            for i in np.arange(0, self.numImages, self.batchSize):
                # extract the images and labels from the HDF5 dataset
                images = self.db["images"][i: i + self.batchSize]
                labels = self.db["labels"][i: i + self.batchSize]

                # check to see if the labels should be binarized (one-hot encoded)
                if self.binarize:
                    labels = np_utils.to_categorical(labels, self.classes)

                # check to see if our preprocessors are not None
                if self.preprocessors is not None:
                    # initialize the list of processed images
                    procImages = []

                    # loop over the images
                    for image in images:
                        # loop over the preprocessors and apply each
                        # to the image
                        for p in self.preprocessors:
                            image = p.preprocess(image)

                        # update the list of processed images
                        procImages.append(image)

                    # update the images array to be the processed images
                    images = np.array(procImages)

                # if the data augmentor exists, apply it
                if self.aug is not None:
                    (images, labels) = next(self.aug.flow(images, labels,
                                                          batch_size=self.batchSize))

                # yield a tuple of images and labels
                yield (images, labels)

            # increment the total number of epochs
            epochs += 1

    def close(self):
        # close the database
        self.db.close()
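

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The file names,
# batch size, class count and FloatPreprocessor below are hypothetical and
# only illustrate how this generator is typically wired into Keras'
# fit_generator(); swap in the real HDF5 files and model of the application.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from keras.preprocessing.image import ImageDataGenerator

    class FloatPreprocessor:
        # hypothetical preprocessor showing the p.preprocess(image) interface
        # the generator expects from every entry in `preprocessors`
        def preprocess(self, image):
            return image.astype("float32")

    # hypothetical HDF5 files produced by a separate dataset-building step
    TRAIN_HDF5 = "train.hdf5"
    VAL_HDF5 = "val.hdf5"
    BATCH_SIZE = 64
    NUM_CLASSES = 7

    # optional on-the-fly augmentation, applied to the training set only
    aug = ImageDataGenerator(rotation_range=10, zoom_range=0.1,
                             horizontal_flip=True, rescale=1 / 255.0)

    trainGen = HDF5DatasetGenerator(TRAIN_HDF5, BATCH_SIZE, aug=aug,
                                    preprocessors=[FloatPreprocessor()],
                                    classes=NUM_CLASSES)
    valGen = HDF5DatasetGenerator(VAL_HDF5, BATCH_SIZE,
                                  preprocessors=[FloatPreprocessor()],
                                  classes=NUM_CLASSES)

    # 'model' would be a compiled Keras model defined elsewhere; the call is
    # left commented out because no model is built in this sketch.
    # model.fit_generator(
    #     trainGen.generator(),
    #     steps_per_epoch=trainGen.numImages // BATCH_SIZE,
    #     validation_data=valGen.generator(),
    #     validation_steps=valGen.numImages // BATCH_SIZE,
    #     epochs=15)

    trainGen.close()
    valGen.close()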