-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_load.py
131 lines (113 loc) · 4.89 KB
/
data_load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=g-bad-import-order
"""Load data from the specified paths and format them for training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import numpy as np
import tensorflow as tf
from data_augmentation import augment_data
# Keys of the per-line JSON records read by DataLoader.get_data_file.
LABEL_NAME = "gesture"
DATA_NAME = "accel_ms2_xyz"


class DataLoader(object):
  """Loads data and prepares for training.

  Attributes:
    dim: Number of values per time step (fixed to 3).
    seq_length: Number of time steps every sample is padded/cropped to.
    label2id: Dict mapping each label name to an integer class id. The
      entries of `folders_name` get ids 0..N-1 in order and "negative"
      gets id N.
    train_data, valid_data, test_data: Lists of raw samples after
      construction; replaced by `tf.data.Dataset` objects by `format()`.
    train_label, valid_label, test_label: Lists of label names.
    train_len, valid_len, test_len: Sample counts; multiplied by the number
      of padded variants per sample by `format()`.
  """

  def __init__(self, train_data_path, valid_data_path, test_data_path,
               folders_name, seq_length):
    """Builds the label map and reads the three data files.

    Args:
      train_data_path: Path of the training JSON-lines file.
      valid_data_path: Path of the validation JSON-lines file.
      test_data_path: Path of the test JSON-lines file.
      folders_name: Iterable of label names; position defines the class id.
      seq_length: Target number of time steps per sample.
    """
    self.dim = 3
    self.seq_length = seq_length
    # Materialize once so the length is known even for an empty input (the
    # "negative" id must not depend on a loop variable that may never be set).
    folders_name = list(folders_name)
    self.label2id = {label: idx for idx, label in enumerate(folders_name)}
    self.label2id["negative"] = len(folders_name)
    print(self.label2id)
    self.train_data, self.train_label, self.train_len = self.get_data_file(
        train_data_path, "train")
    self.valid_data, self.valid_label, self.valid_len = self.get_data_file(
        valid_data_path, "valid")
    self.test_data, self.test_label, self.test_len = self.get_data_file(
        test_data_path, "test")

  def get_data_file(self, data_path, data_type):
    """Get train, valid and test data from files.

    Each non-blank line of the file must be a JSON object holding the
    `DATA_NAME` and `LABEL_NAME` keys.

    Args:
      data_path: Path of the JSON-lines file to read.
      data_type: Name of the split ("train", "valid" or "test"); only used
        in the printed summary.

    Returns:
      A tuple `(data, label, length)`: the list of samples, the list of label
      names, and the number of records read.

    Raises:
      KeyError: If a record lacks one of the expected keys.
      ValueError: If a non-blank line is not valid JSON.
    """
    data = []
    label = []
    with open(data_path, "r") as f:
      for line in f:
        # Blank lines (e.g. a trailing newline) carry no record.
        if not line.strip():
          continue
        dic = json.loads(line)
        data.append(dic[DATA_NAME])
        label.append(dic[LABEL_NAME])
    # NOTE: augmentation of the training split (augment_data) is disabled.
    length = len(label)
    print(data_type + " data length:" + str(length))
    return data, label, length

  def pad(self, data, seq_length, dim):
    """Get neighbour padding.

    Produces two fixed-length variants of `data`. Missing time steps are
    filled with uniform noise (amplitude +/- 10) centred on the neighbouring
    original sample.

    With x = noise and o = original data, for len(data) < seq_length:
      variant 0 ("before" padding): [x, x, ..., x, o, o, ..., o]
      variant 1 ("after" padding):  [o, o, ..., o, x, x, ..., x]

    For len(data) >= seq_length nothing is padded; the data is cropped:
      variant 0 keeps the first `seq_length` steps,
      variant 1 keeps the last `seq_length` steps.

    Args:
      data: Non-empty sequence of time steps, each with `dim` values.
      seq_length: Number of time steps in each returned array.
      dim: Number of values per time step.

    Returns:
      A list of two numpy arrays of shape `(seq_length, dim)`.

    Raises:
      IndexError: If `data` is empty.
    """
    noise_level = 20
    kept = min(len(data), seq_length)

    # Before-neighbour padding: noise around the first step, data at the end.
    before = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[0]
    before[seq_length - kept:] = data[:kept]

    # After-neighbour padding: data at the start, noise around the last step.
    after = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[-1]
    if len(data) < seq_length:
      after[:kept] = data[:kept]
    else:
      # Too long: keep the trailing window instead of the leading one.
      after[:seq_length] = data[len(data) - seq_length:]

    return [before, after]

  def format_support_func(self, padded_num, length, data, label):
    """Support function for format.(Helps format train, valid and test.)

    Args:
      padded_num: Number of padded variants stored per sample; `pad` returns
        two variants, so this must not exceed 2.
      length: Number of samples before padding; expected to equal
        `len(data)`.
      data: List of raw samples.
      label: List of label names, aligned with `data`.

    Returns:
      A tuple `(length, dataset)` where `length` is the padded sample count
      (`length * padded_num`) and `dataset` is a `tf.data.Dataset` of
      `(features, int32 label id)` slices.

    Raises:
      KeyError: If a label name is not present in `self.label2id`.
    """
    # Every sample contributes `padded_num` rows.
    length *= padded_num
    features = np.zeros((length, self.seq_length, self.dim))
    labels = np.zeros(length)
    # Get padding for train, valid and test
    for idx, (sample, sample_label) in enumerate(zip(data, label)):
      padded_data = self.pad(sample, self.seq_length, self.dim)
      label_id = self.label2id[sample_label]
      for num in range(padded_num):
        row = padded_num * idx + num
        features[row] = padded_data[num]
        labels[row] = label_id
    # Turn into tf.data.Dataset
    dataset = tf.data.Dataset.from_tensor_slices(
        (features, labels.astype("int32")))
    return length, dataset

  def format(self):
    """Format data(including padding, etc.) and get the dataset for the model.

    Replaces the raw `*_data` lists with `tf.data.Dataset` objects and updates
    the `*_len` counts to the padded sizes. Intended to be called once after
    construction, since it overwrites the raw lists it reads.
    """
    padded_num = 2
    self.train_len, self.train_data = self.format_support_func(
        padded_num, self.train_len, self.train_data, self.train_label)
    self.valid_len, self.valid_data = self.format_support_func(
        padded_num, self.valid_len, self.valid_data, self.valid_label)
    self.test_len, self.test_data = self.format_support_func(
        padded_num, self.test_len, self.test_data, self.test_label)