-
Notifications
You must be signed in to change notification settings - Fork 15
/
combine.py
executable file
·140 lines (105 loc) · 3.98 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
"""
Combine different datasets from two hdf5 files
"""
import hdf5
import numpy as np
from parser import get_args_combine as parser
import msg
import check
from extract import update_data
def build_data_dict(data1, data2, match):
    """Build a dictionary of zero-filled arrays covering the union of the
    two data dictionaries, truncated along axis 0 to the shorter 'match'
    dataset.

    Keyword arguments:
    data1 -- dictionary of array-like datasets
    data2 -- dictionary of array-like datasets
    match -- common key used to align the two dictionaries

    Returns (data, keys1, keys2) where keys1/keys2 are the non-match keys
    of data1/data2 respectively.
    """
    data = {}
    keys1 = [key for key in data1.keys() if key != match]
    keys2 = [key for key in data2.keys() if key != match]
    n1 = np.shape(data1[match])[0]
    n2 = np.shape(data2[match])[0]
    nfinal = min(n1, n2)
    # keep the dtype (and any trailing dimensions) of the shorter match array
    if nfinal == n1:
        data[match] = np.zeros_like(data1[match])
    else:
        data[match] = np.zeros_like(data2[match])
    # Allocate the truncated zero arrays directly instead of the original
    # zeros_like + np.resize round trip: same zeros, same dtype, but without
    # first allocating a full-length array that is immediately thrown away.
    for k in keys1:
        shp = (nfinal,) + tuple(np.shape(data1[k])[1:])
        data[k] = np.zeros(shp, dtype=np.asarray(data1[k]).dtype)
    for k in keys2:
        shp = (nfinal,) + tuple(np.shape(data2[k])[1:])
        data[k] = np.zeros(shp, dtype=np.asarray(data2[k]).dtype)
    return data, keys1, keys2
def merge_data(data1, data2, match,
               print_warnings=True, show_progress=False, sorted=True):
    """Merge data1 and data2 with respect to the 'match' key.

    Keyword arguments:
    data1 -- dictionary with data
    data2 -- dictionary with data
    match -- common key used to align entries between the two dictionaries
    print_warnings -- warn about entries of data1 with no partner in data2
    show_progress -- print a progress line every 100 merged events
    sorted -- unused; kept for backward compatibility (NOTE(review): shadows
              the builtin 'sorted'). If the match column is known to be
              sorted, np.searchsorted could replace the np.where lookup:
              index2 = np.array([np.searchsorted(data2[match], i)])
    """
    data, keys1, keys2 = build_data_dict(data1, data2, match)
    # don't use enumerate here because we only want to increment the counter
    # when we have a match
    ct = 0
    for i in data1[match]:
        index2, = np.where(data2[match] == i)
        if not index2.size:
            if print_warnings:
                msg.warning("%(key)s = %(val)d found in the first file, "
                            "but not in the second one."
                            % {"key": match, "val": i})
            continue
        data[match][ct] = i
        for key in keys1:
            # ct also indexes data1 directly (the original wrapped it in a
            # one-element array to the same effect)
            data[key][ct] = data1[key][ct]
        for key in keys2:
            data[key][ct] = data2[key][index2]
        if show_progress and ct % 100 == 0:
            print("finished event {}".format(ct))
        ct += 1
    # TODO - pass in a value here; generally speaking, it is not right to
    # never allow the match index value to be zero - it might be so
    # legitimately; but for now...
    badidx = np.where(data[match] == 0)
    # BUG FIX: the original tested len(badidx[0] > 1) -- a misplaced
    # parenthesis that took len() of a boolean comparison array, i.e. the
    # full array length, instead of comparing the count. Test the count.
    if badidx[0].size:
        data[match] = np.delete(data[match], badidx, axis=0)
        for key in keys1:
            data[key] = np.delete(data[key], badidx, axis=0)
        for key in keys2:
            data[key] = np.delete(data[key], badidx, axis=0)
    return data
def get_data(filename, match, keys):
    """Load file, check that it contains the match dataset, and optionally
    restrict it to the user-chosen datasets. Return the data dictionary.

    Keyword arguments:
    filename -- input hdf5 file
    match -- common key used to order data
    keys -- comma-separated datasets to keep (falsy = keep everything)
    """
    data = hdf5.load(filename)
    print("\nThe following datasets were found in %s:\n" % filename)
    msg.list_dataset(data)
    check.key_exists(match, data, filename)
    if keys:
        msg.info("Using only: " + keys)
        # BUG FIX: the original passed 'args.match', reaching into the
        # __main__ global and breaking any other caller; use the parameter.
        update_data(data, [k.strip() for k in keys.split(',')], match)
    return data
if __name__ == '__main__':
    # Entry point: load both input files, merge them on the shared match
    # key, and write the combined datasets to the output file.
    msg.box("HDF5 MANIPULATOR: COMBINE")

    # NOTE: 'args' stays module-level; get_data reads it as a global.
    args = parser()

    first = get_data(args.input1, args.match, args.keys1)
    second = get_data(args.input2, args.match, args.keys2)

    check.different_keys(first, second, args.match)

    combined = merge_data(first, second, args.match,
                          args.print_warnings, args.show_progress)

    print("\nThe following datasets will be saved in %s:\n" % args.output)
    msg.list_dataset(combined)

    hdf5.save(args.output, combined)
    msg.info("Done")