forked from TomaszGolan/hdf5_manipulator
-
Notifications
You must be signed in to change notification settings - Fork 1
/
combine.py
executable file
·124 lines (91 loc) · 3.42 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
"""
Combine different datasets from two hdf5 files
"""
import hdf5
import numpy as np
from parser import get_args_combine as parser
import msg
import check
from extract import update_data
def build_data_dict(data1, data2, match):
    """Build a dictionary of zero arrays covering the union of the keys of
    the two data dictionaries, truncated to the length of the shorter one.

    Keyword arguments:
    data1 -- dictionary of numpy arrays from the first file
    data2 -- dictionary of numpy arrays from the second file
    match -- common key used to align the two dictionaries

    Return (data, keys1, keys2): data maps every key to a zero array whose
    first dimension is min(len(data1[match]), len(data2[match]));
    keys1/keys2 are the non-match keys of data1/data2.
    """
    keys1 = [key for key in data1.keys() if key != match]
    keys2 = [key for key in data2.keys() if key != match]
    # Number of entries in the merged output: the shorter of the two files.
    nfinal = min(np.shape(data1[match])[0],
                 np.shape(data2[match])[0])

    data = {}
    # Keep the dtype/trailing shape of whichever file is shorter
    # (preserves the original tie-break: file 1 wins on equal length).
    if nfinal == np.shape(data1[match])[0]:
        data[match] = np.zeros_like(data1[match])
    else:
        data[match] = np.zeros_like(data2[match])

    # Allocate each dataset once at its final size; the original did
    # np.zeros_like followed by np.resize, allocating every array twice.
    for k in keys1:
        shp = (nfinal,) + tuple(np.shape(data1[k])[1:])
        data[k] = np.zeros(shp, dtype=np.asarray(data1[k]).dtype)
    for k in keys2:
        shp = (nfinal,) + tuple(np.shape(data2[k])[1:])
        data[k] = np.zeros(shp, dtype=np.asarray(data2[k]).dtype)
    return data, keys1, keys2


def merge_data(data1, data2, match,
               print_warnings=True, show_progress=False, sorted=True):
    """Merge data1 and data2 with respect to the match key.

    Rows of the output are aligned with the rows of data1; for each row the
    second file is searched for the same match value.  Rows of data1 whose
    match value is absent from data2 are left as zeros (a warning is
    optionally printed).  The output holds only
    min(len(data1[match]), len(data2[match])) rows.

    Keyword arguments:
    data1 -- dictionary with data (numpy arrays) from the first file
    data2 -- dictionary with data from the second file
    match -- common key used to align entries between the files
    print_warnings -- warn about entries of file 1 missing from file 2
    show_progress -- print a progress line every 100 entries
    sorted -- unused; kept for backward compatibility (NOTE: shadows the
              builtin).  If the match column were known sorted,
              np.searchsorted could replace np.where for the lookup.
    """
    data, keys1, keys2 = build_data_dict(data1, data2, match)
    nfinal = np.shape(data[match])[0]
    for ct, i in enumerate(data1[match]):
        if ct >= nfinal:
            # No room left in the truncated output.  The original code
            # raised IndexError here whenever data1 was the longer file
            # and a match was found past row nfinal-1.
            break
        # Locate the matching entry (or entries) in the second file.
        index2, = np.where(data2[match] == i)
        if not index2.size:
            if print_warnings:
                msg.warning("%(key)s = %(val)d found in the first file, "
                            "but not in the second one."
                            % {"key": match, "val": i})
            continue
        data[match][ct] = i
        for key in keys1:
            # Direct row indexing; the original built a one-element index
            # array per row per key, allocating needlessly.
            data[key][ct] = data1[key][ct]
        for key in keys2:
            data[key][ct] = data2[key][index2]
        if show_progress and ct % 100 == 0:
            print("finished event {}".format(ct))
    return data
def get_data(filename, match, keys):
    """Load an hdf5 file, check it contains the match key, and optionally
    restrict the datasets to a user-chosen subset.  Return the data dict.

    Keyword arguments:
    filename -- input hdf5 file
    match -- common key used to order data (must exist in the file)
    keys -- comma-separated dataset names to keep; falsy keeps everything
    """
    data = hdf5.load(filename)
    # Parenthesized print works under both Python 2 and 3
    # (the original used a py2-only print statement).
    print("\nThe following datasets were found in %s:\n" % filename)
    msg.list_dataset(data)
    check.key_exists(match, data, filename)
    if keys:
        msg.info("Using only: " + keys)
        # Use the 'match' parameter, not the global 'args': the original
        # read args.match, which breaks if get_data is called before the
        # command line has been parsed.
        update_data(data, [k.strip() for k in keys.split(',')], match)
    return data
if __name__ == '__main__':
    msg.box("HDF5 MANIPULATOR: COMBINE")
    args = parser()

    # Load both inputs, each pruned to the user-requested datasets.
    data1 = get_data(args.input1, args.match, args.keys1)
    data2 = get_data(args.input2, args.match, args.keys2)

    # The non-match keys must not collide between the two files.
    check.different_keys(data1, data2, args.match)

    data = merge_data(data1, data2, args.match,
                      args.print_warnings, args.show_progress)

    # Parenthesized print works under both Python 2 and 3
    # (the original used a py2-only print statement).
    print("\nThe following datasets will be saved in %s:\n" % args.output)
    msg.list_dataset(data)
    hdf5.save(args.output, data)
    msg.info("Done")