hdf_util.py
import os.path

import numpy
from sklearn.decomposition import PCA
import h5py

# Open the source embeddings file and the target file for the reduced vectors.
file = h5py.File(name=os.path.join(os.getcwd(), "embeddings.hdf5"), mode='a')
file256 = h5py.File(name=os.path.join(os.getcwd(), "embeddings_256.hdf5"), mode='a')

# Inspect the dtype and shape of the first dataset only.
for i in file.keys():
    print(file[i].dtype)
    print(file[i].shape)
    break

# Export every embedding to CSV as "name,v1,v2,..." with five decimal places.
with open("embeddings.csv", mode="w") as f:
    f.write('\n'.join([i + ',' + ','.join(['{:.5f}'.format(num) for num in file[i][()].tolist()]) for i in file.keys()]))

# NOTE: this early exit stops the script after the CSV export; the PCA step
# below only runs if this line is removed.
exit(0)

# Collect all embeddings into a single array and reduce them to 256 dimensions.
data = []
for i in file.keys():
    data.append(file[i][()])
data = numpy.array(data)
data = PCA(n_components=256).fit_transform(data)

# Store each reduced vector under its original dataset name
# (iterating over the h5py File yields the dataset names).
for compacted, name in zip(data, file):
    file256.create_dataset(name=name, data=compacted)

for i in file256.keys():
    print(i)
    # print(file256[i].shape)
    # break

file.close()
file256.close()
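
A minimal sketch of how the reduced embeddings could be read back afterwards, assuming the exit(0) line above has been removed so that embeddings_256.hdf5 is actually populated; the file path and per-name dataset layout follow the script, everything else here is illustrative:

import os.path

import h5py
import numpy

# Open the reduced file read-only and stack every dataset into one array.
reduced = h5py.File(os.path.join(os.getcwd(), "embeddings_256.hdf5"), mode='r')
names = list(reduced.keys())
vectors = numpy.array([reduced[name][()] for name in names])
print(vectors.shape)  # expected: (number of embeddings, 256)
reduced.close()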