Skip to content

Commit

Permalink
Merge pull request #128 from fact-project/zstd_comp
Browse files Browse the repository at this point in the history
Enable zstandard compression for h5py by default
  • Loading branch information
maxnoe authored Jul 22, 2019
2 parents c1f3ab9 + 35603db commit d1f3f11
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 8 deletions.
12 changes: 7 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ matrix:
- python: "3.6"
- python: "3.7"
dist: xenial
addons:
apt:
packages:
- libhdf5-serial-dev

addons:
apt:
packages:
- libhdf5-serial-dev


before_install:
Expand All @@ -17,7 +18,8 @@ before_install:

install:
- pip install restructuredtext-lint sphinx~=1.8 pygments
- pip install .
# make sure tables and h5py get linked vs the same hdf5 library
- pip install --no-binary=h5py --no-binary=tables .

script:
- python setup.py test
Expand Down
2 changes: 1 addition & 1 deletion fact/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.24.0
0.25.0
54 changes: 52 additions & 2 deletions fact/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from os import path
import pandas as pd
import json
import tables
import h5py
import pandas as pd
import sys
import logging
import numpy as np
from copy import copy
import astropy.units as u
import tempfile
import warnings
from time import perf_counter


__all__ = [
Expand All @@ -17,6 +21,7 @@
'read_h5py_chunked',
'check_extension',
'to_h5py',
'create_blosc_compression_options',
]

log = logging.getLogger(__name__)
Expand All @@ -26,6 +31,42 @@
# numpy byteorder character of the running machine: '<' little, '>' big
# (fixed a duplicated `native_byteorder = native_byteorder =` typo)
native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]


def create_blosc_compression_options(complevel=5, complib='blosc:zstd', shuffle=True):
    '''Build the keyword arguments for ``h5py.create_dataset`` that enable
    the blosc hdf5 filter (filter id 32001), defaulting to zstandard with
    moderate compression.

    Parameters
    ----------
    complevel: int
        blosc compression level.
    complib: str
        blosc codec name, prefixed with ``'blosc:'``; raises ``ValueError``
        if the codec is not available in the installed pytables.
    shuffle: bool or 'bit'
        byte shuffle when truthy, bit shuffle when the string ``'bit'``.

    See https://github.com/h5py/h5py/issues/611#issuecomment-353694301
    '''
    # encode the shuffle mode for the blosc filter: 0 = off, 1 = byte, 2 = bit
    if shuffle == 'bit':
        shuffle_code = 2
    elif shuffle:
        shuffle_code = 1
    else:
        shuffle_code = 0

    # map the requested codec name onto its index in pytables' codec list
    available = tables.filters.blosc_compressor_list()
    codec_index = ['blosc:' + name for name in available].index(complib)

    options = {
        'compression': 32001,
        'compression_opts': (0, 0, 0, 0, complevel, shuffle_code, codec_index),
    }
    # blosc shuffles internally, so disable h5py's own shuffle filter
    if shuffle_code:
        options['shuffle'] = False
    return options


# Probe at import time whether blosc/zstd compression is usable, so that
# writers can transparently fall back to uncompressed datasets otherwise.
DEFAULT_COMPRESSION = {}
with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
    try:
        # Building the options must happen inside the try: it raises
        # ValueError itself when pytables lacks the zstd codec, which
        # previously crashed the module import instead of warning.
        zstd_opts = create_blosc_compression_options()

        with h5py.File(f.name, 'w') as of:
            of.create_dataset('test', dtype='float64', shape=(1, ), **zstd_opts)

        DEFAULT_COMPRESSION.update(zstd_opts)
    except ValueError:
        warnings.warn(
            'BLOSC compression for hdf5 not available, you will not be able'
            ' to create or read blosc compressed datasets'
            ' make sure tables and h5py are linked against the same hdf5 library'
            ' e.g. by installing hdf5 in your system and doing '
            ' `pip install --no-binary=tables --no-binary=h5py tables h5py`'
        )


def write_data(df, file_path, key='data', use_h5py=True, **kwargs):
'''
Write a pandas DataFrame to several output formats, determined by the
Expand Down Expand Up @@ -363,7 +404,12 @@ def initialize_h5py(f, array, key='events', **kwargs):

dtypes = array.dtype
for name in dtypes.names:
create_empty_h5py_dataset(array[name], group, name, **kwargs)
create_empty_h5py_dataset(
array[name],
group,
name,
**kwargs,
)

return group

Expand Down Expand Up @@ -408,6 +454,10 @@ def create_empty_h5py_dataset(array, group, name, **kwargs):
else:
dt = dtype.base

# add default compression options if no options are given
if 'compression' not in kwargs:
kwargs.update(DEFAULT_COMPRESSION)

dataset = group.create_dataset(
name,
shape=tuple(shape),
Expand Down
31 changes: 31 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import tables.filters
import pandas as pd
import tempfile
import numpy as np
Expand Down Expand Up @@ -304,3 +305,33 @@ def test_read_data_h5py():
df_from_file = read_data(f.name, key='lecker_daten').sort_index(1)
assert set(df.columns) == set(df_from_file.columns)
assert df.equals(df_from_file)


def test_compression():
    '''Data written with compression disabled must round-trip unchanged.'''
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8'),
        'idx': np.arange(50),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test', compression=None)

        # the group and its column datasets must exist in the written file
        with h5py.File(f.name, 'r') as hdf_file:
            assert 'test' in hdf_file.keys()

            group = hdf_file['test']

            assert 'x' in group.keys()
            assert 'N' in group.keys()

        roundtripped = read_h5py(f.name, key='test')
        roundtripped.sort_index(1, inplace=True)
        df.sort_index(1, inplace=True)

        assert all(df.dtypes == roundtripped.dtypes)
        assert all(df['x'] == roundtripped['x'])
        assert all(df['N'] == roundtripped['N'])

0 comments on commit d1f3f11

Please sign in to comment.