Skip to content

Commit

Permalink
Merge pull request #128 from fact-project/zstd_comp
Browse files Browse the repository at this point in the history
Enable zstandard compression for h5py by default
  • Loading branch information
maxnoe authored Jul 22, 2019
2 parents c1f3ab9 + 35603db commit d1f3f11
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 8 deletions.
12 changes: 7 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ matrix:
- python: "3.6"
- python: "3.7"
dist: xenial
addons:
apt:
packages:
- libhdf5-serial-dev

addons:
apt:
packages:
- libhdf5-serial-dev


before_install:
Expand All @@ -17,7 +18,8 @@ before_install:

install:
- pip install restructuredtext-lint sphinx~=1.8 pygments
- pip install .
# make sure tables and h5py get linked vs the same hdf5 library
- pip install --no-binary=h5py --no-binary=tables .

script:
- python setup.py test
Expand Down
2 changes: 1 addition & 1 deletion fact/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.24.0
0.25.0
54 changes: 52 additions & 2 deletions fact/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from os import path
import pandas as pd
import json
import tables
import h5py
import pandas as pd
import sys
import logging
import numpy as np
from copy import copy
import astropy.units as u
import tempfile
import warnings
from time import perf_counter


__all__ = [
Expand All @@ -17,6 +21,7 @@
'read_h5py_chunked',
'check_extension',
'to_h5py',
'create_blosc_compression_options',
]

log = logging.getLogger(__name__)
Expand All @@ -26,6 +31,42 @@
# numpy byteorder character of the running machine: '<' little, '>' big
# (fixed a duplicated `native_byteorder = native_byteorder =` typo)
native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]


def create_blosc_compression_options(complevel=5, complib='blosc:zstd', shuffle=True):
    '''Build the keyword arguments for ``h5py.create_dataset`` that enable
    the blosc hdf5 filter (filter id 32001), defaulting to zstandard with
    moderate compression.

    Parameters
    ----------
    complevel: int
        blosc compression level.
    complib: str
        blosc codec name, prefixed with ``'blosc:'``; raises ``ValueError``
        if the codec is not available in the installed pytables.
    shuffle: bool or 'bit'
        byte shuffle when truthy, bit shuffle when the string ``'bit'``.

    See https://github.com/h5py/h5py/issues/611#issuecomment-353694301
    '''
    # encode the shuffle mode for the blosc filter: 0 = off, 1 = byte, 2 = bit
    if shuffle == 'bit':
        shuffle_code = 2
    elif shuffle:
        shuffle_code = 1
    else:
        shuffle_code = 0

    # map the requested codec name onto its index in pytables' codec list
    available = tables.filters.blosc_compressor_list()
    codec_index = ['blosc:' + name for name in available].index(complib)

    options = {
        'compression': 32001,
        'compression_opts': (0, 0, 0, 0, complevel, shuffle_code, codec_index),
    }
    # blosc shuffles internally, so disable h5py's own shuffle filter
    if shuffle_code:
        options['shuffle'] = False
    return options


# Probe at import time whether blosc/zstd compression is usable, so that
# writers can transparently fall back to uncompressed datasets otherwise.
DEFAULT_COMPRESSION = {}
with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
    try:
        # Building the options must happen inside the try: it raises
        # ValueError itself when pytables lacks the zstd codec, which
        # previously crashed the module import instead of warning.
        zstd_opts = create_blosc_compression_options()

        with h5py.File(f.name, 'w') as of:
            of.create_dataset('test', dtype='float64', shape=(1, ), **zstd_opts)

        DEFAULT_COMPRESSION.update(zstd_opts)
    except ValueError:
        warnings.warn(
            'BLOSC compression for hdf5 not available, you will not be able'
            ' to create or read blosc compressed datasets'
            ' make sure tables and h5py are linked against the same hdf5 library'
            ' e.g. by installing hdf5 in your system and doing '
            ' `pip install --no-binary=tables --no-binary=h5py tables h5py`'
        )


def write_data(df, file_path, key='data', use_h5py=True, **kwargs):
'''
Write a pandas DataFrame to several output formats, determined by the
Expand Down Expand Up @@ -363,7 +404,12 @@ def initialize_h5py(f, array, key='events', **kwargs):

dtypes = array.dtype
for name in dtypes.names:
create_empty_h5py_dataset(array[name], group, name, **kwargs)
create_empty_h5py_dataset(
array[name],
group,
name,
**kwargs,
)

return group

Expand Down Expand Up @@ -408,6 +454,10 @@ def create_empty_h5py_dataset(array, group, name, **kwargs):
else:
dt = dtype.base

# add default compression options if no options are given
if 'compression' not in kwargs:
kwargs.update(DEFAULT_COMPRESSION)

dataset = group.create_dataset(
name,
shape=tuple(shape),
Expand Down
31 changes: 31 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import tables.filters
import pandas as pd
import tempfile
import numpy as np
Expand Down Expand Up @@ -304,3 +305,33 @@ def test_read_data_h5py():
df_from_file = read_data(f.name, key='lecker_daten').sort_index(1)
assert set(df.columns) == set(df_from_file.columns)
assert df.equals(df_from_file)


def test_compression():
    '''Data written with compression disabled must round-trip unchanged.'''
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8'),
        'idx': np.arange(50),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test', compression=None)

        # the group and its column datasets must exist in the written file
        with h5py.File(f.name, 'r') as hdf_file:
            assert 'test' in hdf_file.keys()

            group = hdf_file['test']

            assert 'x' in group.keys()
            assert 'N' in group.keys()

        roundtripped = read_h5py(f.name, key='test')
        roundtripped.sort_index(1, inplace=True)
        df.sort_index(1, inplace=True)

        assert all(df.dtypes == roundtripped.dtypes)
        assert all(df['x'] == roundtripped['x'])
        assert all(df['N'] == roundtripped['N'])

0 comments on commit d1f3f11

Please sign in to comment.