From 7472e95c398715702ae7f6a692894ce6669fae4f Mon Sep 17 00:00:00 2001
From: Kai Bruegge
Date: Wed, 11 May 2016 13:28:25 +0200
Subject: [PATCH 01/16] make project really installable this time. rename it
 to klaas. for stupid reasons

From 0f34b6c4914766b798bd506f0f3d3143cf90fa64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Thu, 22 Dec 2016 15:44:17 +0100
Subject: [PATCH 02/16] Add function to read hdf5 written by h5py, move io
 stuff to own submodule

---
 io.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 io.py

diff --git a/io.py b/io.py
new file mode 100644
index 0000000..304e96c
--- /dev/null
+++ b/io.py
@@ -0,0 +1,118 @@
+from os import path
+import pandas as pd
+import json
+from sklearn_pandas import DataFrameMapper
+from sklearn.externals import joblib
+from sklearn2pmml import sklearn2pmml
+import h5py
+import sys
+import logging
+
+log = logging.getLogger(__name__)
+
+
+allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.csv')
+native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]
+
+
+def write_data(df, file_path, hdf_key='table'):
+    name, extension = path.splitext(file_path)
+    if extension in ['.hdf', '.hdf5', '.h5']:
+        df.to_hdf(file_path, key=hdf_key)
+    elif extension == '.json':
+        df.to_json(file_path)
+    elif extension == '.csv':
+        df.to_csv(file_path, sep=',', index=False)
+    else:
+        raise IOError(
+            'cannot write tabular data with format {}. Allowed formats: {}'.format(
+                extension, 'hdf5, json, csv'
+            )
+        )
+
+
+def read_h5py(file_path, group_name='events', columns=None):
+    '''
+    Read a hdf5 file written with h5py into a dataframe
+
+    Parameters
+    ----------
+    file_path: str
+        file to read in
+    group_name: str
+        name of the hdf5 group to read in
+    columns: iterable[str]
+        Names of the datasets to read in. If not given read all 1d datasets
+    '''
+    df = pd.DataFrame()
+
+    with h5py.File(file_path) as f:
+        group = f.get(group_name)
+        # get all columns which don't have more than one value per row
+        if columns is None:
+            columns = [col for col in group.keys() if group[col].ndim == 1]
+
+        for col in columns:
+            if group[col].dtype.byteorder not in ('=', native_byteorder):
+                df[col] = group[col][:].byteswap().newbyteorder()
+            else:
+                df[col] = group[col]
+
+    return df
+
+
+def read_pandas_hdf5(file_path, key=None, columns=None):
+    df = pd.read_hdf(file_path, key=key, columns=columns)
+    return df
+
+
+def read_data(file_path, query=None, sample=-1, key=None, columns=None):
+    name, extension = path.splitext(file_path)
+
+    if extension in ['.hdf', '.hdf5', '.h5']:
+        try:
+            df = read_pandas_hdf5(file_path, key=key, columns=columns)
+        except (TypeError, ValueError):
+
+            df = read_h5py(file_path, columns=columns)
+
+    elif extension == '.json':
+        with open(file_path, 'r') as j:
+            d = json.load(j)
+            df = pd.DataFrame(d)
+    else:
+        raise NotImplementedError('Unknown data file extension {}'.format(extension))
+
+    if sample > 0:
+        print('Taking {} random samples'.format(sample))
+        df = df.sample(sample)
+
+    if query:
+        print('Querying with string: {}'.format(query))
+        df = df.copy().query(query)
+
+    return df
+
+
+def check_extension(file_path, allowed_extensions=allowed_extensions):
+    p, extension = path.splitext(file_path)
+    if extension not in allowed_extensions:
+        raise IOError('Allowed formats: {}'.format(allowed_extensions))
+
+
+def pickle_model(classifier, feature_names, model_path, label_text='label'):
+    p, extension = path.splitext(model_path)
+    classifier.feature_names = feature_names
+    if (extension == '.pmml'):
+        print("Pickling model to {} ...".format(model_path))
+
+        mapper = DataFrameMapper([
+            (feature_names, None),
+            (label_text, None),
+        ])
+
+        joblib.dump(classifier, p + '.pkl', compress=4)
+        sklearn2pmml(classifier, mapper, model_path)
+
+    else:
+        joblib.dump(classifier, model_path, compress=4)
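At this point the module provides a symmetric write_data/read_data pair that dispatches on the file extension. A minimal usage sketch against this state of io.py; the file name and values are hypothetical, and note that a top-level io.py shadows the stdlib io module when imported from the project directory (the module is moved into a package in a later patch):

    import pandas as pd
    from io import read_data, write_data  # the project's top-level io.py here, not stdlib io

    df = pd.DataFrame({'energy': [1.5, 2.0, 3.5], 'size': [100, 250, 120]})

    # the extension selects the format: hdf5, json or csv
    write_data(df, 'events.hdf5', hdf_key='table')

    # read_data first tries pandas' read_hdf, then falls back to read_h5py
    df_read = read_data('events.hdf5', key='table')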
From 69bf64132028b24fd25273436cb037774e73792a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Mon, 6 Feb 2017 16:26:25 +0100
Subject: [PATCH 03/16] Implement chunked reading for read_hdf

---
 io.py | 64 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/io.py b/io.py
index 304e96c..68def32 100644
--- a/io.py
+++ b/io.py
@@ -7,6 +7,7 @@
 import h5py
 import sys
 import logging
+import numpy as np
 
 log = logging.getLogger(__name__)
 
@@ -16,13 +17,18 @@ def write_data(df, file_path, hdf_key='table'):
+
     name, extension = path.splitext(file_path)
+
     if extension in ['.hdf', '.hdf5', '.h5']:
         df.to_hdf(file_path, key=hdf_key)
+
     elif extension == '.json':
         df.to_json(file_path)
+
     elif extension == '.csv':
         df.to_csv(file_path, sep=',', index=False)
+
     else:
         raise IOError(
             'cannot write tabular data with format {}. Allowed formats: {}'.format(
@@ -31,7 +37,16 @@ def write_data(df, file_path, hdf_key='table'):
         )
 
 
-def read_h5py(file_path, group_name='events', columns=None):
+def to_native_byteorder(array):
+    ''' Convert numpy array to native byteorder '''
+
+    if array.dtype.byteorder not in ('=', native_byteorder):
+        return array.byteswap().newbyteorder()
+
+    return array
+
+
+def read_h5py(file_path, key='events', columns=None, chunksize=None):
     '''
     Read a hdf5 file written with h5py into a dataframe
 
@@ -39,42 +54,61 @@ def read_h5py(file_path, key='events', columns=None, chunksize=None):
     ----------
     file_path: str
         file to read in
-    group_name: str
+    key: str
         name of the hdf5 group to read in
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
-    df = pd.DataFrame()
 
     with h5py.File(file_path) as f:
-        group = f.get(group_name)
+        group = f.get(key)
+        if group is None:
+            raise IOError('File does not contain group "{}"'.format(key))
         # get all columns which don't have more than one value per row
         if columns is None:
            columns = [col for col in group.keys() if group[col].ndim == 1]
 
-        for col in columns:
-            if group[col].dtype.byteorder not in ('=', native_byteorder):
-                df[col] = group[col][:].byteswap().newbyteorder()
-            else:
-                df[col] = group[col]
+        # read all columns and rows in one dataframe if no chunksize given
+        if chunksize is None:
+            df = pd.DataFrame()
+            for col in columns:
+                df[col] = to_native_byteorder(group[col][:])
 
-    return df
+            return df
+
+        # read data in chunks if chunksize is given
+        n_events = group[next(iter(group.keys()))].shape[0]
+        chunks = int(np.ceil(n_events / chunksize))
+
+        for chunk in range(chunks):
+
+            start = chunk * chunksize
+            end = min(n_events, (chunk + 1) * chunksize)
 
+            df = pd.DataFrame(index=np.arange(start, end))
 
-def read_pandas_hdf5(file_path, key=None, columns=None):
-    df = pd.read_hdf(file_path, key=key, columns=columns)
+            for col in columns:
+                df[col] = to_native_byteorder(group[col][start:end])
+
+            yield df
+
+
+def read_pandas_hdf5(file_path, key=None, columns=None, chunksize=None):
+    df = pd.read_hdf(file_path, key=key, columns=columns, chunksize=chunksize)
     return df
 
 
-def read_data(file_path, query=None, sample=-1, key=None, columns=None):
+def read_data(file_path, query=None, sample=-1, key=None, columns=None, chunksize=None):
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
         try:
-            df = read_pandas_hdf5(file_path, key=key, columns=columns)
+            df = read_pandas_hdf5(
+                file_path, key=key, columns=columns, chunksize=chunksize
+            )
         except (TypeError, ValueError):
 
-            df = read_h5py(file_path, columns=columns)
+            df = read_h5py(file_path, key=key, columns=columns, chunksize=chunksize)
 
     elif extension == '.json':
         with open(file_path, 'r') as j:
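to_native_byteorder exists because hdf5 files may store data in big-endian order while pandas expects the machine's native order. A standalone numpy sketch of the same conversion, independent of this module:

    import numpy as np

    # a big-endian float32 array, as it could come out of an hdf5 dataset
    big = np.array([1.0, 2.0, 3.0], dtype='>f4')
    print(big.dtype.byteorder)  # '>' on little-endian machines

    # byteswap the data, then flip the dtype's byteorder flag;
    # the values are unchanged, only the in-memory representation is
    native = big.byteswap().newbyteorder()
    print(native.dtype.byteorder)  # '<' on little-endian machines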
From 32b088238e5ebfa74b590ebf4606dfaa4d91056e Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Tue, 7 Feb 2017 15:24:39 +0100
Subject: [PATCH 04/16] Fix read_h5py generator

---
 io.py | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/io.py b/io.py
index 68def32..1ccb827 100644
--- a/io.py
+++ b/io.py
@@ -60,23 +60,39 @@ def read_h5py(file_path, key='events', columns=None, chunksize=None):
         Names of the datasets to read in. If not given read all 1d datasets
     '''
+    # read all columns and rows in one dataframe if no chunksize given
+    if chunksize is None:
+        with h5py.File(file_path) as f:
+            group = f.get(key)
+            if group is None:
+                raise IOError('File does not contain group "{}"'.format(key))
+
+            # get all columns which don't have more than one value per row
+            if columns is None:
+                columns = [col for col in group.keys() if group[col].ndim == 1]
+
+            df = pd.DataFrame()
+            for col in columns:
+                df[col] = to_native_byteorder(group[col][:])
+
+            return df
+
+    # read data in chunks if chunksize is given
+    return read_h5py_chunked(
+        file_path, key=key, columns=columns, chunksize=chunksize
+    )
+
+
+def read_h5py_chunked(file_path, key='events', columns=None, chunksize=10000):
     with h5py.File(file_path) as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
 
+        # get all columns which don't have more than one value per row
         if columns is None:
             columns = [col for col in group.keys() if group[col].ndim == 1]
 
-        # read all columns and rows in one dataframe if no chunksize given
-        if chunksize is None:
-            df = pd.DataFrame()
-            for col in columns:
-                df[col] = to_native_byteorder(group[col][:])
-
-            return df
-
-        # read data in chunks if chunksize is given
         n_events = group[next(iter(group.keys()))].shape[0]
         chunks = int(np.ceil(n_events / chunksize))
@@ -104,11 +120,19 @@ def read_data(file_path, query=None, sample=-1, key=None, columns=None, chunksiz
     if extension in ['.hdf', '.hdf5', '.h5']:
         try:
             df = read_pandas_hdf5(
-                file_path, key=key, columns=columns, chunksize=chunksize
+                file_path,
+                key=key or 'table',
+                columns=columns,
+                chunksize=chunksize,
             )
         except (TypeError, ValueError):
 
-            df = read_h5py(file_path, key=key, columns=columns, chunksize=chunksize)
+            df = read_h5py(
+                file_path,
+                key=key or 'events',
+                columns=columns,
+                chunksize=chunksize,
+            )
 
     elif extension == '.json':
         with open(file_path, 'r') as j:

From 109ceacf06585d7c91f024ec508950c42ef9dbad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Tue, 7 Feb 2017 16:29:29 +0100
Subject: [PATCH 05/16] Allow setting number of signal and background events
 for separator

---
 io.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/io.py b/io.py
index 1ccb827..886e1b7 100644
--- a/io.py
+++ b/io.py
@@ -114,7 +114,7 @@ def read_pandas_hdf5(file_path, key=None, columns=None, chunksize=None):
     return df
 
 
-def read_data(file_path, query=None, sample=-1, key=None, columns=None, chunksize=None):
+def read_data(file_path, key=None, columns=None, chunksize=None):
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
@@ -141,14 +141,6 @@ def read_data(file_path, key=None, columns=None, chunksize=None):
     else:
         raise NotImplementedError('Unknown data file extension {}'.format(extension))
 
-    if sample > 0:
-        print('Taking {} random samples'.format(sample))
-        df = df.sample(sample)
-
-    if query:
-        print('Querying with string: {}'.format(query))
-        df = df.copy().query(query)
-
     return df
From 7c598c528d86fe65e3542b839596c86eaf5ba496 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Wed, 8 Feb 2017 09:48:44 +0100
Subject: [PATCH 06/16] Implement chunking in apply_separation_model

---
 io.py | 54 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/io.py b/io.py
index 886e1b7..7d68672 100644
--- a/io.py
+++ b/io.py
@@ -46,7 +46,7 @@ def to_native_byteorder(array):
     return array
 
 
-def read_h5py(file_path, key='events', columns=None, chunksize=None):
+def read_h5py(file_path, key='events', columns=None):
     '''
     Read a hdf5 file written with h5py into a dataframe
 
@@ -59,31 +59,29 @@ def read_h5py(file_path, key='events', columns=None, chunksize=None):
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
+    with h5py.File(file_path) as f:
+        group = f.get(key)
+        if group is None:
+            raise IOError('File does not contain group "{}"'.format(key))
 
-    # read all columns and rows in one dataframe if no chunksize given
-    if chunksize is None:
-        with h5py.File(file_path) as f:
-            group = f.get(key)
-            if group is None:
-                raise IOError('File does not contain group "{}"'.format(key))
-
-            # get all columns which don't have more than one value per row
-            if columns is None:
-                columns = [col for col in group.keys() if group[col].ndim == 1]
+        # get all columns which don't have more than one value per row
+        if columns is None:
+            columns = [col for col in group.keys() if group[col].ndim == 1]
 
-            df = pd.DataFrame()
-            for col in columns:
-                df[col] = to_native_byteorder(group[col][:])
+        df = pd.DataFrame()
+        for col in columns:
+            df[col] = to_native_byteorder(group[col][:])
 
-            return df
+        return df
 
-    # read data in chunks if chunksize is given
-    return read_h5py_chunked(
-        file_path, key=key, columns=columns, chunksize=chunksize
-    )
 
+def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
+    '''
+    Generator function to read from h5py hdf5 in chunks,
+    returns an iterator over pandas dataframes.
+
+    When chunksize is None, use 1 chunk
+    '''
     with h5py.File(file_path) as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
@@ -94,9 +92,15 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
         if columns is None:
             columns = [col for col in group.keys() if group[col].ndim == 1]
 
         n_events = group[next(iter(group.keys()))].shape[0]
-        chunks = int(np.ceil(n_events / chunksize))
 
-        for chunk in range(chunks):
+        if chunksize is None:
+            n_chunks = 1
+            chunksize = n_events
+        else:
+            n_chunks = int(np.ceil(n_events / chunksize))
+            log.info('Splitting data into {} chunks'.format(n_chunks))
+
+        for chunk in range(n_chunks):
 
             start = chunk * chunksize
             end = min(n_events, (chunk + 1) * chunksize)
@@ -106,7 +110,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
             for col in columns:
                 df[col] = to_native_byteorder(group[col][start:end])
 
-            yield df
+            yield df, start, end
 
 
 def read_pandas_hdf5(file_path, key=None, columns=None, chunksize=None):
     df = pd.read_hdf(file_path, key=key, columns=columns, chunksize=chunksize)
     return df
 
 
-def read_data(file_path, key=None, columns=None, chunksize=None):
+def read_data(file_path, key=None, columns=None):
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
         try:
             df = read_pandas_hdf5(
                 file_path,
                 key=key or 'table',
                 columns=columns,
-                chunksize=chunksize,
             )
         except (TypeError, ValueError):
 
             df = read_h5py(
                 file_path,
                 key=key or 'events',
                 columns=columns,
-                chunksize=chunksize,
             )
 
     elif extension == '.json':
         with open(file_path, 'r') as j:
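With this patch read_h5py_chunked is a plain generator yielding (dataframe, start, end) triples, so a caller knows which rows of the file each chunk covers, e.g. for writing predictions back to the right slice. A consumer sketch; file, group and column names are hypothetical:

    from io import read_h5py_chunked  # still the top-level io.py at this point

    for df, start, end in read_h5py_chunked('gammas.hdf5', key='events', chunksize=100000):
        # each df holds the rows [start, end) of the 'events' group
        print('rows {} to {}: mean size = {:.1f}'.format(start, end, df['size'].mean()))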
From 7d1b50ed04ba48719f2afbb18ccad4dfe3ff40a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Wed, 8 Feb 2017 10:51:06 +0100
Subject: [PATCH 07/16] Add __all__ to io.py

---
 io.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/io.py b/io.py
index 7d68672..1400608 100644
--- a/io.py
+++ b/io.py
@@ -9,6 +9,11 @@ import logging
 import numpy as np
 
+__all__ = [
+    'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked',
+    'read_pandas_hdf5', 'pickle_model', 'check_extension', 'read_data'
+]
+
 log = logging.getLogger(__name__)

From 76810293da0c39d73ef1d22ac121f23c35d58062 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Thu, 9 Feb 2017 18:36:17 +0100
Subject: [PATCH 08/16] Add first version of apply_cuts

---
 io.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/io.py b/io.py
index 1400608..6f57560 100644
--- a/io.py
+++ b/io.py
@@ -80,6 +80,17 @@ def read_h5py(file_path, key='events', columns=None):
     return df
 
 
+def h5py_get_n_events(file_path, key='events'):
+
+    with h5py.File(file_path) as f:
+        group = f.get(key)
+
+        if group is None:
+            raise IOError('File does not contain group "{}"'.format(key))
+
+        return group[next(iter(group.keys()))].shape[0]
+
+
 def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
     '''
     Generator function to read from h5py hdf5 in chunks,
@@ -96,7 +107,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
         if columns is None:
             columns = [col for col in group.keys() if group[col].ndim == 1]
 
-        n_events = group[next(iter(group.keys()))].shape[0]
+        n_events = h5py_get_n_events(file_path, key=key)
 
         if chunksize is None:
             n_chunks = 1
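h5py_get_n_events reads the length of the first dataset in the group, and read_h5py_chunked now uses it to size its chunks. The same chunk arithmetic done by hand; the file name is hypothetical:

    import numpy as np
    from io import h5py_get_n_events

    n_events = h5py_get_n_events('gammas.hdf5', key='events')
    chunksize = 100000

    # identical to the n_chunks computed inside read_h5py_chunked
    n_chunks = int(np.ceil(n_events / chunksize))
    print(n_events, n_chunks)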
From df4e92f2683f369337ba3f7afe7376ec7d4abbb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Feb 2017 16:33:47 +0100
Subject: [PATCH 09/16] Open h5py files in r mode, which throws an error if
 the file does not exist

---
 io.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/io.py b/io.py
index 6f57560..8f13aa1 100644
--- a/io.py
+++ b/io.py
@@ -64,7 +64,7 @@ def read_h5py(file_path, key='events', columns=None):
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
-    with h5py.File(file_path) as f:
+    with h5py.File(file_path, 'r') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
@@ -82,7 +82,7 @@ def h5py_get_n_events(file_path, key='events'):
 
-    with h5py.File(file_path) as f:
+    with h5py.File(file_path, 'r') as f:
         group = f.get(key)
 
         if group is None:
@@ -98,7 +98,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
     When chunksize is None, use 1 chunk
     '''
-    with h5py.File(file_path) as f:
+    with h5py.File(file_path, 'r') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))

From 40b86f5c5e05feb2a4f5dbe4a980043b98fc57a9 Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Wed, 1 Mar 2017 12:52:41 +0100
Subject: [PATCH 10/16] Fix h5py file modes

---
 io.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/io.py b/io.py
index 8f13aa1..643f00a 100644
--- a/io.py
+++ b/io.py
@@ -64,7 +64,7 @@ def read_h5py(file_path, key='events', columns=None):
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
-    with h5py.File(file_path, 'r') as f:
+    with h5py.File(file_path, 'r+') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
@@ -82,7 +82,7 @@ def h5py_get_n_events(file_path, key='events'):
 
-    with h5py.File(file_path, 'r') as f:
+    with h5py.File(file_path, 'r+') as f:
         group = f.get(key)
 
         if group is None:
@@ -98,7 +98,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
     When chunksize is None, use 1 chunk
     '''
-    with h5py.File(file_path, 'r') as f:
+    with h5py.File(file_path, 'r+') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))

From ab81dd753e9fc25f6032d767c4278ee54eb27a65 Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Thu, 23 Mar 2017 17:08:28 +0100
Subject: [PATCH 11/16] Add support for 2d columns

---
 io.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/io.py b/io.py
index 643f00a..8637631 100644
--- a/io.py
+++ b/io.py
@@ -8,6 +8,7 @@ import sys
 import logging
 import numpy as np
+from copy import copy
 
 __all__ = [
     'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked',
@@ -75,7 +76,14 @@ def read_h5py(file_path, key='events', columns=None):
 
         df = pd.DataFrame()
         for col in columns:
-            df[col] = to_native_byteorder(group[col][:])
+            array = to_native_byteorder(group[col][:])
+            if array.ndim == 1:
+                df[col] == array
+            elif array.ndim == 2:
+                for i in range(array.shape[1]):
+                    df[col + '_{}'.format(i)] = array[:, i]
+            else:
+                log.warning('Skipping column {}, not 1d or 2d'.format(col))
 
         return df
@@ -116,6 +124,11 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
             n_chunks = int(np.ceil(n_events / chunksize))
             log.info('Splitting data into {} chunks'.format(n_chunks))
 
+        for col in copy(columns):
+            if group[col].ndim > 2:
+                columns.remove(col)
+                log.warning('Ignoring column {}, not 1d or 2d'.format(col))
+
         for chunk in range(n_chunks):
 
             start = chunk * chunksize
             end = min(n_events, (chunk + 1) * chunksize)
@@ -124,7 +137,14 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
             df = pd.DataFrame(index=np.arange(start, end))
 
             for col in columns:
-                df[col] = to_native_byteorder(group[col][start:end])
+                array = to_native_byteorder(group[col][start:end])
+
+                if array.ndim == 1:
+                    df[col] == array
+
+                else:
+                    for i in range(array.shape[1]):
+                        df[col + '_{}'.format(i)] = array[:, i]
 
             yield df, start, end

From 14feb34f885e72614e1ddaeeb85870ab40972d45 Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Thu, 23 Mar 2017 17:13:01 +0100
Subject: [PATCH 12/16] Fix stupid == typo

---
 io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io.py b/io.py
index 8637631..33fa53b 100644
--- a/io.py
+++ b/io.py
@@ -78,7 +78,7 @@ def read_h5py(file_path, key='events', columns=None):
         for col in columns:
             array = to_native_byteorder(group[col][:])
             if array.ndim == 1:
-                df[col] == array
+                df[col] = array
             elif array.ndim == 2:
                 for i in range(array.shape[1]):
                     df[col + '_{}'.format(i)] = array[:, i]
@@ -140,7 +140,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
                 array = to_native_byteorder(group[col][start:end])
 
                 if array.ndim == 1:
-                    df[col] == array
+                    df[col] = array
 
                 else:
                     for i in range(array.shape[1]):
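After patches 11 and 12, a 2d dataset passed via columns is flattened into numbered dataframe columns; note that the default column discovery still only picks up 1d datasets, so 2d columns must be requested explicitly. A round-trip sketch with made-up file, group and dataset names:

    import h5py
    import numpy as np
    from io import read_h5py

    # a 2d dataset 'cog' with shape (n_events, 2) next to a 1d dataset 'size'
    with h5py.File('example.hdf5', 'w') as f:
        f.create_dataset('events/cog', data=np.array([[1.0, 2.0], [3.0, 4.0]]))
        f.create_dataset('events/size', data=np.array([100.0, 250.0]))

    # the 2d column comes back flattened as 'cog_0' and 'cog_1'
    df = read_h5py('example.hdf5', key='events', columns=['cog', 'size'])
    print(df.columns.tolist())  # ['cog_0', 'cog_1', 'size']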
From f37bd88add482f211e8358d97443d58c3c6e4f31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Mar 2017 13:43:09 +0100
Subject: [PATCH 13/16] Move io.py into fact package

---
 io.py => fact/io.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename io.py => fact/io.py (100%)

diff --git a/io.py b/fact/io.py
similarity index 100%
rename from io.py
rename to fact/io.py

From f914ad34137435736cc1c968bd9be61a5a4c643b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Mar 2017 13:47:05 +0100
Subject: [PATCH 14/16] Adapt io.py to pyfact

---
 fact/io.py | 36 +++++++++++------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/fact/io.py b/fact/io.py
index 33fa53b..9975b9d 100644
--- a/fact/io.py
+++ b/fact/io.py
@@ -1,9 +1,6 @@
 from os import path
 import pandas as pd
 import json
-from sklearn_pandas import DataFrameMapper
-from sklearn.externals import joblib
-from sklearn2pmml import sklearn2pmml
 import h5py
 import sys
 import logging
@@ -11,8 +8,13 @@ import numpy as np
 from copy import copy
 
 __all__ = [
-    'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked',
-    'read_pandas_hdf5', 'pickle_model', 'check_extension', 'read_data'
+    'write_data',
+    'to_native_byteorder',
+    'read_data',
+    'read_h5py',
+    'read_h5py_chunked',
+    'read_pandas_hdf5',
+    'check_extension',
 ]
 
 log = logging.getLogger(__name__)
@@ -22,12 +24,12 @@ log = logging.getLogger(__name__)
 allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.csv')
 native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]
 
 
-def write_data(df, file_path, hdf_key='table'):
+def write_data(df, file_path, key='table'):
 
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
-        df.to_hdf(file_path, key=hdf_key)
+        df.to_hdf(file_path, key=key, format='table')
 
     elif extension == '.json':
         df.to_json(file_path)
@@ -176,6 +178,8 @@ def read_data(file_path, key=None, columns=None):
         with open(file_path, 'r') as j:
             d = json.load(j)
             df = pd.DataFrame(d)
+    elif extension in ('.jsonl', '.jsonlines'):
+        df = pd.read_json(file_path, lines=True)
     else:
         raise NotImplementedError('Unknown data file extension {}'.format(extension))
 
@@ -186,21 +190,3 @@ def check_extension(file_path, allowed_extensions=allowed_extensions):
     p, extension = path.splitext(file_path)
     if extension not in allowed_extensions:
         raise IOError('Allowed formats: {}'.format(allowed_extensions))
-
-
-def pickle_model(classifier, feature_names, model_path, label_text='label'):
-    p, extension = path.splitext(model_path)
-    classifier.feature_names = feature_names
-    if (extension == '.pmml'):
-        print("Pickling model to {} ...".format(model_path))
-
-        mapper = DataFrameMapper([
-            (feature_names, None),
-            (label_text, None),
-        ])
-
-        joblib.dump(classifier, p + '.pkl', compress=4)
-        sklearn2pmml(classifier, mapper, model_path)
-
-    else:
-        joblib.dump(classifier, model_path, compress=4)

From a989dd3c6b5ee282676f8ae6b43f95423f646f66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Mar 2017 13:48:00 +0100
Subject: [PATCH 15/16] Bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3b4b54a..11c7a43 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='pyfact',
-    version='0.8.8',
+    version='0.9.0',
     description='A module containing useful methods for working with fact',
     url='http://github.com/fact-project/pyfact',
     author='Maximilian Noethe, Dominik Neise',

From c52563d07793089577c321b26399dbbea8c36995 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Tue, 28 Mar 2017 15:17:32 +0200
Subject: [PATCH 16/16] Adapt to Kai's comments

---
 fact/io.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fact/io.py b/fact/io.py
index 9975b9d..38c78aa 100644
--- a/fact/io.py
+++ b/fact/io.py
@@ -20,7 +20,7 @@ __all__ = [
 log = logging.getLogger(__name__)
 
 
-allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.csv')
+allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.jsonl', '.jsonlines', '.csv')
 native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]
 
 
@@ -34,13 +34,16 @@ def write_data(df, file_path, key='table'):
     elif extension == '.json':
         df.to_json(file_path)
 
+    elif extension in ('.jsonl', '.jsonlines'):
+        df.to_json(file_path, lines=True, orient='records')
+
     elif extension == '.csv':
         df.to_csv(file_path, sep=',', index=False)
 
     else:
         raise IOError(
             'cannot write tabular data with format {}. Allowed formats: {}'.format(
-                extension, 'hdf5, json, csv'
+                extension, allowed_extensions,
             )
         )
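A closing sketch against the final state of the series: the module now lives at fact/io.py and both the read and write sides of the jsonl support are in place. File name and values are made up:

    import pandas as pd
    from fact.io import read_data, write_data

    df = pd.DataFrame({'event_num': [1, 2, 3], 'size': [110.0, 95.5, 301.2]})

    # one json record per line, matching the new '.jsonl' branches
    write_data(df, 'events.jsonl')
    df_read = read_data('events.jsonl')
    print(df_read)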