From 7472e95c398715702ae7f6a692894ce6669fae4f Mon Sep 17 00:00:00 2001
From: Kai Bruegge
Date: Wed, 11 May 2016 13:28:25 +0200
Subject: [PATCH 01/16] make project really installable this time. rename it
 to klaas. for stupid reasons

From 0f34b6c4914766b798bd506f0f3d3143cf90fa64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Thu, 22 Dec 2016 15:44:17 +0100
Subject: [PATCH 02/16] Add function to read hdf5 written by h5py, move io
 stuff to own submodule

---
 io.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 io.py

diff --git a/io.py b/io.py
new file mode 100644
index 0000000..304e96c
--- /dev/null
+++ b/io.py
@@ -0,0 +1,118 @@
+from os import path
+import pandas as pd
+import json
+from sklearn_pandas import DataFrameMapper
+from sklearn.externals import joblib
+from sklearn2pmml import sklearn2pmml
+import h5py
+import sys
+import logging
+
+log = logging.getLogger(__name__)
+
+
+allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.csv')
+native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]
+
+
+def write_data(df, file_path, hdf_key='table'):
+    name, extension = path.splitext(file_path)
+    if extension in ['.hdf', '.hdf5', '.h5']:
+        df.to_hdf(file_path, key=hdf_key)
+    elif extension == '.json':
+        df.to_json(file_path)
+    elif extension == '.csv':
+        df.to_csv(file_path, sep=',', index=False)
+    else:
+        raise IOError(
+            'cannot write tabular data with format {}. Allowed formats: {}'.format(
+                extension, 'hdf5, json, csv'
+            )
+        )
+
+
+def read_h5py(file_path, group_name='events', columns=None):
+    '''
+    Read a hdf5 file written with h5py into a dataframe
+
+    Parameters
+    ----------
+    file_path: str
+        file to read in
+    group_name: str
+        name of the hdf5 group to read in
+    columns: iterable[str]
+        Names of the datasets to read in. If not given read all 1d datasets
+    '''
+    df = pd.DataFrame()
+
+    with h5py.File(file_path) as f:
+        group = f.get(group_name)
+        # get all columns which don't have more than one value per row
+        if columns is None:
+            columns = [col for col in group.keys() if group[col].ndim == 1]
+
+        for col in columns:
+            if group[col].dtype.byteorder not in ('=', native_byteorder):
+                df[col] = group[col][:].byteswap().newbyteorder()
+            else:
+                df[col] = group[col]
+
+    return df
+
+
+def read_pandas_hdf5(file_path, key=None, columns=None):
+    df = pd.read_hdf(file_path, key=key, columns=columns)
+    return df
+
+
+def read_data(file_path, query=None, sample=-1, key=None, columns=None):
+    name, extension = path.splitext(file_path)
+
+    if extension in ['.hdf', '.hdf5', '.h5']:
+        try:
+            df = read_pandas_hdf5(file_path, key=key, columns=columns)
+        except (TypeError, ValueError):
+
+            df = read_h5py(file_path, columns=columns)
+
+    elif extension == '.json':
+        with open(file_path, 'r') as j:
+            d = json.load(j)
+            df = pd.DataFrame(d)
+    else:
+        raise NotImplementedError('Unknown data file extension {}'.format(extension))
+
+    if sample > 0:
+        print('Taking {} random samples'.format(sample))
+        df = df.sample(sample)
+
+    if query:
+        print('Querying with string: {}'.format(query))
+        df = df.copy().query(query)
+
+    return df
+
+
+def check_extension(file_path, allowed_extensions=allowed_extensions):
+    p, extension = path.splitext(file_path)
+    if extension not in allowed_extensions:
+        raise IOError('Allowed formats: {}'.format(allowed_extensions))
+
+
+def pickle_model(classifier, feature_names, model_path, label_text='label'):
+    p, extension = path.splitext(model_path)
+    classifier.feature_names = feature_names
+    if (extension == '.pmml'):
+        print("Pickling model to {} ...".format(model_path))
+
+        mapper = DataFrameMapper([
+            (feature_names, None),
+            (label_text, None),
+        ])
+
+        joblib.dump(classifier, p + '.pkl', compress=4)
+        sklearn2pmml(classifier, mapper, model_path)
+
+    else:
+        joblib.dump(classifier, model_path, compress=4)
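At this point the module provides a symmetric write_data/read_data pair that dispatches on the file extension. A minimal usage sketch against this state of io.py; the file name and values are hypothetical, and note that a top-level io.py shadows the stdlib io module when imported from the project directory (the module is moved into a package in a later patch):

    import pandas as pd
    from io import read_data, write_data  # the project's top-level io.py here, not stdlib io

    df = pd.DataFrame({'energy': [1.5, 2.0, 3.5], 'size': [100, 250, 120]})

    # the extension selects the format: hdf5, json or csv
    write_data(df, 'events.hdf5', hdf_key='table')

    # read_data first tries pandas' read_hdf, then falls back to read_h5py
    df_read = read_data('events.hdf5', key='table')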
From 69bf64132028b24fd25273436cb037774e73792a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Mon, 6 Feb 2017 16:26:25 +0100
Subject: [PATCH 03/16] Implement chunked reading for read_hdf

---
 io.py | 64 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/io.py b/io.py
index 304e96c..68def32 100644
--- a/io.py
+++ b/io.py
@@ -7,6 +7,7 @@
 import h5py
 import sys
 import logging
+import numpy as np
 
 log = logging.getLogger(__name__)
 
@@ -16,13 +17,18 @@ def write_data(df, file_path, hdf_key='table'):
+
     name, extension = path.splitext(file_path)
+
     if extension in ['.hdf', '.hdf5', '.h5']:
         df.to_hdf(file_path, key=hdf_key)
+
     elif extension == '.json':
         df.to_json(file_path)
+
     elif extension == '.csv':
         df.to_csv(file_path, sep=',', index=False)
+
     else:
         raise IOError(
             'cannot write tabular data with format {}. Allowed formats: {}'.format(
@@ -31,7 +37,16 @@ def write_data(df, file_path, hdf_key='table'):
         )
 
 
-def read_h5py(file_path, group_name='events', columns=None):
+def to_native_byteorder(array):
+    ''' Convert numpy array to native byteorder '''
+
+    if array.dtype.byteorder not in ('=', native_byteorder):
+        return array.byteswap().newbyteorder()
+
+    return array
+
+
+def read_h5py(file_path, key='events', columns=None, chunksize=None):
     '''
     Read a hdf5 file written with h5py into a dataframe
 
@@ -39,42 +54,61 @@ def read_h5py(file_path, key='events', columns=None, chunksize=None):
     ----------
     file_path: str
         file to read in
-    group_name: str
+    key: str
         name of the hdf5 group to read in
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
-    df = pd.DataFrame()
 
     with h5py.File(file_path) as f:
-        group = f.get(group_name)
+        group = f.get(key)
+        if group is None:
+            raise IOError('File does not contain group "{}"'.format(key))
         # get all columns which don't have more than one value per row
         if columns is None:
            columns = [col for col in group.keys() if group[col].ndim == 1]
 
-        for col in columns:
-            if group[col].dtype.byteorder not in ('=', native_byteorder):
-                df[col] = group[col][:].byteswap().newbyteorder()
-            else:
-                df[col] = group[col]
+        # read all columns and rows in one dataframe if no chunksize given
+        if chunksize is None:
+            df = pd.DataFrame()
+            for col in columns:
+                df[col] = to_native_byteorder(group[col][:])
 
-    return df
+            return df
+
+        # read data in chunks if chunksize is given
+        n_events = group[next(iter(group.keys()))].shape[0]
+        chunks = int(np.ceil(n_events / chunksize))
+
+        for chunk in range(chunks):
+
+            start = chunk * chunksize
+            end = min(n_events, (chunk + 1) * chunksize)
 
+            df = pd.DataFrame(index=np.arange(start, end))
 
-def read_pandas_hdf5(file_path, key=None, columns=None):
-    df = pd.read_hdf(file_path, key=key, columns=columns)
+            for col in columns:
+                df[col] = to_native_byteorder(group[col][start:end])
+
+            yield df
+
+
+def read_pandas_hdf5(file_path, key=None, columns=None, chunksize=None):
+    df = pd.read_hdf(file_path, key=key, columns=columns, chunksize=chunksize)
     return df
 
 
-def read_data(file_path, query=None, sample=-1, key=None, columns=None):
+def read_data(file_path, query=None, sample=-1, key=None, columns=None, chunksize=None):
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
         try:
-            df = read_pandas_hdf5(file_path, key=key, columns=columns)
+            df = read_pandas_hdf5(
+                file_path, key=key, columns=columns, chunksize=chunksize
+            )
         except (TypeError, ValueError):
 
-            df = read_h5py(file_path, columns=columns)
+            df = read_h5py(file_path, key=key, columns=columns, chunksize=chunksize)
 
     elif extension == '.json':
         with open(file_path, 'r') as j:
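to_native_byteorder exists because hdf5 files may store data in big-endian order while pandas expects the machine's native order. A standalone numpy sketch of the same conversion, independent of this module:

    import numpy as np

    # a big-endian float32 array, as it could come out of an hdf5 dataset
    big = np.array([1.0, 2.0, 3.0], dtype='>f4')
    print(big.dtype.byteorder)  # '>' on little-endian machines

    # byteswap the data, then flip the dtype's byteorder flag;
    # the values are unchanged, only the in-memory representation is
    native = big.byteswap().newbyteorder()
    print(native.dtype.byteorder)  # '<' on little-endian machines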
From 32b088238e5ebfa74b590ebf4606dfaa4d91056e Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Tue, 7 Feb 2017 15:24:39 +0100
Subject: [PATCH 04/16] Fix read_h5py generator

---
 io.py | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/io.py b/io.py
index 68def32..1ccb827 100644
--- a/io.py
+++ b/io.py
@@ -60,23 +60,39 @@ def read_h5py(file_path, key='events', columns=None, chunksize=None):
         Names of the datasets to read in. If not given read all 1d datasets
     '''
+    # read all columns and rows in one dataframe if no chunksize given
+    if chunksize is None:
+        with h5py.File(file_path) as f:
+            group = f.get(key)
+            if group is None:
+                raise IOError('File does not contain group "{}"'.format(key))
+
+            # get all columns which don't have more than one value per row
+            if columns is None:
+                columns = [col for col in group.keys() if group[col].ndim == 1]
+
+            df = pd.DataFrame()
+            for col in columns:
+                df[col] = to_native_byteorder(group[col][:])
+
+            return df
+
+    # read data in chunks if chunksize is given
+    return read_h5py_chunked(
+        file_path, key=key, columns=columns, chunksize=chunksize
+    )
+
+
+def read_h5py_chunked(file_path, key='events', columns=None, chunksize=10000):
     with h5py.File(file_path) as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
 
+        # get all columns which don't have more than one value per row
         if columns is None:
             columns = [col for col in group.keys() if group[col].ndim == 1]
 
-        # read all columns and rows in one dataframe if no chunksize given
-        if chunksize is None:
-            df = pd.DataFrame()
-            for col in columns:
-                df[col] = to_native_byteorder(group[col][:])
-
-            return df
-
-        # read data in chunks if chunksize is given
         n_events = group[next(iter(group.keys()))].shape[0]
         chunks = int(np.ceil(n_events / chunksize))
@@ -104,11 +120,19 @@ def read_data(file_path, query=None, sample=-1, key=None, columns=None, chunksiz
     if extension in ['.hdf', '.hdf5', '.h5']:
         try:
             df = read_pandas_hdf5(
-                file_path, key=key, columns=columns, chunksize=chunksize
+                file_path,
+                key=key or 'table',
+                columns=columns,
+                chunksize=chunksize,
             )
         except (TypeError, ValueError):
 
-            df = read_h5py(file_path, key=key, columns=columns, chunksize=chunksize)
+            df = read_h5py(
+                file_path,
+                key=key or 'events',
+                columns=columns,
+                chunksize=chunksize,
+            )
 
     elif extension == '.json':
         with open(file_path, 'r') as j:

From 109ceacf06585d7c91f024ec508950c42ef9dbad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Tue, 7 Feb 2017 16:29:29 +0100
Subject: [PATCH 05/16] Allow setting number of signal and background events
 for separator

---
 io.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/io.py b/io.py
index 1ccb827..886e1b7 100644
--- a/io.py
+++ b/io.py
@@ -114,7 +114,7 @@ def read_pandas_hdf5(file_path, key=None, columns=None, chunksize=None):
     return df
 
 
-def read_data(file_path, query=None, sample=-1, key=None, columns=None, chunksize=None):
+def read_data(file_path, key=None, columns=None, chunksize=None):
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
@@ -141,14 +141,6 @@ def read_data(file_path, key=None, columns=None, chunksize=None):
     else:
         raise NotImplementedError('Unknown data file extension {}'.format(extension))
 
-    if sample > 0:
-        print('Taking {} random samples'.format(sample))
-        df = df.sample(sample)
-
-    if query:
-        print('Querying with string: {}'.format(query))
-        df = df.copy().query(query)
-
     return df
From 7c598c528d86fe65e3542b839596c86eaf5ba496 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Wed, 8 Feb 2017 09:48:44 +0100
Subject: [PATCH 06/16] Implement chunking in apply_separation_model

---
 io.py | 54 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/io.py b/io.py
index 886e1b7..7d68672 100644
--- a/io.py
+++ b/io.py
@@ -46,7 +46,7 @@ def to_native_byteorder(array):
     return array
 
 
-def read_h5py(file_path, key='events', columns=None, chunksize=None):
+def read_h5py(file_path, key='events', columns=None):
     '''
     Read a hdf5 file written with h5py into a dataframe
 
@@ -59,31 +59,29 @@ def read_h5py(file_path, key='events', columns=None, chunksize=None):
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
+    with h5py.File(file_path) as f:
+        group = f.get(key)
+        if group is None:
+            raise IOError('File does not contain group "{}"'.format(key))
 
-    # read all columns and rows in one dataframe if no chunksize given
-    if chunksize is None:
-        with h5py.File(file_path) as f:
-            group = f.get(key)
-            if group is None:
-                raise IOError('File does not contain group "{}"'.format(key))
-
-            # get all columns which don't have more than one value per row
-            if columns is None:
-                columns = [col for col in group.keys() if group[col].ndim == 1]
+        # get all columns which don't have more than one value per row
+        if columns is None:
+            columns = [col for col in group.keys() if group[col].ndim == 1]
 
-            df = pd.DataFrame()
-            for col in columns:
-                df[col] = to_native_byteorder(group[col][:])
+        df = pd.DataFrame()
+        for col in columns:
+            df[col] = to_native_byteorder(group[col][:])
 
-            return df
+        return df
 
-    # read data in chunks if chunksize is given
-    return read_h5py_chunked(
-        file_path, key=key, columns=columns, chunksize=chunksize
-    )
 
+def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
+    '''
+    Generator function to read from h5py hdf5 in chunks,
+    returns an iterator over pandas dataframes.
+
+    When chunksize is None, use 1 chunk
+    '''
     with h5py.File(file_path) as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
@@ -94,9 +92,15 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
         if columns is None:
             columns = [col for col in group.keys() if group[col].ndim == 1]
 
         n_events = group[next(iter(group.keys()))].shape[0]
-        chunks = int(np.ceil(n_events / chunksize))
 
-        for chunk in range(chunks):
+        if chunksize is None:
+            n_chunks = 1
+            chunksize = n_events
+        else:
+            n_chunks = int(np.ceil(n_events / chunksize))
+            log.info('Splitting data into {} chunks'.format(n_chunks))
+
+        for chunk in range(n_chunks):
 
             start = chunk * chunksize
             end = min(n_events, (chunk + 1) * chunksize)
@@ -106,7 +110,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
             for col in columns:
                 df[col] = to_native_byteorder(group[col][start:end])
 
-            yield df
+            yield df, start, end
 
 
 def read_pandas_hdf5(file_path, key=None, columns=None, chunksize=None):
     df = pd.read_hdf(file_path, key=key, columns=columns, chunksize=chunksize)
     return df
 
 
-def read_data(file_path, key=None, columns=None, chunksize=None):
+def read_data(file_path, key=None, columns=None):
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
         try:
             df = read_pandas_hdf5(
                 file_path,
                 key=key or 'table',
                 columns=columns,
-                chunksize=chunksize,
             )
         except (TypeError, ValueError):
 
             df = read_h5py(
                 file_path,
                 key=key or 'events',
                 columns=columns,
-                chunksize=chunksize,
             )
 
     elif extension == '.json':
         with open(file_path, 'r') as j:
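With this patch read_h5py_chunked is a plain generator yielding (dataframe, start, end) triples, so a caller knows which rows of the file each chunk covers, e.g. for writing predictions back to the right slice. A consumer sketch; file, group and column names are hypothetical:

    from io import read_h5py_chunked  # still the top-level io.py at this point

    for df, start, end in read_h5py_chunked('gammas.hdf5', key='events', chunksize=100000):
        # each df holds the rows [start, end) of the 'events' group
        print('rows {} to {}: mean size = {:.1f}'.format(start, end, df['size'].mean()))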
From 7d1b50ed04ba48719f2afbb18ccad4dfe3ff40a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Wed, 8 Feb 2017 10:51:06 +0100
Subject: [PATCH 07/16] Add __all__ to io.py

---
 io.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/io.py b/io.py
index 7d68672..1400608 100644
--- a/io.py
+++ b/io.py
@@ -9,6 +9,11 @@ import logging
 import numpy as np
 
+__all__ = [
+    'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked',
+    'read_pandas_hdf5', 'pickle_model', 'check_extension', 'read_data'
+]
+
 log = logging.getLogger(__name__)

From 76810293da0c39d73ef1d22ac121f23c35d58062 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Thu, 9 Feb 2017 18:36:17 +0100
Subject: [PATCH 08/16] Add first version of apply_cuts

---
 io.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/io.py b/io.py
index 1400608..6f57560 100644
--- a/io.py
+++ b/io.py
@@ -80,6 +80,17 @@ def read_h5py(file_path, key='events', columns=None):
     return df
 
 
+def h5py_get_n_events(file_path, key='events'):
+
+    with h5py.File(file_path) as f:
+        group = f.get(key)
+
+        if group is None:
+            raise IOError('File does not contain group "{}"'.format(key))
+
+        return group[next(iter(group.keys()))].shape[0]
+
+
 def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
     '''
     Generator function to read from h5py hdf5 in chunks,
@@ -96,7 +107,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
         if columns is None:
             columns = [col for col in group.keys() if group[col].ndim == 1]
 
-        n_events = group[next(iter(group.keys()))].shape[0]
+        n_events = h5py_get_n_events(file_path, key=key)
 
         if chunksize is None:
             n_chunks = 1
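h5py_get_n_events reads the length of the first dataset in the group, and read_h5py_chunked now uses it to size its chunks. The same chunk arithmetic done by hand; the file name is hypothetical:

    import numpy as np
    from io import h5py_get_n_events

    n_events = h5py_get_n_events('gammas.hdf5', key='events')
    chunksize = 100000

    # identical to the n_chunks computed inside read_h5py_chunked
    n_chunks = int(np.ceil(n_events / chunksize))
    print(n_events, n_chunks)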
From df4e92f2683f369337ba3f7afe7376ec7d4abbb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Feb 2017 16:33:47 +0100
Subject: [PATCH 09/16] Open h5py files in r mode, which throws an error if
 the file does not exist

---
 io.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/io.py b/io.py
index 6f57560..8f13aa1 100644
--- a/io.py
+++ b/io.py
@@ -64,7 +64,7 @@ def read_h5py(file_path, key='events', columns=None):
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
-    with h5py.File(file_path) as f:
+    with h5py.File(file_path, 'r') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
@@ -82,7 +82,7 @@ def h5py_get_n_events(file_path, key='events'):
 
-    with h5py.File(file_path) as f:
+    with h5py.File(file_path, 'r') as f:
         group = f.get(key)
 
         if group is None:
@@ -98,7 +98,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
     When chunksize is None, use 1 chunk
     '''
-    with h5py.File(file_path) as f:
+    with h5py.File(file_path, 'r') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))

From 40b86f5c5e05feb2a4f5dbe4a980043b98fc57a9 Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Wed, 1 Mar 2017 12:52:41 +0100
Subject: [PATCH 10/16] Fix h5py file modes

---
 io.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/io.py b/io.py
index 8f13aa1..643f00a 100644
--- a/io.py
+++ b/io.py
@@ -64,7 +64,7 @@ def read_h5py(file_path, key='events', columns=None):
     columns: iterable[str]
         Names of the datasets to read in. If not given read all 1d datasets
     '''
-    with h5py.File(file_path, 'r') as f:
+    with h5py.File(file_path, 'r+') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))
@@ -82,7 +82,7 @@ def h5py_get_n_events(file_path, key='events'):
 
-    with h5py.File(file_path, 'r') as f:
+    with h5py.File(file_path, 'r+') as f:
         group = f.get(key)
 
         if group is None:
@@ -98,7 +98,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
     When chunksize is None, use 1 chunk
     '''
-    with h5py.File(file_path, 'r') as f:
+    with h5py.File(file_path, 'r+') as f:
         group = f.get(key)
         if group is None:
             raise IOError('File does not contain group "{}"'.format(key))

From ab81dd753e9fc25f6032d767c4278ee54eb27a65 Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Thu, 23 Mar 2017 17:08:28 +0100
Subject: [PATCH 11/16] Add support for 2d columns

---
 io.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/io.py b/io.py
index 643f00a..8637631 100644
--- a/io.py
+++ b/io.py
@@ -8,6 +8,7 @@ import sys
 import logging
 import numpy as np
+from copy import copy
 
 __all__ = [
     'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked',
@@ -75,7 +76,14 @@ def read_h5py(file_path, key='events', columns=None):
 
         df = pd.DataFrame()
         for col in columns:
-            df[col] = to_native_byteorder(group[col][:])
+            array = to_native_byteorder(group[col][:])
+            if array.ndim == 1:
+                df[col] == array
+            elif array.ndim == 2:
+                for i in range(array.shape[1]):
+                    df[col + '_{}'.format(i)] = array[:, i]
+            else:
+                log.warning('Skipping column {}, not 1d or 2d'.format(col))
 
         return df
@@ -116,6 +124,11 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
             n_chunks = int(np.ceil(n_events / chunksize))
             log.info('Splitting data into {} chunks'.format(n_chunks))
 
+        for col in copy(columns):
+            if group[col].ndim > 2:
+                columns.remove(col)
+                log.warning('Ignoring column {}, not 1d or 2d'.format(col))
+
         for chunk in range(n_chunks):
 
             start = chunk * chunksize
             end = min(n_events, (chunk + 1) * chunksize)
@@ -124,7 +137,14 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
             df = pd.DataFrame(index=np.arange(start, end))
 
             for col in columns:
-                df[col] = to_native_byteorder(group[col][start:end])
+                array = to_native_byteorder(group[col][start:end])
+
+                if array.ndim == 1:
+                    df[col] == array
+
+                else:
+                    for i in range(array.shape[1]):
+                        df[col + '_{}'.format(i)] = array[:, i]
 
             yield df, start, end

From 14feb34f885e72614e1ddaeeb85870ab40972d45 Mon Sep 17 00:00:00 2001
From: Maximilian Noethe
Date: Thu, 23 Mar 2017 17:13:01 +0100
Subject: [PATCH 12/16] Fix stupid == typo

---
 io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io.py b/io.py
index 8637631..33fa53b 100644
--- a/io.py
+++ b/io.py
@@ -78,7 +78,7 @@ def read_h5py(file_path, key='events', columns=None):
         for col in columns:
             array = to_native_byteorder(group[col][:])
             if array.ndim == 1:
-                df[col] == array
+                df[col] = array
             elif array.ndim == 2:
                 for i in range(array.shape[1]):
                     df[col + '_{}'.format(i)] = array[:, i]
@@ -140,7 +140,7 @@ def read_h5py_chunked(file_path, key='events', columns=None, chunksize=None):
                 array = to_native_byteorder(group[col][start:end])
 
                 if array.ndim == 1:
-                    df[col] == array
+                    df[col] = array
 
                 else:
                     for i in range(array.shape[1]):
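After patches 11 and 12, a 2d dataset passed via columns is flattened into numbered dataframe columns; note that the default column discovery still only picks up 1d datasets, so 2d columns must be requested explicitly. A round-trip sketch with made-up file, group and dataset names:

    import h5py
    import numpy as np
    from io import read_h5py

    # a 2d dataset 'cog' with shape (n_events, 2) next to a 1d dataset 'size'
    with h5py.File('example.hdf5', 'w') as f:
        f.create_dataset('events/cog', data=np.array([[1.0, 2.0], [3.0, 4.0]]))
        f.create_dataset('events/size', data=np.array([100.0, 250.0]))

    # the 2d column comes back flattened as 'cog_0' and 'cog_1'
    df = read_h5py('example.hdf5', key='events', columns=['cog', 'size'])
    print(df.columns.tolist())  # ['cog_0', 'cog_1', 'size']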
From f37bd88add482f211e8358d97443d58c3c6e4f31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Mar 2017 13:43:09 +0100
Subject: [PATCH 13/16] Move io.py into fact package

---
 io.py => fact/io.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename io.py => fact/io.py (100%)

diff --git a/io.py b/fact/io.py
similarity index 100%
rename from io.py
rename to fact/io.py

From f914ad34137435736cc1c968bd9be61a5a4c643b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Mar 2017 13:47:05 +0100
Subject: [PATCH 14/16] Adapt io.py to pyfact

---
 fact/io.py | 36 +++++++++++------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/fact/io.py b/fact/io.py
index 33fa53b..9975b9d 100644
--- a/fact/io.py
+++ b/fact/io.py
@@ -1,9 +1,6 @@
 from os import path
 import pandas as pd
 import json
-from sklearn_pandas import DataFrameMapper
-from sklearn.externals import joblib
-from sklearn2pmml import sklearn2pmml
 import h5py
 import sys
 import logging
@@ -11,8 +8,13 @@ import numpy as np
 from copy import copy
 
 __all__ = [
-    'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked',
-    'read_pandas_hdf5', 'pickle_model', 'check_extension', 'read_data'
+    'write_data',
+    'to_native_byteorder',
+    'read_data',
+    'read_h5py',
+    'read_h5py_chunked',
+    'read_pandas_hdf5',
+    'check_extension',
 ]
 
 log = logging.getLogger(__name__)
@@ -22,12 +24,12 @@ log = logging.getLogger(__name__)
 allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.csv')
 native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]
 
 
-def write_data(df, file_path, hdf_key='table'):
+def write_data(df, file_path, key='table'):
 
     name, extension = path.splitext(file_path)
 
     if extension in ['.hdf', '.hdf5', '.h5']:
-        df.to_hdf(file_path, key=hdf_key)
+        df.to_hdf(file_path, key=key, format='table')
 
     elif extension == '.json':
         df.to_json(file_path)
@@ -176,6 +178,8 @@ def read_data(file_path, key=None, columns=None):
         with open(file_path, 'r') as j:
             d = json.load(j)
             df = pd.DataFrame(d)
+    elif extension in ('.jsonl', '.jsonlines'):
+        df = pd.read_json(file_path, lines=True)
     else:
         raise NotImplementedError('Unknown data file extension {}'.format(extension))
 
@@ -186,21 +190,3 @@ def check_extension(file_path, allowed_extensions=allowed_extensions):
     p, extension = path.splitext(file_path)
     if extension not in allowed_extensions:
         raise IOError('Allowed formats: {}'.format(allowed_extensions))
-
-
-def pickle_model(classifier, feature_names, model_path, label_text='label'):
-    p, extension = path.splitext(model_path)
-    classifier.feature_names = feature_names
-    if (extension == '.pmml'):
-        print("Pickling model to {} ...".format(model_path))
-
-        mapper = DataFrameMapper([
-            (feature_names, None),
-            (label_text, None),
-        ])
-
-        joblib.dump(classifier, p + '.pkl', compress=4)
-        sklearn2pmml(classifier, mapper, model_path)
-
-    else:
-        joblib.dump(classifier, model_path, compress=4)

From a989dd3c6b5ee282676f8ae6b43f95423f646f66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Fri, 24 Mar 2017 13:48:00 +0100
Subject: [PATCH 15/16] Bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3b4b54a..11c7a43 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='pyfact',
-    version='0.8.8',
+    version='0.9.0',
     description='A module containing useful methods for working with fact',
     url='http://github.com/fact-project/pyfact',
     author='Maximilian Noethe, Dominik Neise',

From c52563d07793089577c321b26399dbbea8c36995 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20N=C3=B6the?=
Date: Tue, 28 Mar 2017 15:17:32 +0200
Subject: [PATCH 16/16] Adapt to Kai's comments

---
 fact/io.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fact/io.py b/fact/io.py
index 9975b9d..38c78aa 100644
--- a/fact/io.py
+++ b/fact/io.py
@@ -20,7 +20,7 @@ __all__ = [
 log = logging.getLogger(__name__)
 
 
-allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.csv')
+allowed_extensions = ('.hdf', '.hdf5', '.h5', '.json', '.jsonl', '.jsonlines', '.csv')
 native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder]
 
 
@@ -34,13 +34,16 @@ def write_data(df, file_path, key='table'):
     elif extension == '.json':
         df.to_json(file_path)
 
+    elif extension in ('.jsonl', '.jsonlines'):
+        df.to_json(file_path, lines=True, orient='records')
+
     elif extension == '.csv':
         df.to_csv(file_path, sep=',', index=False)
 
     else:
         raise IOError(
             'cannot write tabular data with format {}. Allowed formats: {}'.format(
-                extension, 'hdf5, json, csv'
+                extension, allowed_extensions,
             )
         )
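A closing sketch against the final state of the series: the module now lives at fact/io.py and both the read and write sides of the jsonl support are in place. File name and values are made up:

    import pandas as pd
    from fact.io import read_data, write_data

    df = pd.DataFrame({'event_num': [1, 2, 3], 'size': [110.0, 95.5, 301.2]})

    # one json record per line, matching the new '.jsonl' branches
    write_data(df, 'events.jsonl')
    df_read = read_data('events.jsonl')
    print(df_read)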