From f914ad34137435736cc1c968bd9be61a5a4c643b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20N=C3=B6the?= Date: Fri, 24 Mar 2017 13:47:05 +0100 Subject: [PATCH] Adapt io.py to pyfact --- fact/io.py | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/fact/io.py b/fact/io.py index 33fa53b..9975b9d 100644 --- a/fact/io.py +++ b/fact/io.py @@ -1,9 +1,6 @@ from os import path import pandas as pd import json -from sklearn_pandas import DataFrameMapper -from sklearn.externals import joblib -from sklearn2pmml import sklearn2pmml import h5py import sys import logging @@ -11,8 +8,13 @@ from copy import copy __all__ = [ - 'write_data', 'to_native_byteorder', 'read_h5py', 'read_h5py_chunked', - 'read_pandas_hdf5', 'pickle_model', 'check_extension', 'read_data' + 'write_data', + 'to_native_byteorder', + 'read_data', + 'read_h5py', + 'read_h5py_chunked', + 'read_pandas_hdf5', + 'check_extension', ] log = logging.getLogger(__name__) @@ -22,12 +24,12 @@ native_byteorder = native_byteorder = {'little': '<', 'big': '>'}[sys.byteorder] -def write_data(df, file_path, hdf_key='table'): +def write_data(df, file_path, key='table'): name, extension = path.splitext(file_path) if extension in ['.hdf', '.hdf5', '.h5']: - df.to_hdf(file_path, key=hdf_key) + df.to_hdf(file_path, key=key, format='table') elif extension == '.json': df.to_json(file_path) @@ -176,6 +178,8 @@ def read_data(file_path, key=None, columns=None): with open(file_path, 'r') as j: d = json.load(j) df = pd.DataFrame(d) + elif extension in ('.jsonl', '.jsonlines'): + df = pd.read_json(file_path, lines=True) else: raise NotImplementedError('Unknown data file extension {}'.format(extension)) @@ -186,21 +190,3 @@ def check_extension(file_path, allowed_extensions=allowed_extensions): p, extension = path.splitext(file_path) if extension not in allowed_extensions: raise IOError('Allowed formats: {}'.format(allowed_extensions)) - - -def pickle_model(classifier, feature_names, model_path, label_text='label'): - p, extension = path.splitext(model_path) - classifier.feature_names = feature_names - if (extension == '.pmml'): - print("Pickling model to {} ...".format(model_path)) - - mapper = DataFrameMapper([ - (feature_names, None), - (label_text, None), - ]) - - joblib.dump(classifier, p + '.pkl', compress=4) - sklearn2pmml(classifier, mapper, model_path) - - else: - joblib.dump(classifier, model_path, compress=4)