diff --git a/jlab_datascience_toolkit/data_parser/__init__.py b/jlab_datascience_toolkit/data_parser/__init__.py index 44d6f0d..4bb6bae 100644 --- a/jlab_datascience_toolkit/data_parser/__init__.py +++ b/jlab_datascience_toolkit/data_parser/__init__.py @@ -8,6 +8,7 @@ from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser register( - id="PandasParser_v0", - entry_point="jlab_datascience_toolkit.data_parser.pandas_parser_v0:PandasParser" + id='CSVParser_v0', + entry_point="jlab_datascience_toolkit.data_parser.parser_to_dataframe:Parser2DataFrame", + kwargs={'registry_config': {'file_format': 'csv'}} ) \ No newline at end of file diff --git a/jlab_datascience_toolkit/data_parser/pandas_parser_v0.py b/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py similarity index 78% rename from jlab_datascience_toolkit/data_parser/pandas_parser_v0.py rename to jlab_datascience_toolkit/data_parser/parser_to_dataframe.py index a90976a..c39d1c8 100644 --- a/jlab_datascience_toolkit/data_parser/pandas_parser_v0.py +++ b/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py @@ -6,7 +6,7 @@ import inspect import os -pandas_parser_log = logging.getLogger('PandasParser_v0 Logger') +parser_log = logging.getLogger('Parser Logger') # Supported file formats pandas_read_functions = dict( @@ -16,7 +16,7 @@ pickle=pd.read_pickle ) -class PandasParser(JDSTDataParser): +class Parser2DataFrame(JDSTDataParser): """Reads a list of files and concatenates them in a Pandas DataFrame. Intialization arguments: @@ -30,8 +30,9 @@ class PandasParser(JDSTDataParser): Format of files to parse. Currently supports csv, feather, json and pickle. 
Defaults to csv `read_kwargs: dict = {}` - Arguments to be passed + Arguments to be passed to the read function determined by `file_format` `concat_kwargs: dict = {}` + Arguments to be passed to pd.concat() Attributes ---------- @@ -59,10 +60,15 @@ class PandasParser(JDSTDataParser): """ - def __init__(self, config: dict = None): + def __init__(self, config: dict = None, registry_config: dict = None): # It is important not to use default mutable arguments in python # (lists/dictionaries), so we set config to None and update later + # Priority for configurations is: + # 1) config (intended for users) + # 2) registry_config (intended only for the registry) + # 3) defaults (set below) + # Set default config self.config = dict( filepaths=[], @@ -70,8 +76,15 @@ def __init__(self, config: dict = None): read_kwargs = {}, concat_kwargs = {}, ) - # Update configuration with new configuration + + # First update defaults with registry_configuration + if registry_config is not None: + parser_log.debug(f'Updating defaults with: {registry_config}') + self.config.update(registry_config) + + # Now update configuration with new (user) configuration if config is not None: + parser_log.debug(f'Updating registered config with: {config}') self.config.update(config) # To handle strings and lists of strings, we convert the former here @@ -82,7 +95,7 @@ def __init__(self, config: dict = None): @property def name(self): - return 'PandasParser_v0' + return 'Parser2DataFrame_v0' def setup(self): # Set the correct reading function here @@ -90,13 +103,13 @@ def setup(self): self.config['file_format'].lower(), None) if self.read_function is None: - pandas_parser_log.error( + parser_log.error( f'File format {self.config["file_format"]}' 'is not currently supported.') raise ValueError def get_info(self): - """ Prints the docstring for the PandasParser module""" + """ Prints the docstring for the Parser2DataFrame module""" print(inspect.getdoc(self)) def load(self, path: str): @@ -133,7 +146,7 @@ 
def load_data(self) -> pd.DataFrame: """ data_list = [] for file in self.config['filepaths']: - pandas_parser_log.debug(f'Loading {file} ...') + parser_log.debug(f'Loading {file} ...') data = self.read_function( file, **self.config['read_kwargs']) @@ -141,7 +154,7 @@ def load_data(self) -> pd.DataFrame: # Check for empty data and return nothing if empty if not data_list: - pandas_parser_log.warning( + parser_log.warning( 'load_data() returning None. This is probably not what you ' 'wanted. Ensure that your configuration includes the key ' '"filepaths"') @@ -154,12 +167,12 @@ def load_data(self) -> pd.DataFrame: return output def load_config(self, path: str): - pandas_parser_log.debug('Calling load()...') + parser_log.debug('Calling load()...') return self.load(path) def save_config(self, path: str): - pandas_parser_log.debug('Calling save()...') + parser_log.debug('Calling save()...') return self.save(path) def save_data(self): - return super().save_data() \ No newline at end of file + return super().save_data() diff --git a/jlab_datascience_toolkit/utils/parser_utilities.py b/jlab_datascience_toolkit/utils/parser_utilities.py new file mode 100644 index 0000000..b294ba9 --- /dev/null +++ b/jlab_datascience_toolkit/utils/parser_utilities.py @@ -0,0 +1,41 @@ +import os +import pathlib +import yaml +import pandas as pd + +def save_config_to_yaml(config, path): + save_path = pathlib.Path(path) + os.makedirs(save_path) + with open(save_path.joinpath('config.yaml'), 'w') as f: + yaml.safe_dump(config, f) + +def load_yaml_config(path): + base_path = pathlib.Path(path) + with open(base_path.joinpath('config.yaml'), 'r') as f: + config = yaml.safe_load(f) + return config + +def read_data_to_pandas(filepaths: list, file_format: str, **kwargs) -> list: + """ Loads all files listed in filepaths and reads them. 
+ All kwargs other than filepaths and file_format will be passed to the read_function + for its associated file_format + + Returns: + list: A list of DataFrames, one per file in filepaths + """ + + # Supported file formats + read_functions = dict( + csv=pd.read_csv, + feather=pd.read_feather, + json=pd.read_json, + pickle=pd.read_pickle + ) + + data_list = [] + read_function = read_functions[file_format] + for file in filepaths: + data = read_function(file, **kwargs) + data_list.append(data) + + return data_list \ No newline at end of file diff --git a/utests/utest_pandas_parser_v0.py b/utests/utest_csv_parser.py similarity index 71% rename from utests/utest_pandas_parser_v0.py rename to utests/utest_csv_parser.py index 1d7b5ad..3bc3822 100644 --- a/utests/utest_pandas_parser_v0.py +++ b/utests/utest_csv_parser.py @@ -1,26 +1,33 @@ from jlab_datascience_toolkit.data_parser import make import unittest +import logging import matplotlib.pyplot as plt import pandas as pd import numpy as np +import argparse +import shutil +import sys import os +test_log = logging.Logger('test_logger') + rng = np.random.default_rng(seed=42) +parser_id = 'CSVParser_v0' -class TestPandasParserv0(unittest.TestCase): +class TestCSVParserv0(unittest.TestCase): # Initialize: # ***************************************** def __init__(self, *args, **kwargs): - super(TestPandasParserv0, self).__init__(*args, **kwargs) + super(TestCSVParserv0, self).__init__(*args, **kwargs) @classmethod def setUpClass(self) -> None: print('Setting up all tests...') self.columns = ['R121GMES', 'R122GMES', 'R123GMES', 'R121GSET', 'R122GSET', 'R123GSET'] - self.path = './pandas_parser_utest.csv' + self.path = './csv_parser_utest.csv' self.samples = 100 data = rng.normal(loc=5, scale=1, size=( self.samples, len(self.columns))) @@ -34,7 +41,7 @@ def setUpClass(self) -> None: test_data test_data.to_csv(self.path) - self.path2 = './pandas_parser_utest2.csv' + self.path2 = './csv_parser_utest2.csv' data = 
rng.normal(loc=9, scale=2, size=( self.samples, len(self.columns))) dates = [] @@ -64,14 +71,14 @@ def tearDown(self) -> None: def test_no_config(self): print('*****No Config Test*****\n') - parser = make('PandasParser_v0') + parser = make(parser_id) output = parser.load_data() self.assertIsNone(output) def test_string_filepaths(self): print('*****String Filepaths Test*****\n') - parser = make('PandasParser_v0', config=dict(filepaths=self.path)) + parser = make(parser_id, config=dict(filepaths=self.path)) output = parser.load_data() print('Output Head:\n', output.head()) @@ -80,14 +87,14 @@ def test_string_filepaths(self): def test_one_item_list_filepaths(self): print('*****One Item List Test*****\n') - parser = make('PandasParser_v0', config=dict(filepaths=[self.path])) + parser = make(parser_id, config=dict(filepaths=[self.path])) output = parser.load_data() print('Output Head:\n', output.head()) self.assertEqual(output.shape, (self.samples, len(self.columns)+1)) def test_two_filepaths(self): print('*****Two Filepaths Test*****\n') - parser = make('PandasParser_v0', config=dict(filepaths=[self.path, self.path2])) + parser = make(parser_id, config=dict(filepaths=[self.path, self.path2])) output = parser.load_data() print('Output Head:\n', output.head()) print('Output shape:', output.shape) @@ -97,7 +104,7 @@ def test_usecols_read_arg(self): print('*****Usecols Read Arg Test*****\n') two_columns = ['R121GMES', 'R121GSET'] - parser = make('PandasParser_v0', config=dict( + parser = make(parser_id, config=dict( filepaths=self.path, read_kwargs=dict(usecols=two_columns))) output = parser.load_data() print('Output Head:\n', output.head()) @@ -110,7 +117,7 @@ def test_use_datetime_index(self): def column_lambda(x): return ('GMES' in x) or (x == 'Date') read_kwargs = dict(usecols=column_lambda, index_col='Date', parse_dates=True) - parser = make('PandasParser_v0', + parser = make(parser_id, config=dict( filepaths=self.path, read_kwargs=read_kwargs) ) @@ -121,7 +128,27 @@ 
def column_lambda(x): return ('GMES' in x) or (x == 'Date') self.assertTrue('GMES' in column) self.assertIsInstance(output.index, pd.DatetimeIndex) + def test_save_load(self): + print('*****Save/Load Test*****\n') -# Run this file via: python utest_pandas_parser_v0.py + parser = make(parser_id, config=dict(filepaths=self.path, read_kwargs={'usecols': self.columns})) + output = parser.load_data() + save_path = './temp_parser' + try: + parser.save(save_path) + new_parser = make(parser_id) + new_parser.load(save_path) + new_output = new_parser.load_data() + for col in output.columns: + with self.subTest(col=col): + self.assertTrue(np.allclose(output[col], new_output[col])) + finally: + shutil.rmtree(save_path) + pass + +# Run this file via: python utest_csv_parser.py if __name__ == "__main__": + argv = len(sys.argv) > 1 and sys.argv[1] + loglevel = logging.DEBUG if argv == '-v' else logging.WARNING + logging.basicConfig(stream=sys.stdout, level=loglevel) unittest.main()