Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

24 common csv parser #35

Merged
merged 10 commits into from
May 2, 2024
5 changes: 3 additions & 2 deletions jlab_datascience_toolkit/data_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser

register(
id="PandasParser_v0",
entry_point="jlab_datascience_toolkit.data_parser.pandas_parser_v0:PandasParser"
id='CSVParser_v0',
entry_point="jlab_datascience_toolkit.data_parser.parser_to_dataframe:Parser2DataFrame",
kwargs={'registry_config': {'file_format': 'csv'}}
)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import inspect
import os

pandas_parser_log = logging.getLogger('PandasParser_v0 Logger')
parser_log = logging.getLogger('Parser Logger')

# Supported file formats
pandas_read_functions = dict(
Expand All @@ -16,7 +16,7 @@
pickle=pd.read_pickle
)

class PandasParser(JDSTDataParser):
class Parser2DataFrame(JDSTDataParser):
"""Reads a list of files and concatenates them in a Pandas DataFrame.

Intialization arguments:
Expand All @@ -30,8 +30,9 @@ class PandasParser(JDSTDataParser):
Format of files to parse. Currently supports csv, feather, json
and pickle. Defaults to csv
`read_kwargs: dict = {}`
Arguments to be passed
Arguments to be passed to the read function determined by `file_format`
`concat_kwargs: dict = {}`
Arguments to be passed to pd.concat()

Attributes
----------
Expand Down Expand Up @@ -59,19 +60,31 @@ class PandasParser(JDSTDataParser):

"""

def __init__(self, config: dict = None):
def __init__(self, config: dict = None, registry_config: dict = None):
# It is important not to use default mutable arguments in python
# (lists/dictionaries), so we set config to None and update later

# Priority for configurations is:
# 1) config (intended for users)
# 2) registry_config (intended only for the registry)
# 3) defaults (set below)

# Set default config
self.config = dict(
filepaths=[],
file_format='csv',
read_kwargs = {},
concat_kwargs = {},
)
# Update configuration with new configuration

# First update defaults with registry_configuration
if registry_config is not None:
parser_log.debug(f'Updating defaults with: {registry_config}')
self.config.update(registry_config)

# Now update configuration with new (user) configuration
if config is not None:
parser_log.debug(f'Updating registered config with: {config}')
self.config.update(config)

# To handle strings and lists of strings, we convert the former here
Expand All @@ -82,21 +95,21 @@ def __init__(self, config: dict = None):

@property
def name(self):
return 'PandasParser_v0'
return 'Parser2DataFrame_v0'

def setup(self):
# Set the correct reading function here
self.read_function = pandas_read_functions.get(
self.config['file_format'].lower(), None)

if self.read_function is None:
pandas_parser_log.error(
parser_log.error(
f'File format {self.config["file_format"]}'
'is not currently supported.')
raise ValueError

def get_info(self):
""" Prints the docstring for the PandasParser module"""
""" Prints the docstring for the Parser2DataFrame module"""
print(inspect.getdoc(self))

def load(self, path: str):
Expand Down Expand Up @@ -133,15 +146,15 @@ def load_data(self) -> pd.DataFrame:
"""
data_list = []
for file in self.config['filepaths']:
pandas_parser_log.debug(f'Loading {file} ...')
parser_log.debug(f'Loading {file} ...')
data = self.read_function(
file,
**self.config['read_kwargs'])
data_list.append(data)

# Check for empty data and return nothing if empty
if not data_list:
pandas_parser_log.warning(
parser_log.warning(
'load_data() returning None. This is probably not what you '
'wanted. Ensure that your configuration includes the key '
'"filepaths"')
Expand All @@ -154,12 +167,12 @@ def load_data(self) -> pd.DataFrame:
return output

def load_config(self, path: str):
pandas_parser_log.debug('Calling load()...')
parser_log.debug('Calling load()...')
return self.load(path)

def save_config(self, path: str):
pandas_parser_log.debug('Calling save()...')
parser_log.debug('Calling save()...')
return self.save(path)

def save_data(self):
return super().save_data()
return super().save_data()
41 changes: 41 additions & 0 deletions jlab_datascience_toolkit/utils/parser_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
import pathlib
import yaml
import pandas as pd

def save_config_to_yaml(config, path):
    """Save `config` as `config.yaml` inside a newly created directory `path`.

    Arguments:
        config: Mapping of configuration values (must be YAML-serializable).
        path: Directory in which to create `config.yaml`.

    Raises:
        FileExistsError: if `path` already exists (os.makedirs default).
    """
    save_path = pathlib.Path(path)
    os.makedirs(save_path)
    with open(save_path.joinpath('config.yaml'), 'w') as f:
        # Bug fix: the original wrote `yaml.safe_dump(self.config, f)`.
        # `self` is undefined in a module-level function, so every call
        # raised NameError. The function parameter `config` is what must
        # be serialized.
        yaml.safe_dump(config, f)

def load_yaml_config(path):
    """Load and return the configuration stored in `<path>/config.yaml`.

    Arguments:
        path: Directory previously populated by `save_config_to_yaml`.

    Returns:
        The deserialized configuration (typically a dict).
    """
    # Bug fix: the original called bare `Path(path)`, but this module only
    # does `import pathlib` (no `from pathlib import Path`), so every call
    # raised NameError. Qualify the name with the module.
    base_path = pathlib.Path(path)
    with open(base_path.joinpath('config.yaml'), 'r') as f:
        config = yaml.safe_load(f)
    return config

def read_data_to_pandas(filepaths: list, file_format: str, **kwargs) -> list:
    """Read every file in `filepaths` with the pandas reader for `file_format`.

    All kwargs other than `filepaths` and `file_format` are passed through to
    the pandas read function associated with `file_format`.

    Arguments:
        filepaths: List of file paths to read.
        file_format: One of 'csv', 'feather', 'json' or 'pickle'
            (case-insensitive).

    Returns:
        list: A list of `pd.DataFrame`, one per entry in `filepaths`.
            (The original annotation/docstring claimed a single concatenated
            DataFrame, but the function has always returned a list —
            the documentation is corrected here, not the behavior.)

    Raises:
        ValueError: if `file_format` is not supported (consistent with
            `Parser2DataFrame.setup`, which raises ValueError for the
            same condition).
    """
    # Supported file formats -> pandas reader functions
    read_functions = dict(
        csv=pd.read_csv,
        feather=pd.read_feather,
        json=pd.read_json,
        pickle=pd.read_pickle,
    )

    # Normalize case so e.g. 'CSV' works, matching the parser module's
    # `self.config['file_format'].lower()` convention.
    read_function = read_functions.get(file_format.lower())
    if read_function is None:
        raise ValueError(
            f'File format {file_format} is not currently supported.')

    return [read_function(file, **kwargs) for file in filepaths]
49 changes: 38 additions & 11 deletions utests/utest_pandas_parser_v0.py → utests/utest_csv_parser.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
from jlab_datascience_toolkit.data_parser import make
import unittest
import logging
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import argparse
import shutil
import sys
import os

test_log = logging.Logger('test_logger')

rng = np.random.default_rng(seed=42)
parser_id = 'CSVParser_v0'


class TestPandasParserv0(unittest.TestCase):
class TestCSVParserv0(unittest.TestCase):

# Initialize:
# *****************************************
def __init__(self, *args, **kwargs):
super(TestPandasParserv0, self).__init__(*args, **kwargs)
super(TestCSVParserv0, self).__init__(*args, **kwargs)

@classmethod
def setUpClass(self) -> None:
print('Setting up all tests...')
self.columns = ['R121GMES', 'R122GMES',
'R123GMES', 'R121GSET', 'R122GSET', 'R123GSET']
self.path = './pandas_parser_utest.csv'
self.path = './csv_parser_utest.csv'
self.samples = 100
data = rng.normal(loc=5, scale=1, size=(
self.samples, len(self.columns)))
Expand All @@ -34,7 +41,7 @@ def setUpClass(self) -> None:
test_data
test_data.to_csv(self.path)

self.path2 = './pandas_parser_utest2.csv'
self.path2 = './csv_parser_utest2.csv'
data = rng.normal(loc=9, scale=2, size=(
self.samples, len(self.columns)))
dates = []
Expand Down Expand Up @@ -64,14 +71,14 @@ def tearDown(self) -> None:

def test_no_config(self):
print('*****No Config Test*****\n')
parser = make('PandasParser_v0')
parser = make(parser_id)
output = parser.load_data()
self.assertIsNone(output)

def test_string_filepaths(self):
print('*****String Filepaths Test*****\n')

parser = make('PandasParser_v0', config=dict(filepaths=self.path))
parser = make(parser_id, config=dict(filepaths=self.path))
output = parser.load_data()
print('Output Head:\n', output.head())

Expand All @@ -80,14 +87,14 @@ def test_string_filepaths(self):
def test_one_item_list_filepaths(self):
print('*****One Item List Test*****\n')

parser = make('PandasParser_v0', config=dict(filepaths=[self.path]))
parser = make(parser_id, config=dict(filepaths=[self.path]))
output = parser.load_data()
print('Output Head:\n', output.head())
self.assertEqual(output.shape, (self.samples, len(self.columns)+1))

def test_two_filepaths(self):
print('*****Two Filepaths Test*****\n')
parser = make('PandasParser_v0', config=dict(filepaths=[self.path, self.path2]))
parser = make(parser_id, config=dict(filepaths=[self.path, self.path2]))
output = parser.load_data()
print('Output Head:\n', output.head())
print('Output shape:', output.shape)
Expand All @@ -97,7 +104,7 @@ def test_usecols_read_arg(self):
print('*****Usecols Read Arg Test*****\n')

two_columns = ['R121GMES', 'R121GSET']
parser = make('PandasParser_v0', config=dict(
parser = make(parser_id, config=dict(
filepaths=self.path, read_kwargs=dict(usecols=two_columns)))
output = parser.load_data()
print('Output Head:\n', output.head())
Expand All @@ -110,7 +117,7 @@ def test_use_datetime_index(self):
def column_lambda(x): return ('GMES' in x) or (x == 'Date')
read_kwargs = dict(usecols=column_lambda,
index_col='Date', parse_dates=True)
parser = make('PandasParser_v0',
parser = make(parser_id,
config=dict(
filepaths=self.path, read_kwargs=read_kwargs)
)
Expand All @@ -121,7 +128,27 @@ def column_lambda(x): return ('GMES' in x) or (x == 'Date')
self.assertTrue('GMES' in column)
self.assertIsInstance(output.index, pd.DatetimeIndex)

def test_save_load(self):
    """Round-trip test: save a configured parser's state, load it into a
    fresh parser instance, and verify both produce identical data."""
    print('*****Save/Load Test*****\n')

    parser = make(parser_id, config=dict(filepaths=self.path,
                                         read_kwargs={'usecols': self.columns}))
    output = parser.load_data()
    save_path = './temp_parser'
    try:
        parser.save(save_path)
        new_parser = make(parser_id)
        new_parser.load(save_path)
        new_output = new_parser.load_data()
        # Column-by-column comparison; subTest reports each failing
        # column individually instead of stopping at the first.
        for col in output.columns:
            with self.subTest(col=col):
                self.assertTrue(np.allclose(output[col], new_output[col]))
    finally:
        # Always remove the temporary save directory, even on failure.
        # (Removed a stale "Run this file via: python
        # utest_pandas_parser_v0.py" comment that referenced the old
        # filename, and a dead `pass` after the finally block.)
        shutil.rmtree(save_path)

# Run this file via: python utest_csv_parser.py
# (filename fixed: the file was renamed to utests/utest_csv_parser.py,
# not utest_csv_parser_v0.py)
if __name__ == "__main__":
    # '-v' as the first argument enables debug logging; note that
    # unittest.main() also parses sys.argv, so '-v' additionally raises
    # unittest's own verbosity.
    argv = len(sys.argv) > 1 and sys.argv[1]
    loglevel = logging.DEBUG if argv == '-v' else logging.WARNING
    logging.basicConfig(stream=sys.stdout, level=loglevel)
    unittest.main()