diff --git a/.gitignore b/.gitignore index ab2d61b..6f359db 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ github_deploy_key* *.swp +.pytest_cache # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fact/VERSION b/fact/VERSION index 2a0970c..c5523bd 100644 --- a/fact/VERSION +++ b/fact/VERSION @@ -1 +1 @@ -0.16.1 +0.17.0 diff --git a/fact/io.py b/fact/io.py index 21742b2..050a70d 100644 --- a/fact/io.py +++ b/fact/io.py @@ -89,7 +89,14 @@ def to_native_byteorder(array): return array -def read_h5py(file_path, key='data', columns=None, mode='r'): +def read_h5py( + file_path, + key='data', + columns=None, + mode='r', + parse_dates=True, + first=None, + last=None): ''' Read a hdf5 file written with h5py into a dataframe @@ -101,6 +108,13 @@ def read_h5py(file_path, key='data', columns=None, mode='r'): name of the hdf5 group to read in columns: iterable[str] Names of the datasets to read in. If not given read all 1d datasets + parse_dates: bool + Convert columns with attrs['timeformat'] to timestamps + first: int or None + first row to read from the file + last: int or None + row index at which to stop reading (exclusive, as in a slice) + ''' with h5py.File(file_path, mode) as f: group = f.get(key) @@ -115,12 +129,16 @@ def read_h5py(file_path, key='data', columns=None, mode='r'): df = pd.DataFrame() for col in columns: - array = to_native_byteorder(group[col][:]) + dataset = group[col] + array = to_native_byteorder(dataset[first:last]) # pandas cannot handle bytes, convert to str if array.dtype.kind == 'S': array = array.astype(str) + if parse_dates and dataset.attrs.get('timeformat') is not None: + array = pd.to_datetime(array, infer_datetime_format=True) + if array.ndim == 1: df[col] = array elif array.ndim == 2: @@ -146,7 +164,13 @@ def h5py_get_n_rows(file_path, key='data', mode='r'): return group[next(iter(group.keys()))].shape[0] -def read_h5py_chunked(file_path, key='data', columns=None, chunksize=None, mode='r'): +def read_h5py_chunked( + file_path, 
key='data', + columns=None, + chunksize=None, + mode='r', + parse_dates=True): ''' Generator function to read from h5py hdf5 in chunks, returns an iterator over pandas dataframes. @@ -178,28 +202,22 @@ def read_h5py_chunked(file_path, key='data', columns=None, chunksize=None, mode= columns.remove(col) log.warning('Ignoring column {}, not 1d or 2d'.format(col)) - for chunk in range(n_chunks): - - start = chunk * chunksize - end = min(n_rows, (chunk + 1) * chunksize) - - df = pd.DataFrame(index=np.arange(start, end)) - - for col in columns: - array = to_native_byteorder(group[col][start:end]) + for chunk in range(n_chunks): - # pandas cannot handle bytes, convert to str - if array.dtype.kind == 'S': - array = array.astype(str) + start = chunk * chunksize + end = min(n_rows, (chunk + 1) * chunksize) - if array.ndim == 1: - df[col] = array - - else: - for i in range(array.shape[1]): - df[col + '_{}'.format(i)] = array[:, i] + df = read_h5py( + file_path, + key=key, + columns=columns, + parse_dates=parse_dates, + first=start, + last=end + ) + df.index = np.arange(start, end) - yield df, start, end + yield df, start, end def read_data(file_path, key=None, columns=None, **kwargs): @@ -370,6 +388,7 @@ def create_empty_h5py_dataset(array, group, name, **kwargs): dtype = array.dtype maxshape = [None] + list(array.shape)[1:] shape = [0] + list(array.shape)[1:] + attrs = {} if dtype.base == object: if isinstance(array[0], list): @@ -382,6 +401,7 @@ def create_empty_h5py_dataset(array, group, name, **kwargs): elif dtype.type == np.datetime64: # save dates as ISO string, create dummy date to get correct length dt = np.array(0, dtype=dtype).astype('S').dtype + attrs['timeformat'] = 'iso' else: dt = dtype.base @@ -393,6 +413,10 @@ def create_empty_h5py_dataset(array, group, name, **kwargs): dtype=dt, **kwargs ) + + for k, v in attrs.items(): + dataset.attrs[k] = v + return dataset diff --git a/tests/test_io.py b/tests/test_io.py index f4f8d1e..6333712 100644 --- a/tests/test_io.py 
+++ b/tests/test_io.py @@ -62,9 +62,6 @@ def test_to_h5py_datetime(): to_h5py(df, f.name, key='test') df2 = read_h5py(f.name, key='test') - for col in df2.columns: - df2[col] = pd.to_datetime(df2[col]) - for col in df.columns: assert all(df[col] == df2[col])