Skip to content

Commit

Permalink
Merge pull request #110 from FAST-HEP/BK_issue_109-fix_explode_strings
Browse files Browse the repository at this point in the history
Fix explode method for string arrays
  • Loading branch information
benkrikler authored Feb 16, 2020
2 parents 055c46a + 722ece8 commit 5ab59f0
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [0.16.1] - 2020-02-16
### Fixed
- Unit test that was broken by Pandas >=1.0.0
- Bug in explode function when a dimension contains strings, issue #109, PR #110 [@BenKrikler](https://github.com/benkrikler)

## [0.16.0] - 2019-11-1
### Added
Expand Down
5 changes: 5 additions & 0 deletions fast_carpenter/summary/binned_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,9 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
return histogram


# Container types that explode() accepts as list-like column entries; it tests
# the first value of each object-dtype column against this tuple with
# isinstance, so other object values (e.g. strings) are not matched.
_explodable_types = (tuple, list, np.ndarray)


def explode(df):
"""
Based on this answer:
Expand All @@ -270,6 +273,8 @@ def explode(df):
"""
# get the list columns
lst_cols = [col for col, dtype in df.dtypes.items() if is_object_dtype(dtype)]
# Be more specific about which objects are ok
lst_cols = [col for col in lst_cols if isinstance(df[col][0], _explodable_types)]
if not lst_cols:
return df

Expand Down
2 changes: 1 addition & 1 deletion fast_carpenter/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ def split_version(version):
return tuple(result)


__version__ = '0.16.0'
__version__ = '0.16.1'
version_info = split_version(__version__) # noqa
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.16.0
current_version = 0.16.1
commit = True
tag = False

Expand All @@ -18,4 +18,3 @@ test = pytest

[tool:pytest]
collect_ignore = ['setup.py']

19 changes: 16 additions & 3 deletions tests/summary/test_binned_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,15 @@ def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_m
results = collector._prepare_output(dataset_readers_list)

assert results.index.nlevels == 2 + int(dataset_col)
if pad_missing or not dataset_col:
if tuple(map(int, pd.__version__.split("."))) >= (1, 0, 0):
length = (4 * 12) * (1 + int(dataset_col))
else:
length = 95 # When dataset_col True and pad_missing False one bin is missing
assert len(results) == length
# Pre Pandas 1.0.0 the following lengths were needed.
if pad_missing or not dataset_col:
length = (4 * 12) * (1 + int(dataset_col))
else:
length = 95 # When dataset_col True and pad_missing False one bin is missing
assert len(results) == length

totals = results.sum()
# Based on: events->Draw("Jet_Py", "", "goff")
Expand Down Expand Up @@ -233,3 +237,12 @@ def test_explode():
})
exploded = bdf.explode(df2)
assert len(exploded) == 8

df = pd.DataFrame({'number': [1, 8, 3], 'string': ['one', 'eight', 'three']})
exploded = bdf.explode(df)
assert len(exploded) == 3

df["list"] = [list(range(i)) for i in df.number]
exploded = bdf.explode(df)
assert len(exploded) == 1 + 8 + 3
assert np.array_equal(exploded.list, [0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2])

0 comments on commit 5ab59f0

Please sign in to comment.