Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
maikefischer authored Oct 4, 2019
2 parents 2485136 + 423d4c2 commit e650c1a
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 18 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added

### Changed

### Removed

## [0.14.1] - 2019-10-04
### Added
- Added version flag to CLI, PR #79. [@maikefischer](https://github.com/maikefischer)
- Prohibit ncores < 1, PR #76. [@annakau](https://github.com/annakau)

### Changed
- Binned dataframes can now be produced from ND jagged arrays

## [0.14.0] - 2019-10-03
### Added
Expand Down
42 changes: 41 additions & 1 deletion fast_carpenter/summary/binned_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"""
import os
import re
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from . import binning_config as cfg


Expand Down Expand Up @@ -186,7 +188,8 @@ def event(self, chunk):
else:
weights = None

data = chunk.tree.pandas.df(all_inputs)
data = chunk.tree.pandas.df(all_inputs, flatten=False)
data = explode(data)

binned_values = _bin_values(data, dimensions=self._bin_dims,
binnings=self._binnings,
Expand Down Expand Up @@ -250,3 +253,40 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we

histogram.index.set_names(out_dimensions, inplace=True)
return histogram


def explode(df, fill_value=float("nan")):
    """
    Expand every list-like cell of a dataframe into one row per element.

    Columns with object dtype are treated as holding list-like cells.  All
    other columns are repeated so that their values stay aligned with the
    elements produced from the row they came from.  A row whose list is empty
    is kept as a single row, placed after the expanded rows, and missing
    values in the combined frame are replaced by ``fill_value``.  The
    function then calls itself on the result, so nested lists are flattened
    one level per pass until no object-dtype column is left.

    Based on this answer:
    https://stackoverflow.com/questions/12680754/split-explode-pandas\
-dataframe-string-entry-to-separate-rows/40449726#40449726

    Parameters:
        df (pandas.DataFrame): frame to expand.
        fill_value: value handed to ``fillna`` once rows with empty lists
            have been added back.

    Returns:
        pandas.DataFrame: the expanded frame; ``df`` itself is returned
        untouched when it has no object-dtype column.

    Raises:
        ValueError: if, within one row, two list columns have different
            lengths.
    """
    # get the list columns
    lst_cols = [col for col, dtype in df.dtypes.items() if is_object_dtype(dtype)]
    if not lst_cols:
        return df

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # check all lists have same length
    lens = pd.DataFrame({col: df[col].str.len() for col in lst_cols})
    if (lens.nunique(axis=1) > 1).any():
        raise ValueError("Cannot bin multiple arrays with different jaggedness")
    # A cell that is not list-like (e.g. the NaN left by an earlier pass for
    # an empty list) has no length: count it as empty so the row is carried
    # through as a single row, and keep the counts integral for `np.repeat`.
    lens = lens[lst_cols[0]].fillna(0).astype(int)
    non_empty = lens > 0
    empty = lens == 0

    # create "exploded" DF; the nested comprehension flattens in linear time
    flattened = {
        col: [item for cell in df.loc[non_empty, col].values for item in cell]
        for col in lst_cols
    }
    res = pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols})
    res = res.assign(**flattened)

    # append those rows that have empty lists
    if empty.any():
        # `DataFrame.append` no longer exists in pandas >= 2.0
        res = pd.concat([res, df.loc[empty, idx_cols]], sort=False).fillna(fill_value)

    # Check that rows are fully "exploded"
    return explode(res)
20 changes: 7 additions & 13 deletions fast_carpenter/tree_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
minimal coding on my side...
"""
import uproot
from uproot.interp.objects import asgenobj
from uproot.interp.jagged import asjagged
from uproot.interp.numerical import asdtype
from uproot import asjagged, asdtype, asgenobj
import copy
import awkward

Expand All @@ -19,24 +17,20 @@ def recursive_type_wrap(array):
return asdtype(array.dtype.fields)


class asgenobj_then_jagged():
def __init__(self, original):
self.wrapping = original

class wrapped_asgenobj(asgenobj):
def finalize(self, *args, **kwargs):
result = self.wrapping.finalize(*args, **kwargs)
return awkward.JaggedArray.fromiter(result)
result = super(wrapped_asgenobj, self).finalize(*args, **kwargs)
result = awkward.JaggedArray.fromiter(result)
return result

def __getattr__(self, attr):
return getattr(self.wrapping, attr)

uproot.interp.auto.asgenobj = wrapped_asgenobj


def wrapped_interpret(branch, *args, **kwargs):
from uproot.interp.auto import interpret
result = interpret(branch, *args, **kwargs)
if result:
if isinstance(result, asgenobj):
result = asgenobj_then_jagged(result)
return result

if isinstance(branch, WrappedTree.FakeBranch):
Expand Down
2 changes: 1 addition & 1 deletion fast_carpenter/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ def split_version(version):
return tuple(result)


__version__ = '0.14.0'
__version__ = '0.14.1'
version_info = split_version(__version__) # noqa
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.14.0
current_version = 0.14.1
commit = True
tag = False

Expand Down
24 changes: 24 additions & 0 deletions tests/summary/test_binned_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,27 @@ def test_BinnedDataframe_numexpr_similar_branch(binned_df_3, tmpdir, full_wrappe
bin_centers = pd.IntervalIndex(results.index.get_level_values('electron_pT')).mid
mean = np.sum((bin_centers[1:-1] * results['n'][1:-1]) / results['n'][1:-1].sum())
assert mean == pytest.approx(44.32584)


def test_explode():
    """Exercise ``explode`` on flat, duplicated, mismatched and nested list columns."""
    df = pd.DataFrame({'A': [[1, 2, 3], [9], [], [3, 4]], 'B': 1})
    exploded = bdf.explode(df)
    # six list elements plus the one row kept for the empty list
    assert len(exploded) == 7
    # check the result, not the untouched input frame
    assert all(exploded.B == 1)

    df["C"] = df.A.copy()
    exploded = bdf.explode(df)
    assert len(exploded) == 7
    assert all(exploded.B == 1)
    # `equals` treats NaNs in the same position as equal, which `==` does not
    assert exploded.A.equals(exploded.C)

    df["D"] = [[1], [3], [4, 5], []]
    with pytest.raises(ValueError) as e:
        bdf.explode(df)
    # the message lives on the raised exception, not on the ExceptionInfo wrapper
    assert "different jaggedness" in str(e.value)

    df2 = pd.DataFrame({'A': [[np.arange(i + 1) + j for i in range((j % 2) + 1)] for j in range(4)],
                        'B': np.arange(4)[::-1],
                        })
    exploded = bdf.explode(df2)
    assert len(exploded) == 8

0 comments on commit e650c1a

Please sign in to comment.