Merge branch 'master' into master

FAST-HEP · Oct 4, 2019 · e650c1a · e650c1a
2 parents 2485136 + 423d4c2
commit e650c1a
Show file tree

Hide file tree

Showing 6 changed files with 78 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,14 +6,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
-
 ### Changed
-
 ### Removed
 
 ## [0.14.1] - 2019-10-04
 ### Added
 - Added version flag to CLI, PR #79. [@maikefischer](github.com/maikefischer)
+- Prohibit ncores < 1, PR #76 [@annakau](https://github.com/annakau)
+
+### Changed
+- Binned dataframes can now be produced from ND jagged arrays
 
 ## [0.14.0] - 2019-10-03
 ### Added

diff --git a/fast_carpenter/summary/binned_dataframe.py b/fast_carpenter/summary/binned_dataframe.py
@@ -3,7 +3,9 @@
 """
 import os
 import re
+import numpy as np
 import pandas as pd
+from pandas.api.types import is_object_dtype
 from . import binning_config as cfg
 
 
@@ -186,7 +188,8 @@ def event(self, chunk):
         else:
             weights = None
 
-        data = chunk.tree.pandas.df(all_inputs)
+        data = chunk.tree.pandas.df(all_inputs, flatten=False)
+        data = explode(data)
 
         binned_values = _bin_values(data, dimensions=self._bin_dims,
                                     binnings=self._binnings,
@@ -250,3 +253,40 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
 
     histogram.index.set_names(out_dimensions, inplace=True)
     return histogram
+
+
+def explode(df, fill_value=float("nan")):
+    """
+    Turn each element of the list-like (object dtype) columns of ``df`` into
+    its own row, repeating the other columns alongside.  Based on this answer:
+    https://stackoverflow.com/questions/12680754/split-explode-pandas\
+    -dataframe-string-entry-to-separate-rows/40449726#40449726
+
+    :param df: dataframe whose object columns hold equally jagged sequences
+    :param fill_value: fills missing values once empty-list rows are re-added
+    :return: ``df`` unchanged if it has no object columns, else a new dataframe
+        in which rows that held empty lists follow the exploded rows
+    :raises ValueError: if the list columns differ in length within a row
+    """
+    lst_cols = [col for col, dtype in df.dtypes.items() if is_object_dtype(dtype)]
+    if not lst_cols:
+        return df
+    idx_cols = df.columns.difference(lst_cols)  # all columns except `lst_cols`
+
+    # check all lists have same length
+    lens = pd.DataFrame({col: df[col].str.len() for col in lst_cols})
+    if (lens.nunique(axis=1) > 1).any():
+        raise ValueError("Cannot bin multiple arrays with different jaggedness")
+    lens = lens[lst_cols[0]]
+
+    # create "exploded" DF; the comprehension is linear, unlike sum(lists, [])
+    flattened = {col: [item for cell in df.loc[lens > 0, col].values for item in cell]
+                 for col in lst_cols}
+    res = pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols})
+    res = res.assign(**flattened)
+
+    # re-add rows with empty lists (DataFrame.append was removed in pandas 2.0)
+    if (lens == 0).any():
+        res = pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False).fillna(fill_value)
+    # recurse until nested lists are fully "exploded"
+    return explode(res)
diff --git a/fast_carpenter/tree_wrapper.py b/fast_carpenter/tree_wrapper.py
@@ -6,9 +6,7 @@
 minimal coding on my side...
 """
 import uproot
-from uproot.interp.objects import asgenobj
-from uproot.interp.jagged import asjagged
-from uproot.interp.numerical import asdtype
+from uproot import asjagged, asdtype, asgenobj
 import copy
 import awkward
 
@@ -19,24 +17,20 @@ def recursive_type_wrap(array):
     return asdtype(array.dtype.fields)
 
 
-class asgenobj_then_jagged():
-    def __init__(self, original):
-        self.wrapping = original
-
+class wrapped_asgenobj(asgenobj):
     def finalize(self, *args, **kwargs):
-        result = self.wrapping.finalize(*args, **kwargs)
-        return awkward.JaggedArray.fromiter(result)
+        result = super(wrapped_asgenobj, self).finalize(*args, **kwargs)
+        result = awkward.JaggedArray.fromiter(result)
+        return result
 
-    def __getattr__(self, attr):
-        return getattr(self.wrapping, attr)
+
+uproot.interp.auto.asgenobj = wrapped_asgenobj
 
 
 def wrapped_interpret(branch, *args, **kwargs):
     from uproot.interp.auto import interpret
     result = interpret(branch, *args, **kwargs)
     if result:
-        if isinstance(result, asgenobj):
-            result = asgenobj_then_jagged(result)
         return result
 
     if isinstance(branch, WrappedTree.FakeBranch):

diff --git a/fast_carpenter/version.py b/fast_carpenter/version.py
@@ -12,5 +12,5 @@ def split_version(version):
     return tuple(result)
 
 
-__version__ = '0.14.0'
+__version__ = '0.14.1'
 version_info = split_version(__version__) # noqa
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.14.0
+current_version = 0.14.1
 commit = True
 tag = False
 

diff --git a/tests/summary/test_binned_dataframe.py b/tests/summary/test_binned_dataframe.py
@@ -197,3 +197,27 @@ def test_BinnedDataframe_numexpr_similar_branch(binned_df_3, tmpdir, full_wrappe
     bin_centers = pd.IntervalIndex(results.index.get_level_values('electron_pT')).mid
     mean = np.sum((bin_centers[1:-1] * results['n'][1:-1]) / results['n'][1:-1].sum())
     assert mean == pytest.approx(44.32584)
+
+
+def test_explode():
+    """Check explode() on flat, paired, mismatched and nested jagged columns."""
+    df = pd.DataFrame({'A': [[1, 2, 3], [9], [], [3, 4]], 'B': 1})
+    exploded = bdf.explode(df)
+    assert len(exploded) == 7  # 6 list items + 1 filler row for the empty list
+    assert all(exploded.B == 1)
+
+    df["C"] = df.A.copy()
+    exploded = bdf.explode(df)
+    assert len(exploded) == 7
+    assert all(exploded.B == 1)
+    # equals() treats the NaN filler row as matching; `==` would not
+    assert exploded.A.equals(exploded.C)
+
+    df["D"] = [[1], [3], [4, 5], []]
+    with pytest.raises(ValueError) as e:
+        bdf.explode(df)
+    assert "different jaggedness" in str(e.value)
+
+    nested = [[np.arange(i + 1) + j for i in range((j % 2) + 1)] for j in range(4)]
+    exploded = bdf.explode(pd.DataFrame({'A': nested, 'B': np.arange(4)[::-1]}))
+    assert len(exploded) == 8  # 1 + 3 + 1 + 3 scalars after full flattening