fact-project · jebuss · Dec 15, 2017 · Dec 15, 2017 · Dec 15, 2017 · Dec 15, 2017
diff --git a/klaas/apply.py b/klaas/apply.py
@@ -44,6 +44,22 @@ def build_query(selection_config):
     query = '(' + ') & ('.join(queries) + ')'
     return query
 
+def build_run_list_query(run_list_path):
+    '''
+    Build a pandas query string selecting every (night, run_id) pair listed
+    in the csv file at run_list_path, which needs the columns `night` and
+    `run_id`. The pairs are joined with `|`; an empty run list gives ''.
+    '''
+    run_list = pd.read_csv(run_list_path)
+
+    # Walk the two columns directly: iterrows() casts each row to a common
+    # dtype, so any float column in the csv would turn the ids into floats
+    # ("20131101.0") inside the query.
+    return " | ".join(
+        "(night == {} & run_id == {})".format(night, run_id)
+        for night, run_id in zip(run_list['night'], run_list['run_id'])
+    )
+
 
 def predict(df, model, features):
     df[features] = convert_to_float32(df[features])
@@ -123,6 +139,38 @@ def create_mask_h5py(input_path, selection_config, key='events', start=None, end
 
     return mask
 
+def create_run_list_mask_h5py(input_path, run_list_path, key='events', start=None, end=None, mode="r"):
+    '''
+    Create a boolean mask for rows start:end of the h5py group `key` in
+    input_path. A row is selected if its (night, run_id) pair is listed in
+    the csv file at run_list_path (columns `night` and `run_id`).
+
+    start defaults to 0, end to the number of rows (and is clipped to it).
+    mode is the access mode used to open input_path.
+    '''
+    run_list = pd.read_csv(run_list_path)
+    n_events = h5py_get_n_rows(input_path, key=key, mode=mode)
+
+    start = start or 0
+    end = min(n_events, end) if end else n_events
+    n_selected = end - start
+
+    # Honour `mode` instead of relying on the h5py default, and read both
+    # columns once instead of once per run list entry.
+    with h5py.File(input_path, mode) as infile:
+        night = infile[key]['night'][start:end]
+        run_id = infile[key]['run_id'][start:end]
+
+    # Start from an all-False mask so that an empty run list selects nothing
+    # and the result always is an array of length end - start.
+    mask = np.zeros(n_selected, dtype=bool)
+    for run in run_list.itertuples(index=False):
+        mask |= (night == run.night) & (run_id == run.run_id)
+
+    log.debug('Run list cuts removed {} events'.format(n_selected - mask.sum()))
+
+    return mask
+
 
 def apply_cuts_h5py_chunked(
         input_path,
@@ -175,3 +223,56 @@ def apply_cuts_h5py_chunked(
                         group[name][n_old:n_old + n_new, :] = dataset[start:end][mask, :]
                     else:
                         log.warning('Skipping not 1d or 2d column {}'.format(name))
+
+
+def apply_run_list_h5py_chunked(
+        input_path,
+        output_path,
+        run_list_path,
+        key='events',
+        chunksize=100000,
+        progress=True,
+        ):
+    '''
+    Apply a runlist defined in a csv file to input_path and write result to
+    output_path. Apply cuts to chunksize events at a time.
+
+    Only the 1d and 2d datasets of the group `key` are copied, all others
+    are skipped with a warning. The group `key` is newly created in
+    output_path, which is opened in append mode.
+    '''
+
+    n_events = h5py_get_n_rows(input_path, key=key, mode="r")
+    n_chunks = int(np.ceil(n_events / chunksize))
+    log.debug('Using {} chunks of size {}'.format(n_chunks, chunksize))
+
+    with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'a') as outfile:
+        group = outfile.create_group(key)
+
+        # Decide once which datasets can be copied. Skipped datasets never
+        # exist in the output, so they must not be resized in later chunks.
+        datasets = {}
+        for name, dataset in infile[key].items():
+            if dataset.ndim in (1, 2):
+                datasets[name] = dataset
+            else:
+                log.warning('Skipping not 1d or 2d column {}'.format(name))
+
+        for chunk in tqdm(range(n_chunks), disable=not progress):
+            start = chunk * chunksize
+            end = min(n_events, (chunk + 1) * chunksize)
+
+            mask = create_run_list_mask_h5py(
+                input_path, run_list_path, key=key, start=start, end=end
+            )
+            n_new = mask.sum()
+
+            for name, dataset in datasets.items():
+                if chunk == 0:
+                    # Keep the real width of 2d datasets instead of assuming 2.
+                    maxshape = (None, ) + dataset.shape[1:]
+                    group.create_dataset(name, data=dataset[start:end][mask], maxshape=maxshape)
+                else:
+                    n_old = group[name].shape[0]
+                    group[name].resize(n_old + n_new, axis=0)
+                    group[name][n_old:n_old + n_new] = dataset[start:end][mask]
diff --git a/klaas/scripts/apply_run_list.py b/klaas/scripts/apply_run_list.py
@@ -0,0 +1,105 @@
+import click
+import pandas as pd
+from tqdm import tqdm
+import h5py
+import logging
+
+from fact.io import read_data, h5py_get_n_rows
+from ..apply import create_run_list_mask_h5py, apply_run_list_h5py_chunked, build_run_list_query
+
+
+
+
+@click.command()
+@click.argument('run_list_path', type=click.Path(exists=True, dir_okay=False))
+@click.argument('input_path', type=click.Path(exists=True, dir_okay=False))
+@click.argument('output_path', type=click.Path(exists=False, dir_okay=False))
+@click.option(
+    '-h', '--hdf-style', default='h5py', type=click.Choice(['pandas', 'h5py']),
+    help='Whether to use pandas or h5py for the output file'
+)
+@click.option('-N', '--chunksize', type=int, help='Chunksize to use')
+@click.option('-k', '--key', help='Name of the hdf5 group', default='events')
+@click.option('-m', '--mode', help='Access mode of the input file', default='r')
+@click.option('-v', '--verbose', help='Verbose log output', is_flag=True)
+def main(run_list_path, input_path, output_path, hdf_style, chunksize, key, mode, verbose):
+    '''
+    Apply a given list of runs from a list (CSV) in RUN_LIST_PATH to the data
+    in INPUT_PATH and write the result to OUTPUT_PATH.
+
+    Both the group given by --key and the `runs` group are filtered; the
+    CSV needs the columns `night` and `run_id`.
+    '''
+    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
+    log = logging.getLogger()
+
+    if hdf_style == 'pandas':
+        query = build_run_list_query(run_list_path)
+        log.info('Using query: ' + query)
+
+        # The runs table is filtered in one piece; --chunksize only applies
+        # to the group given by --key.
+        runs = read_data(input_path, key='runs', mode=mode)
+        runs.query(query).to_hdf(output_path, key='runs')
+
+        if chunksize is None:
+            df = read_data(input_path, key=key, mode=mode)
+
+            n_events = len(df)
+            df = df.query(query)
+
+            log.info('Before cuts: {}, after cuts: {}'.format(n_events, len(df)))
+            df.to_hdf(output_path, key=key)
+        else:
+            # `mode` is the access mode of the *input* file (default 'r');
+            # the output store has to be opened writable to append to it.
+            with pd.HDFStore(output_path, mode='a') as store:
+                it = pd.read_hdf(input_path, key=key, chunksize=chunksize)
+                for df in tqdm(it):
+                    store.append(key, df.query(query))
+
+    elif chunksize is None:
+        # h5py output, everything at once
+        apply_runlist_to_data_set(input_path, output_path, key, run_list_path)
+        apply_runlist_to_data_set(input_path, output_path, "runs", run_list_path)
+
+    else:
+        # h5py output, chunksize rows at a time
+        apply_run_list_h5py_chunked(
+            input_path, output_path, run_list_path, chunksize=chunksize, key=key
+        )
+        apply_run_list_h5py_chunked(
+            input_path, output_path, run_list_path, chunksize=chunksize, key="runs"
+        )
+
+
+def apply_runlist_to_data_set(input_path, output_path, key, run_list_path):
+    '''
+    Copy the rows of the h5py group `key` in input_path whose
+    (night, run_id) pair is listed in the csv file at run_list_path
+    into a newly created group `key` in output_path.
+
+    Only 1d and 2d datasets are copied, all others are skipped with a
+    warning. output_path is opened in append mode so the function can be
+    called for several groups of the same file.
+    '''
+    # This module has no module level logger and `mode` only exists inside
+    # main(), so both are set up locally instead of raising a NameError.
+    log = logging.getLogger(__name__)
+    n_events = h5py_get_n_rows(input_path, key=key, mode='r')
+
+    mask = create_run_list_mask_h5py(input_path, run_list_path, key=key)
+    log.info('Before cuts: {}, after cuts: {}'.format(n_events, mask.sum()))
+
+    # 'a' rather than 'w': truncating the file would throw away the group
+    # written by a previous call.
+    with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'a') as outfile:
+        group = outfile.create_group(key)
+
+        for name, dataset in infile[key].items():
+            if dataset.ndim in (1, 2):
+                # Keep the real width of 2d datasets instead of assuming 2.
+                maxshape = (None, ) + dataset.shape[1:]
+                group.create_dataset(name, data=dataset[:][mask], maxshape=maxshape)
+            else:
+                log.warning('Skipping not 1d or 2d column {}'.format(name))
diff --git a/setup.py b/setup.py
@@ -44,6 +44,7 @@
             'klaas_plot_separator_performance = klaas.scripts.plot_separator_performance:main',
             'klaas_plot_regressor_performance = klaas.scripts.plot_regressor_performance:main',
             'klaas_apply_cuts = klaas.scripts.apply_cuts:main',
+            'klaas_apply_run_list = klaas.scripts.apply_run_list:main',
             'klaas_convert_pandas2h5py = klaas.scripts.convert_pandas2h5py:main',
         ],
     }