Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script to apply a run list to a given data set #48

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions klaas/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,22 @@ def build_query(selection_config):
query = '(' + ') & ('.join(queries) + ')'
return query

def build_run_list_query(run_list_path):
    '''
    Build a query string that selects all runs listed in a csv run list.

    The csv file at run_list_path is read with pandas and must provide the
    columns "night" and "run_id". Every row contributes one
    "(night == ... & run_id == ...)" term; the terms are joined with " | ".
    A run list without rows yields an empty string.
    '''
    term = "(night == {} & run_id == {})"
    listed_runs = pd.read_csv(run_list_path)

    return " | ".join(
        term.format(entry['night'], entry['run_id'])
        for _, entry in listed_runs.iterrows()
    )


def predict(df, model, features):
df[features] = convert_to_float32(df[features])
Expand Down Expand Up @@ -123,6 +139,38 @@ def create_mask_h5py(input_path, selection_config, key='events', start=None, end

return mask

def create_run_list_mask_h5py(input_path, run_list_path, key='events', start=None, end=None, mode="r"):
    '''
    Create a boolean mask marking the rows that belong to a listed run.

    Parameters
    ----------
    input_path: path of the hdf5 file, opened with h5py
    run_list_path: path of a csv file with the columns "night" and "run_id"
    key: name of the hdf5 group holding the "night" and "run_id" datasets
    start: first row to consider, a falsy value means 0
    end: row after the last one to consider, a falsy value means all rows;
        values larger than the number of rows are clipped
    mode: access mode used for opening the input file

    Returns
    -------
    A boolean numpy array of length end - start which is True for every row
    whose (night, run_id) pair appears in the run list.
    '''
    n_events = h5py_get_n_rows(input_path, key=key, mode=mode)
    start = start or 0
    end = min(n_events, end) if end else n_events
    n_selected = end - start

    run_list = pd.read_csv(run_list_path)

    # Open the file with the requested mode instead of the h5py default and
    # read both columns only once, not once per row of the run list.
    with h5py.File(input_path, mode) as infile:
        night = infile[key]['night'][start:end]
        run_id = infile[key]['run_id'][start:end]

    # Starting from all False keeps an empty run list well defined:
    # nothing is selected.
    mask = np.zeros(n_selected, dtype=bool)
    for listed_night, listed_run_id in zip(run_list['night'], run_list['run_id']):
        mask |= (night == listed_night) & (run_id == listed_run_id)

    log.debug('Run list cuts removed {} events'.format(n_selected - mask.sum()))

    return mask


def apply_cuts_h5py_chunked(
input_path,
Expand Down Expand Up @@ -175,3 +223,56 @@ def apply_cuts_h5py_chunked(
group[name][n_old:n_old + n_new, :] = dataset[start:end][mask, :]
else:
log.warning('Skipping not 1d or 2d column {}'.format(name))


def apply_run_list_h5py_chunked(
    input_path,
    output_path,
    run_list_path,
    key='events',
    chunksize=100000,
    progress=True,
):
    '''
    Apply a runlist defined in a csv file to input_path and write result to
    outputpath. Apply cuts to chunksize events at a time.

    Parameters
    ----------
    input_path: path of the hdf5 input file
    output_path: path of the hdf5 output file, opened in append mode so that
        several groups can be written by consecutive calls
    run_list_path: path of a csv file with the columns "night" and "run_id"
    key: name of the hdf5 group to filter; a group of the same name is
        created in the output file
    chunksize: number of rows processed at a time
    progress: if False, the tqdm progress bar is disabled

    Datasets that are neither 1d nor 2d are skipped with a warning.
    '''

    n_events = h5py_get_n_rows(input_path, key=key, mode="r")
    n_chunks = int(np.ceil(n_events / chunksize))
    log.debug('Using {} chunks of size {}'.format(n_chunks, chunksize))

    with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'a') as outfile:
        group = outfile.create_group(key)

        for chunk in tqdm(range(n_chunks), disable=not progress):
            start = chunk * chunksize
            end = min(n_events, (chunk + 1) * chunksize)

            mask = create_run_list_mask_h5py(
                input_path, run_list_path, key=key, start=start, end=end
            )
            n_new = mask.sum()

            for name, dataset in infile[key].items():
                # Check the dimension before touching group[name]: skipped
                # datasets are never created in the output, so looking them
                # up in later chunks would raise a KeyError.
                if dataset.ndim not in (1, 2):
                    if chunk == 0:
                        log.warning('Skipping not 1d or 2d column {}'.format(name))
                    continue

                selected = dataset[start:end][mask]

                if chunk == 0:
                    # Only the first axis grows; keep the real width of 2d
                    # columns instead of assuming exactly two entries.
                    group.create_dataset(
                        name, data=selected, maxshape=(None, ) + dataset.shape[1:]
                    )
                else:
                    n_old = group[name].shape[0]
                    group[name].resize(n_old + n_new, axis=0)
                    group[name][n_old:n_old + n_new] = selected
105 changes: 105 additions & 0 deletions klaas/scripts/apply_run_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import click
import pandas as pd
from tqdm import tqdm
import h5py
import logging

from fact.io import read_data, h5py_get_n_rows
from ..apply import create_run_list_mask_h5py, apply_run_list_h5py_chunked, build_run_list_query




@click.command()
@click.argument('run_list_path', type=click.Path(exists=True, dir_okay=False))
@click.argument('input_path', type=click.Path(exists=True, dir_okay=False))
@click.argument('output_path', type=click.Path(exists=False, dir_okay=False))
@click.option(
    '-h', '--hdf-style', default='h5py', type=click.Choice(['pandas', 'h5py']),
    help='Whether to use pandas or h5py for the output file'
)
@click.option('-N', '--chunksize', type=int, help='Chunksize to use')
@click.option('-k', '--key', help='Name of the hdf5 group', default='events')
@click.option('-m', '--mode', help='Access mode of the input file', default='r')
@click.option('-v', '--verbose', help='Verbose log output', is_flag=True)
def main(run_list_path, input_path, output_path, hdf_style, chunksize, key, mode, verbose):
    '''
    Apply a given list of runs from a list (CSV) in RUN_LIST_PATH to the data
    in INPUT_PATH and write the result to OUTPUT_PATH.
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    if hdf_style == 'pandas':
        log.debug('Using pandas hdf style')
        query = build_run_list_query(run_list_path)

        log.info('Using query: ' + query)

        # The runs table is filtered and written once, for both the chunked
        # and the unchunked event processing below.
        runs = read_data(input_path, key='runs', mode=mode)
        runs = runs.query(query)
        runs.to_hdf(output_path, key='runs')

        if chunksize is None:
            log.debug('Processing all events at once')
            df = read_data(input_path, key=key, mode=mode)

            n_events = len(df)
            df = df.query(query)

            log.info('Before cuts: {}, after cuts: {}'.format(n_events, len(df)))
            df.to_hdf(output_path, key=key)
        else:
            # `mode` is the access mode of the INPUT file (read-only by
            # default); the output store has to be opened for appending.
            with pd.HDFStore(output_path, mode='a') as store:
                it = pd.read_hdf(input_path, key=key, chunksize=chunksize)
                for df in tqdm(it):
                    store.append(key, df.query(query))

    else:
        log.debug('Using h5py hdf style')
        if chunksize is None:
            log.debug('Processing all events at once')
            apply_runlist_to_data_set(input_path, output_path, key, run_list_path)
            apply_runlist_to_data_set(input_path, output_path, "runs", run_list_path)

        else:
            apply_run_list_h5py_chunked(
                input_path, output_path, run_list_path, chunksize=chunksize, key=key
            )

            apply_run_list_h5py_chunked(
                input_path, output_path, run_list_path, chunksize=chunksize, key="runs"
            )


def apply_runlist_to_data_set(input_path, output_path, key, run_list_path, mode='r'):
    '''
    Copy the rows of one hdf5 group that belong to a listed run.

    Parameters
    ----------
    input_path: path of the hdf5 input file
    output_path: path of the hdf5 output file, opened in append mode so that
        consecutive calls for different keys end up in the same file
    key: name of the group to filter; it must contain the datasets "night"
        and "run_id" and is created under the same name in the output file
    run_list_path: path of a csv file with the columns "night" and "run_id"
    mode: access mode used for opening the input file

    Datasets that are neither 1d nor 2d are skipped with a warning.
    '''
    # `log` and `mode` were local to main() and not visible here.
    log = logging.getLogger()

    n_events = h5py_get_n_rows(input_path, key=key, mode=mode)

    mask = create_run_list_mask_h5py(input_path, run_list_path, key=key, mode=mode)
    log.info('Before cuts: {}, after cuts: {}'.format(n_events, mask.sum()))

    # Append instead of truncating: main() calls this function once for the
    # events and once for the runs, and the second call must not delete the
    # result of the first.
    with h5py.File(input_path, mode) as infile, h5py.File(output_path, 'a') as outfile:
        group = outfile.create_group(key)

        for name, dataset in infile[key].items():
            if dataset.ndim not in (1, 2):
                log.warning('Skipping not 1d or 2d column {}'.format(name))
                continue

            # Only the first axis may grow; keep the real width of 2d
            # columns instead of assuming exactly two entries.
            group.create_dataset(
                name, data=dataset[:][mask], maxshape=(None, ) + dataset.shape[1:]
            )
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
'klaas_plot_separator_performance = klaas.scripts.plot_separator_performance:main',
'klaas_plot_regressor_performance = klaas.scripts.plot_regressor_performance:main',
'klaas_apply_cuts = klaas.scripts.apply_cuts:main',
'klaas_apply_run_list = klaas.scripts.apply_run_list:main',
'klaas_convert_pandas2h5py = klaas.scripts.convert_pandas2h5py:main',
],
}
Expand Down