Commit

Merge pull request westpa#35 from westpa/west_tools_KineticsRewrite
West tools kinetics rewrite merged.

Former-commit-id: b07237e
Adam J. Pratt authored Mar 14, 2017
2 parents 8d2c6bd + e0a4326 commit f01e041
Showing 38 changed files with 96,511 additions and 29,581 deletions.
1 change: 1 addition & 0 deletions bin/w_direct
1 change: 1 addition & 0 deletions bin/w_postanalysis_matrix
1 change: 1 addition & 0 deletions bin/w_postanalysis_reweight
1 change: 1 addition & 0 deletions bin/w_reweight
19,236 changes: 11,085 additions & 8,151 deletions lib/west_tools/fasthist/_fasthist.c

Large diffs are not rendered by default.

153 changes: 133 additions & 20 deletions lib/west_tools/mclib/__init__.py
@@ -1,4 +1,4 @@
# Copyright (C) 2013 Matthew C. Zwier and Lillian T. Chong
# Copyright (C) 2017 Matthew C. Zwier and Lillian T. Chong
#
# This file is part of WESTPA.
#
@@ -23,8 +23,9 @@

from _mclib import autocorrel_elem, mcbs_correltime, get_bssize, mcbs_ci #@UnresolvedImport

def mcbs_ci_correl(dataset, estimator, alpha, n_sets=None, args=None, kwargs=None,
autocorrel_alpha = None, autocorrel_n_sets=None, subsample=None):
def mcbs_ci_correl(estimator_datasets, estimator, alpha, n_sets=None, args=None,
autocorrel_alpha = None, autocorrel_n_sets=None, subsample=None,
do_correl=True, mcbs_enable=None, estimator_kwargs={}):
'''Perform a Monte Carlo bootstrap estimate for the (1-``alpha``) confidence interval
on the given ``dataset`` with the given ``estimator``. This routine is appropriate
for time-correlated data, using the method described in Huber & Kim, "Weighted-ensemble
@@ -65,31 +66,143 @@ def mcbs_ci_correl(dataset, estimator, alpha, n_sets=None, args=None, kwargs=None,

autocorrel_alpha = alpha if not autocorrel_alpha else autocorrel_alpha

if dataset.ndim != 1:
raise ValueError('correlated time series MCBS analysis available only for 1-dimensional data')
# We're now passing in dataset as a dict, so we need to enforce that for compatibility with older tools.
# This just takes our dataset and puts it into a dict, as it's likely that we're using
# mean or median as our estimators, which take "a" as argument input.
if type(estimator_datasets).__name__ != 'dict':
# Enforcing the data structure.
pre_calculated = estimator_datasets
estimator_datasets = {'a' : estimator_datasets}
# This also probably means our estimator isn't going to handle kwargs, so we'll watch out for that later in testing.
# We may have to replace the 'simple' estimator with a slightly more complex lambda function which simply ditches extra arguments.
for key, dset in estimator_datasets.iteritems():
estimator_datasets[key] = numpy.asanyarray(dset)
dlen = dset.shape[0]

# Why do we have 'estimator_datasets'?
# Estimators may require many different sets of data to properly function; while we can send this in via the kwargs,
# we may wish to decimate only a certain subset (due to the block bootstrapping) of the input parameters.
# Therefore, 'estimator_datasets' should consist of datasets that must be sliced/decimated with the subsampling function.
# Some estimators (such as the reweighting) may not be able to be decimated in a straightforward manner with a subsample function,
# as we cannot pre-estimate the quantity without introducing error or bias. In those cases, we may wish to pass on all the data,
# but ensure that our estimator only includes certain iterations (and only in a certain way).

dataset = numpy.asanyarray(dataset)
dlen = len(dataset)
n_sets = n_sets or get_bssize(alpha)
autocorrel_n_sets = autocorrel_n_sets or get_bssize(autocorrel_alpha)

correl_len = mcbs_correltime(dataset, autocorrel_alpha, autocorrel_n_sets)
if correl_len == len(dataset):
# too correlated for meaningful calculations
return estimator(dataset, *(args or ()), **(kwargs or {})), dataset.min(), dataset.max(), correl_len

if mcbs_enable == False:
# While it's odd to support NOT doing the bootstrap in a library specifically designed for bootstrapping,
# supporting this functionality here makes writing the code a lot easier, as we can just pass in a flag.
# Specifically, this is for situations in which error is not desired (that is, only a reasonable mean is desired).
# It's often useful when doing a quick analysis.
estimator_datasets.update(estimator_kwargs)
try:
estimator_datasets.update( { 'stride': 1 } )
except:
pass

return_set = estimator(**estimator_datasets)
# We don't try and pretend we're doing any error analysis.
return return_set, return_set, return_set, 0, 1

# We need to pre-generate the data; why not do it here? We're already set up for it...
precalc_kwargs = estimator_kwargs.copy()
precalc_kwargs['stride'] = 1
pre_calculated = []
for block in range(1, dlen+1):
for key, dset in estimator_datasets.iteritems():
precalc_kwargs[key] = dset[0:block]
pre_calculated.append(estimator(**precalc_kwargs))
# We need to get rid of any NaNs.
pre_calculated = numpy.asanyarray(pre_calculated)
pre_calculated = pre_calculated[numpy.isfinite(pre_calculated)]
# If this happens, we have a huge NaN problem. That is, our estimator is failing to return meaningful
# numbers. We should catch this when it happens, and so raise an exception, here.
# This is almost certainly due to estimator failure. Double check that calculation.
if pre_calculated.shape == (0,):
raise NameError("Looks like the estimator failed. This is likely a programming issue, and should be reported.")
# If pre-calculated is not None, we'll use that instead of dataset.
# We can also assume that it's a 1 dimensional set with nothing needed, so 'key' should work.
if do_correl == True:
correl_len = mcbs_correltime(pre_calculated, autocorrel_alpha, autocorrel_n_sets)
else:
correl_len = 0
if correl_len == len(pre_calculated):
# too correlated for meaningful calculations
estimator_datasets.update(estimator_kwargs)
try:
estimator_datasets.update( { 'stride': 1 } )
except:
pass

return estimator(**estimator_datasets), pre_calculated.min(), pre_calculated.max(), (numpy.std(pre_calculated)), correl_len

# else, do a blocked bootstrap
stride = correl_len + 1

if stride == 1:
return mcbs_ci(dataset, estimator, alpha, n_sets, args, kwargs, numpy.msort) + (correl_len,)
# Some estimators may require the stride, so we pass it in.
estimator_kwargs['stride'] = stride
return mcbs_ci(dataset=estimator_datasets, estimator=estimator, alpha=alpha, dlen=dlen, n_sets=n_sets, args=args, kwargs=estimator_kwargs, sort=numpy.msort) + (correl_len,)
else:
n_slices = dlen // stride
#if dlen % stride > 0: n_slices += 1
subsample = subsample or (lambda x: x[numpy.random.randint(len(x))])
decim_set = numpy.empty((n_slices,), dtype=dataset.dtype)
for iout, istart in enumerate(xrange(0,dlen-stride+1,stride)):
sl = dataset[istart:istart+stride]
decim_set[iout] = subsample(sl)
# Let's make sure we decimate every array properly...
decim_list = {}
for key,dset in estimator_datasets.iteritems():
dset_shape = list(dset.shape)
n_slices = dset_shape[0] // stride
dset_shape[0] = n_slices
decim_set = numpy.empty((dset_shape), dtype=dset.dtype)
for iout, istart in enumerate(xrange(0,dset.shape[0]-stride+1,stride)):
sl = dset[istart:istart+stride]
# We assume time is the 0th axis.
# Okay, so non-optimal. Population requires the axis subsampling to be done just so...
try:
decim_set[iout] = subsample(sl, axis=0)
except:
decim_set[iout] = subsample(sl)
decim_list[key] = decim_set
dlen = dset_shape[0]
estimator_kwargs['stride'] = stride

return mcbs_ci(decim_set, estimator, alpha, n_sets, args, kwargs, numpy.msort) + (correl_len,)
return mcbs_ci(dataset=decim_list, estimator=estimator, alpha=alpha, dlen=dlen, n_sets=n_sets, args=args, kwargs=estimator_kwargs, sort=numpy.msort) + (correl_len,)
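
A minimal sketch of the blocked decimation that drives this bootstrap, using a hypothetical 1-D array and correlation length (numpy.mean stands in for the subsample callable):

import numpy

series = numpy.random.normal(size=1000)    # hypothetical correlated 1-D time series
correl_len = 4                              # as estimated by mcbs_correltime
stride = correl_len + 1

n_slices = series.shape[0] // stride
decimated = numpy.empty((n_slices,), dtype=series.dtype)
for iout, istart in enumerate(range(0, series.shape[0] - stride + 1, stride)):
    # keep one representative value per correlated block before resampling
    decimated[iout] = numpy.mean(series[istart:istart + stride], axis=0)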


# These are blocks designed to evaluate simple information sets.
# Whether they should go here or in west_tools is somewhat up for debate.
# Currently, nothing actually uses them, so there's that.

def _1D_simple_eval_block(iblock, start, stop, nstates, data_input, name, mcbs_alpha, mcbs_nsets, mcbs_acalpha, do_correl, mcbs_enable, subsample=numpy.mean, **extra):
# This is actually appropriate for anything with a directly measured, 1D dataset, i.e.,
# Fluxes, color populations, and state populations.
results = []
for istate in xrange(nstates):
# Not sure if we need a jstate for these estimators, but we'll see.
kwargs = { 'istate' : istate , 'jstate': 'B'}
estimator_datasets = {'dataset': data_input['dataset'][:,istate]}
ci_res = mcbs_ci_correl(estimator_datasets,estimator=(lambda stride, dataset: numpy.mean(dataset)),
alpha=mcbs_alpha,n_sets=mcbs_nsets,autocorrel_alpha=mcbs_acalpha,
subsample=subsample, do_correl=do_correl, mcbs_enable=mcbs_enable)

results.append((name, iblock,istate,(start,stop)+ci_res))

return results

def _2D_simple_eval_block(iblock, start, stop, nstates, data_input, name, mcbs_alpha, mcbs_nsets, mcbs_acalpha, do_correl, mcbs_enable, subsample=numpy.mean, **extra):
# This is really just a simple 2D block for less complex datasets, but there it is.
# It's probably limited in this use case to conditional_fluxes, but anything that's an i to j process that is directly measured
# is suitable for use with this.
results = []
for istate in xrange(nstates):
for jstate in xrange(nstates):
if istate == jstate: continue
kwargs = { 'istate' : istate, 'jstate': jstate }
#dataset = {'dataset': cond_fluxes[:, istate, jstate]}
estimator_datasets = {'dataset': data_input['dataset'][:, istate, jstate] }
ci_res = mcbs_ci_correl(estimator_datasets,estimator=(lambda stride, dataset: numpy.mean(dataset)),
alpha=mcbs_alpha,n_sets=mcbs_nsets,autocorrel_alpha=mcbs_acalpha,
subsample=subsample, do_correl=do_correl, mcbs_enable=mcbs_enable)

results.append((name, iblock, istate, jstate, (start,stop) + ci_res))

return results
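
A minimal usage sketch of the rewritten interface, following the call pattern of _1D_simple_eval_block above. The flux array, alpha, and bootstrap sizes are hypothetical, and the import assumes lib/west_tools is on the Python path:

import numpy
from mclib import mcbs_ci_correl

fluxes = numpy.random.exponential(size=500)   # hypothetical 1-D time series of fluxes
estimator_datasets = {'dataset': fluxes}      # datasets the bootstrap will slice/decimate

# The estimator receives each dataset as a keyword argument, plus the stride.
estimator = lambda stride, dataset: numpy.mean(dataset)

result = mcbs_ci_correl(estimator_datasets, estimator=estimator,
                        alpha=0.05, n_sets=1000, autocorrel_alpha=0.05,
                        subsample=numpy.mean, do_correl=True, mcbs_enable=True)
# Per the return statements above: (estimate, CI lower bound, CI upper bound, spread, correlation length)
estimate, ci_lb, ci_ub, spread, correl_len = result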