From 269f4f4851f3745179d067a22f665dbed0f0c782 Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Thu, 13 Jul 2017 10:22:36 +0300 Subject: [PATCH 1/9] Various improvements (#202) * Logging changes and improvements to docs * Improvements to clients - Improved batch submission policy - Removed unused methods - Improved docs --- elfi/client.py | 26 +++++++++++++------------- elfi/clients/ipyparallel.py | 6 +----- elfi/clients/multiprocessing.py | 6 +----- elfi/clients/native.py | 6 +----- elfi/methods/parameter_inference.py | 11 +++++------ elfi/model/elfi_model.py | 7 +++++-- 6 files changed, 26 insertions(+), 36 deletions(-) diff --git a/elfi/client.py b/elfi/client.py index 3df15c77..1c7f8b1d 100644 --- a/elfi/client.py +++ b/elfi/client.py @@ -53,13 +53,16 @@ def __init__(self, model, output_names=None, client=None): self._next_batch_index = 0 self._pending_batches = OrderedDict() - @property - def has_ready(self, batch_index=None): + def has_ready(self, any=False): + """Check if the next batch in succession is ready""" + if len(self._pending_batches) == 0: + return False + for bi, id in self._pending_batches.items(): - if batch_index and batch_index != bi: - continue if self.client.is_ready(id): return True + if not any: + break return False @property @@ -145,7 +148,7 @@ def wait_next(self): raise ValueError('Cannot wait for a batch, no batches currently submitted') batch_index, task_id = self._pending_batches.popitem(last=False) - batch = self.client.get(task_id) + batch = self.client.get_result(task_id) logger.debug('Received batch {}'.format(batch_index)) self.context.callback(batch, batch_index) @@ -164,21 +167,18 @@ def num_cores(self): class ClientBase: """Client api for serving multiple simultaneous inferences""" - # TODO: add the self.tasks dict available - # TODO: test that client is emptied from tasks as they are received - def apply(self, kallable, *args, **kwargs): - """Returns immediately with an id for the task""" + """Non-blocking apply, returns immediately with an id for the task.""" raise NotImplementedError def apply_sync(self, kallable, *args, **kwargs): - """Returns the result""" + """Blocking apply, returns the result.""" raise NotImplementedError - def get(self, task_id): - raise NotImplementedError + def get_result(self, task_id): + """Get the result of the task. 
- def wait_next(self, task_ids): + ELFI will call this only once per task_id.""" raise NotImplementedError def is_ready(self, task_id): diff --git a/elfi/clients/ipyparallel.py b/elfi/clients/ipyparallel.py index a64f88e9..37658142 100644 --- a/elfi/clients/ipyparallel.py +++ b/elfi/clients/ipyparallel.py @@ -33,14 +33,10 @@ def apply(self, kallable, *args, **kwargs): def apply_sync(self, kallable, *args, **kwargs): return self.view.apply_sync(kallable, *args, **kwargs) - def get(self, task_id): + def get_result(self, task_id): async_result = self.tasks.pop(task_id) return async_result.get() - def wait_next(self, task_ids): - # TODO: async operation, check ipyparallel.client.asyncresult _unordered_iter - raise NotImplementedError - def is_ready(self, task_id): return self.tasks[task_id].ready() diff --git a/elfi/clients/multiprocessing.py b/elfi/clients/multiprocessing.py index e0cfb202..ffb6c2c4 100644 --- a/elfi/clients/multiprocessing.py +++ b/elfi/clients/multiprocessing.py @@ -55,7 +55,7 @@ def apply_sync(self, kallable, *args, **kwargs): """ return self.pool.apply(kallable, args, kwargs) - def get(self, task_id): + def get_result(self, task_id): """Returns the result from task identified by `task_id` when it arrives. Parameters @@ -66,10 +66,6 @@ def get(self, task_id): async_result = self.tasks.pop(task_id) return async_result.get() - def wait_next(self, task_ids): - for id in task_ids: - return self.get(id) - def is_ready(self, task_id): """Return whether task with identifier `task_id` is ready. diff --git a/elfi/clients/native.py b/elfi/clients/native.py index d427826e..5b57f7a2 100644 --- a/elfi/clients/native.py +++ b/elfi/clients/native.py @@ -30,14 +30,10 @@ def apply(self, kallable, *args, **kwargs): def apply_sync(self, kallable, *args, **kwargs): return kallable(*args, **kwargs) - def get(self, task_id): + def get_result(self, task_id): kallable, args, kwargs = self.tasks.pop(task_id) return kallable(*args, **kwargs) - def wait_next(self, task_ids): - for id in task_ids: - return self.get(id) - def is_ready(self, task_id): return True diff --git a/elfi/methods/parameter_inference.py b/elfi/methods/parameter_inference.py index 353d2ec8..4a00d567 100644 --- a/elfi/methods/parameter_inference.py +++ b/elfi/methods/parameter_inference.py @@ -1,5 +1,4 @@ import logging -from collections import OrderedDict from math import ceil import matplotlib.pyplot as plt @@ -175,7 +174,7 @@ def update(self, batch, batch_index): ------- None """ - logger.debug('Received batch %d' % batch_index) + logger.info('Received batch %d' % batch_index) self.state['n_batches'] += 1 self.state['n_sim'] += self.batch_size @@ -276,6 +275,7 @@ def iterate(self): # Submit new batches if allowed while self._allow_submit(self.batches.next_index): next_batch = self.prepare_new_batch(self.batches.next_index) + logger.info("Submitting batch %d" % self.batches.next_index) self.batches.submit(next_batch) # Handle the next ready batch in succession @@ -289,7 +289,7 @@ def finished(self): def _allow_submit(self, batch_index): return self.max_parallel_batches > self.batches.num_pending and \ self._has_batches_to_submit and \ - (not self.batches.has_ready) + (not self.batches.has_ready()) @property def _has_batches_to_submit(self): @@ -462,6 +462,7 @@ def set_objective(self, n_samples, threshold=None, quantile=None, n_sim=None): self.batches.reset() def update(self, batch, batch_index): + super(Rejection, self).update(batch, batch_index) if self.state['samples'] is None: # Lazy initialization of the outputs dict 
self._init_samples_lazy(batch) @@ -532,8 +533,6 @@ def _update_state_meta(self): """ o = self.objective s = self.state - s['n_batches'] += 1 - s['n_sim'] += self.batch_size s['threshold'] = s['samples'][self.discrepancy_name][o['n_samples'] - 1].item() s['accept_rate'] = min(1, o['n_samples']/s['n_sim']) @@ -1125,7 +1124,7 @@ def sample(self, n_samples, warmup=None, n_chains=4, threshold=None, initials=No # get results from completed tasks or run sampling (client-specific) chains = [] for id in tasks_ids: - chains.append(self.client.get(id)) + chains.append(self.client.get_result(id)) chains = np.asarray(chains) diff --git a/elfi/model/elfi_model.py b/elfi/model/elfi_model.py index 9fa11d20..84b42419 100644 --- a/elfi/model/elfi_model.py +++ b/elfi/model/elfi_model.py @@ -721,9 +721,12 @@ def __init__(self, fn, *parents, **kwargs): class Discrepancy(NodeReference): - def __init__(self, discrepancy, *parents, **kwargs): - """A discrepancy node of a generative model. + """A discrepancy node of a generative model. + This class provides a convenience node for custom distance operations. + """ + def __init__(self, discrepancy, *parents, **kwargs): + """ Parameters ---------- From eb74b6d2cf8fd5ccfa6fad7388ffccc8ccb88a30 Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Thu, 13 Jul 2017 13:22:02 +0300 Subject: [PATCH 2/9] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 197ec5ea..2c53e3d6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,16 +1,9 @@ #### Summary: +Please provide a short summary here -#### Intended Effect: - -#### How to Verify: - -#### Side Effects: - -#### Successful inference in current Notebook examples: - -#### Documentation: - -#### Reviewer Suggestions: +#### Please make sure +- You have updated the CHANGELOG.rst +- You have updated the documentation (if applicable) #### Copyright and Licensing From adc44ac9867933e9c237c18300fde5ed6c60a9be Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Fri, 14 Jul 2017 09:10:36 +0300 Subject: [PATCH 3/9] Pool improvements (#204) * Refactored ComputationContext away from ElfiModel * Added an elfi.new_model method * Refactored the pools * Few fixes * Store refactoring * Refactor result classes * Added one more store test * Few additions --- CHANGELOG.rst | 10 + docs/api.rst | 3 + docs/developer/architecture.rst | 18 +- elfi/client.py | 28 ++- elfi/compiler.py | 14 +- elfi/examples/bdm.py | 2 +- elfi/examples/gauss.py | 2 +- elfi/examples/ma2.py | 2 +- elfi/examples/ricker.py | 2 +- elfi/loader.py | 7 +- elfi/methods/parameter_inference.py | 51 +++-- elfi/methods/results.py | 126 ++++++------ elfi/model/elfi_model.py | 81 +++++--- elfi/model/graphical_model.py | 4 +- elfi/store.py | 249 ++++++++++++++++------- elfi/utils.py | 14 +- tests/conftest.py | 4 +- tests/functional/test_consistency.py | 3 +- tests/functional/test_custom_outputs.py | 8 +- tests/functional/test_inference.py | 4 +- tests/functional/test_post_processing.py | 4 +- tests/functional/test_serialization.py | 3 +- tests/unit/test_bo.py | 2 +- tests/unit/test_client.py | 4 +- tests/unit/test_methods.py | 23 ++- tests/unit/test_results.py | 35 ++-- tests/unit/test_store.py | 48 +++-- tests/unit/test_tools.py | 1 + 28 files changed, 492 insertions(+), 260 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5b883728..2f90d889 100644 --- 
a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,16 @@ Changelog ========== +dev +--- + +- Fix a bug preventing the reuse of ArrayPool data with a new inference +- Added a possibility to pickle ArrayPool +- Added ArrayPool.open to read a closed pool from disk +- Refactored Sample and SmcSample classes +- Added elfi.new_model method + + 0.6 (2017-07-03) ---------------- diff --git a/docs/api.rst b/docs/api.rst index 2af520bd..2e422203 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -31,6 +31,7 @@ Below is the API for creating generative models. .. currentmodule:: elfi.model.elfi_model .. autosummary:: + elfi.new_model elfi.get_current_model elfi.set_current_model @@ -146,6 +147,8 @@ Modelling API classes .. currentmodule:: elfi.model.elfi_model +.. automethod:: elfi.new_model + .. automethod:: elfi.get_current_model .. automethod:: elfi.set_current_model diff --git a/docs/developer/architecture.rst b/docs/developer/architecture.rst index 30e9eadf..c8ed52ed 100644 --- a/docs/developer/architecture.rst +++ b/docs/developer/architecture.rst @@ -7,14 +7,16 @@ e.g. the inference methods or the data storages. This information is aimed for d and is not essential for using ELFI. We assume the reader is quite familiar with Python and has perhaps already read some of ELFI's source code. -The low level representation of the ELFI model is a ``networkx.DiGraph`` with nodes -represented as Python dictionaries that are called node state dictionaries. This -representation is held in ``ElfiModel.source_net``. Before the ELFI model can be ran, it -needs to be compiled and loaded with data (e.g. observed data, precomputed data, batch -index, batch size etc). The compilation and loading of data is the responsibility of the -``Client`` implementation and makes it possible in essence to translate ``ElfiModel`` to -any kind of computational backend. Finally the class ``Executor`` is responsible for -running the compiled and loaded model and producing the outputs of the nodes. +The low level representation of the ELFI model is a ``networkx.DiGraph`` with node names +as the nodes. The representation of the node is stored to the corresponding attribute +dictionary of the ``networkx.DiGraph``. We call this attribute dictionary the node *state* +dictionary. The ``networkx.DiGraph`` representation can be found from +``ElfiModel.source_net``. Before the ELFI model can be ran, it needs to be compiled and +loaded with data (e.g. observed data, precomputed data, batch index, batch size etc). The +compilation and loading of data is the responsibility of the ``Client`` implementation and +makes it possible in essence to translate ``ElfiModel`` to any kind of computational +backend. Finally the class ``Executor`` is responsible for running the compiled and loaded +model and producing the outputs of the nodes. A user typically creates this low level representation by working with subclasses of ``NodeReference``. 
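As a rough, illustrative sketch (not part of this changeset), the low level representation is typically created through the ``NodeReference`` subclasses along these lines; the toy simulator, priors and node names below are placeholders chosen to mirror the patterns in ``elfi/examples``::

    import numpy as np
    import elfi

    def toy_simulator(mu, batch_size=1, random_state=None):
        # ELFI passes batch_size and random_state to simulators as keyword arguments
        random_state = random_state or np.random
        return random_state.normal(mu, 1, size=(batch_size, 10))

    m = elfi.ElfiModel()
    elfi.Prior('uniform', -5, 10, model=m, name='mu')
    elfi.Simulator(toy_simulator, m['mu'], observed=toy_simulator(2.0), name='sim')
    elfi.Summary(lambda y: y.mean(axis=1), m['sim'], name='S1')
    elfi.Distance('euclidean', m['S1'], name='d')

    # Each call above added a node name and its state dictionary to m.source_net
    print(m.source_net.nodes())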
These are easy to use UI classes of ELFI such as the ``elfi.Simulator`` or diff --git a/elfi/client.py b/elfi/client.py index 1c7f8b1d..00087002 100644 --- a/elfi/client.py +++ b/elfi/client.py @@ -7,7 +7,8 @@ from elfi.executor import Executor from elfi.compiler import OutputCompiler, ObservedCompiler, AdditionalNodesCompiler, \ ReduceCompiler, RandomStateCompiler -from elfi.loader import ObservedLoader, AdditionalNodesLoader, RandomStateLoader, PoolLoader +from elfi.loader import ObservedLoader, AdditionalNodesLoader, RandomStateLoader, \ + PoolLoader from elfi.store import OutputPool logger = logging.getLogger(__name__) @@ -45,10 +46,12 @@ class BatchHandler: Responsible for sending computational graphs to be executed in an Executor """ - def __init__(self, model, output_names=None, client=None): - self.client = client or get_client() - self.compiled_net = self.client.compile(model.source_net, output_names) - self.context = model.computation_context + def __init__(self, model, context, output_names=None, client=None): + client = client or get_client() + + self.compiled_net = client.compile(model.source_net, output_names) + self.context = context + self.client = client self._next_batch_index = 0 self._pending_batches = OrderedDict() @@ -168,7 +171,17 @@ class ClientBase: """Client api for serving multiple simultaneous inferences""" def apply(self, kallable, *args, **kwargs): - """Non-blocking apply, returns immediately with an id for the task.""" + """Non-blocking apply, returns immediately with an id for the task. + + Parameters + ---------- + kallable : callable + args + Positional arguments for the kallable + kwargs + Keyword arguments for the kallable + + """ raise NotImplementedError def apply_sync(self, kallable, *args, **kwargs): @@ -223,7 +236,8 @@ def compile(cls, source_net, outputs=None): logger.warning("Compiling for no outputs!") outputs = outputs if isinstance(outputs, list) else [outputs] - compiled_net = nx.DiGraph(outputs=outputs, name=source_net.graph['name']) + compiled_net = nx.DiGraph(outputs=outputs, name=source_net.graph['name'], + observed=source_net.graph['observed']) compiled_net = OutputCompiler.compile(source_net, compiled_net) compiled_net = ObservedCompiler.compile(source_net, compiled_net) diff --git a/elfi/compiler.py b/elfi/compiler.py index e278cc86..49c3dbd9 100644 --- a/elfi/compiler.py +++ b/elfi/compiler.py @@ -41,14 +41,16 @@ def compile(cls, source_net, compiled_net): for name, data in compiled_net.nodes_iter(data=True): state = source_net.node[name] if '_output' in state and '_operation' in state: - raise ValueError("Cannot compile: both _output and _operation present for node '{}'".format(name)) + raise ValueError("Cannot compile: both _output and _operation present " + "for node '{}'".format(name)) if '_output' in state: data['output'] = state['_output'] elif '_operation' in state: data['operation'] = state['_operation'] else: - raise ValueError("Cannot compile, no _output or _operation present for node '{}'".format(name)) + raise ValueError("Cannot compile, no _output or _operation present for " + "node '{}'".format(name)) return compiled_net @@ -85,17 +87,17 @@ def compile(cls, source_net, compiled_net): else: link_parent = parent - compiled_net.add_edge(link_parent, obs_node, source_net[parent][node].copy()) + compiled_net.add_edge(link_parent, obs_node, + source_net[parent][node].copy()) # Check that there are no stochastic nodes in the ancestors - # TODO: move to loading phase when checking that stochastic observables get their data? 
for node in uses_observed: # Use the observed version to query observed ancestors in the compiled_net obs_node = observed_name(node) for ancestor_node in nx.ancestors(compiled_net, obs_node): if '_stochastic' in source_net.node.get(ancestor_node, {}): - raise ValueError("Observed nodes must be deterministic. Observed data" - "depends on a non-deterministic node {}." + raise ValueError("Observed nodes must be deterministic. Observed " + "data depends on a non-deterministic node {}." .format(ancestor_node)) return compiled_net diff --git a/elfi/examples/bdm.py b/elfi/examples/bdm.py index f6c7f894..0d0c0966 100644 --- a/elfi/examples/bdm.py +++ b/elfi/examples/bdm.py @@ -118,7 +118,7 @@ def get_model(alpha=0.2, delta=0, tau=0.198, N=20, seed_obs=None): else: y = BDM(alpha, delta, tau, N, random_state=np.random.RandomState(seed_obs)) - m = elfi.ElfiModel(name='bdm', set_current=False) + m = elfi.ElfiModel(name='bdm') elfi.Prior('uniform', .005, 2, model=m, name='alpha') elfi.Simulator(BDM, m['alpha'], delta, tau, N, observed=y, name='BDM') elfi.Summary(T1, m['BDM'], name='T1') diff --git a/elfi/examples/gauss.py b/elfi/examples/gauss.py index b732a145..4e29d6b3 100644 --- a/elfi/examples/gauss.py +++ b/elfi/examples/gauss.py @@ -56,7 +56,7 @@ def get_model(n_obs=50, true_params=None, seed_obs=None): random_state=np.random.RandomState(seed_obs)) sim_fn = partial(Gauss, n_obs=n_obs) - m = elfi.ElfiModel(set_current=False) + m = elfi.ElfiModel() elfi.Prior('uniform', -10, 50, model=m, name='mu') elfi.Prior('truncnorm', 0.01, 5, model=m, name='sigma') elfi.Simulator(sim_fn, m['mu'], m['sigma'], observed=y_obs, name='Gauss') diff --git a/elfi/examples/ma2.py b/elfi/examples/ma2.py index 7c04943b..ec9182c7 100644 --- a/elfi/examples/ma2.py +++ b/elfi/examples/ma2.py @@ -64,7 +64,7 @@ def get_model(n_obs=100, true_params=None, seed_obs=None): y = MA2(*true_params, n_obs=n_obs, random_state=np.random.RandomState(seed_obs)) sim_fn = partial(MA2, n_obs=n_obs) - m = elfi.ElfiModel(set_current=False) + m = elfi.ElfiModel() elfi.Prior(CustomPrior1, 2, model=m, name='t1') elfi.Prior(CustomPrior2, m['t1'], 1, name='t2') elfi.Simulator(sim_fn, m['t1'], m['t2'], observed=y, name='MA2') diff --git a/elfi/examples/ricker.py b/elfi/examples/ricker.py index 0d24f6f8..df2918df 100644 --- a/elfi/examples/ricker.py +++ b/elfi/examples/ricker.py @@ -109,7 +109,7 @@ def get_model(n_obs=50, true_params=None, seed_obs=None, stochastic=True): if true_params is None: true_params = [3.8] - m = elfi.ElfiModel(set_current=False) + m = elfi.ElfiModel() y_obs = simulator(*true_params, n_obs=n_obs, random_state=np.random.RandomState(seed_obs)) sim_fn = partial(simulator, n_obs=n_obs) sumstats = [] diff --git a/elfi/loader.py b/elfi/loader.py index b2f73e12..f2ea859c 100644 --- a/elfi/loader.py +++ b/elfi/loader.py @@ -32,12 +32,15 @@ class ObservedLoader(Loader): @classmethod def load(cls, context, compiled_net, batch_index): - for name, obs in context.observed.items(): + observed = compiled_net.graph['observed'] + + for name, obs in observed.items(): obs_name = observed_name(name) if not compiled_net.has_node(obs_name): continue compiled_net.node[obs_name] = dict(output=obs) + del compiled_net.graph['observed'] return compiled_net @@ -69,7 +72,7 @@ def load(cls, context, compiled_net, batch_index): batch = context.pool.get_batch(batch_index) - for node in context.pool.output_stores: + for node in context.pool.stores: if not compiled_net.has_node(node): continue elif node in batch: diff --git 
a/elfi/methods/parameter_inference.py b/elfi/methods/parameter_inference.py index 4a00d567..d4eeea2c 100644 --- a/elfi/methods/parameter_inference.py +++ b/elfi/methods/parameter_inference.py @@ -87,16 +87,14 @@ def __init__(self, model, output_names, batch_size=1000, seed=None, pool=None, self.model = model.copy() self.output_names = self._check_outputs(output_names) - # Prepare the computation_context - context = ComputationContext( - batch_size=batch_size, - seed=seed, - observed=model.computation_context.observed, - pool=pool - ) - self.model.computation_context = context self.client = elfi.client.get_client() - self.batches = elfi.client.BatchHandler(self.model, output_names=output_names, client=self.client) + + # Prepare the computation_context + context = ComputationContext(batch_size=batch_size, seed=seed, pool=pool) + self.batches = elfi.client.BatchHandler(self.model, context=context, + output_names=output_names, + client=self.client) + self.computation_context = context self.max_parallel_batches = max_parallel_batches or self.client.num_cores if self.max_parallel_batches <= 0: @@ -116,12 +114,12 @@ def __init__(self, model, output_names, batch_size=1000, seed=None, pool=None, @property def pool(self): """Return the output pool of the inference.""" - return self.model.computation_context.pool + return self.computation_context.pool @property def seed(self): """Return the seed of the inference.""" - return self.model.computation_context.seed + return self.computation_context.seed @property def parameter_names(self): @@ -131,7 +129,7 @@ def parameter_names(self): @property def batch_size(self): """Return the current batch_size.""" - return self.model.computation_context.batch_size + return self.computation_context.batch_size def set_objective(self, *args, **kwargs): """Set the objective of the inference. 
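A brief sketch of how the refactored context is reached from user code (illustrative only; the MA2 example model, pool contents and sampler settings are arbitrary stand-ins, not part of this diff)::

    import elfi
    from elfi.examples import ma2
    from elfi.store import OutputPool

    m = ma2.get_model(seed_obs=1)
    pool = OutputPool(['S1', 'S2', 'd'])
    rej = elfi.Rejection(m['d'], batch_size=500, seed=42, pool=pool)

    # The seed, batch size and pool now live in the inference method's own
    # ComputationContext instead of on the model
    print(rej.seed, rej.batch_size, rej.pool is pool)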
@@ -600,9 +598,17 @@ def set_objective(self, n_samples, thresholds): self._init_new_round() def extract_result(self): + """ + + Returns + ------- + SmcSample + """ pop = self._extract_population() return SmcSample(outputs=pop.outputs, populations=self._populations.copy() + [pop], + weights=pop.weights, + n_batches=pop.n_batches, **self._extract_result_kwargs()) def update(self, batch, batch_index): @@ -651,13 +657,14 @@ def _init_new_round(self): threshold=self.current_population_threshold) def _extract_population(self): - pop = self._rejection.extract_result() - pop.method_name = "Rejection within SMC-ABC" - w, cov = self._compute_weights_and_cov(pop) - pop.weights = w - pop.cov = cov - pop.n_batches = self._rejection.state['n_batches'] - return pop + sample = self._rejection.extract_result() + # Append the sample object + sample.method_name = "Rejection within SMC-ABC" + w, cov = self._compute_weights_and_cov(sample) + sample.weights = w + sample.meta['cov'] = cov + sample.meta['n_batches'] = self._rejection.state['n_batches'] + return sample def _compute_weights_and_cov(self, pop): params = np.column_stack(tuple([pop.outputs[p] for p in self.parameter_names])) @@ -701,9 +708,9 @@ def _update_objective(self): @property def _gm_params(self): - pop_ = self._populations[-1] - params_ = np.column_stack(tuple([pop_.samples[p] for p in self.parameter_names])) - return params_, pop_.cov, pop_.weights + sample = self._populations[-1] + params = sample.samples_array + return params, sample.cov, sample.weights @property def current_population_threshold(self): diff --git a/elfi/methods/results.py b/elfi/methods/results.py index 1869d3d8..08c062a8 100644 --- a/elfi/methods/results.py +++ b/elfi/methods/results.py @@ -49,13 +49,12 @@ def __init__(self, x_min, **kwargs): self.x_min = x_min -# TODO: refactor class Sample(ParameterInferenceResult): """Sampling results from the methods. """ def __init__(self, method_name, outputs, parameter_names, discrepancy_name=None, - **kwargs): + weights=None, **kwargs): """ Parameters @@ -71,21 +70,17 @@ def __init__(self, method_name, outputs, parameter_names, discrepancy_name=None, self.samples = OrderedDict() for n in self.parameter_names: self.samples[n] = self.outputs[n] - if discrepancy_name is not None: - self.discrepancy = self.outputs[discrepancy_name] - self.n_samples = len(self.outputs[self.parameter_names[0]]) - self.n_params = len(self.parameter_names) + self.discrepancy_name = discrepancy_name + self.weights = weights def __getattr__(self, item): """Allows more convenient access to items under self.meta. """ - if item in self.__dict__: - return self.item - elif item in self.meta.keys(): + if item in self.meta.keys(): return self.meta[item] else: - raise AttributeError("No attribute '{}' in this Result".format(item)) + raise AttributeError("No attribute '{}' in this sample".format(item)) def __dir__(self): """Allows autocompletion for items under self.meta. @@ -96,62 +91,71 @@ def __dir__(self): return items @property - def samples_list(self): - """ - Return the samples as a list in the same order as in the OrderedDict samples. 
+ def n_samples(self): + return len(self.outputs[self.parameter_names[0]]) - Returns - ------- - list of np.arrays - """ - return list(self.samples.values()) + @property + def dim(self): + return len(self.parameter_names) + + @property + def discrepancies(self): + return None if self.discrepancy_name is None else \ + self.outputs[self.discrepancy_name] @property - def names_list(self): + def samples_array(self): """ - Return the parameter names as a list in the same order as in the OrderedDict samples. + Return the samples as an array with columns in the same order as in + self.parameter_names. Returns ------- - list of strings + list of np.arrays """ - return list(self.samples.keys()) + return np.column_stack(tuple(self.samples.values())) def __str__(self): # create a buffer for capturing the output from summary's print statement stdout0 = sys.stdout buffer = io.StringIO() sys.stdout = buffer - self.summary + self.summary() sys.stdout = stdout0 # revert to original stdout return buffer.getvalue() def __repr__(self): return self.__str__() - @property def summary(self): """Print a verbose summary of contained results. """ # TODO: include __str__ of Inference Task, seed? - desc = "Method: {}\nNumber of posterior samples: {}\n"\ + desc = "Method: {}\nNumber of samples: {}\n"\ .format(self.method_name, self.n_samples) if hasattr(self, 'n_sim'): desc += "Number of simulations: {}\n".format(self.n_sim) if hasattr(self, 'threshold'): desc += "Threshold: {:.3g}\n".format(self.threshold) print(desc, end='') - self.posterior_means + self.summary_sample_means() - # TODO: return the actual values, add axis=0 - @property - def posterior_means(self): + def summary_sample_means(self): """Print a representation of posterior means. """ - s = "Posterior means: " - s += ', '.join(["{}: {:.3g}".format(k, np.mean(v)) for k,v in self.samples.items()]) + s = "Sample means: " + s += ', '.join(["{}: {:.3g}".format(k, v) for k,v in self.sample_means.items()]) print(s) + @property + def sample_means(self): + return OrderedDict([(k, np.average(v, axis=0, weights=self.weights)) for \ + k,v in self.samples.items()]) + + @property + def sample_means_array(self): + return np.array(list(self.sample_means.values())) + def plot_marginals(self, selector=None, bins=20, axes=None, **kwargs): """Plot marginal distributions for parameters. @@ -192,41 +196,45 @@ def plot_pairs(self, selector=None, bins=20, axes=None, **kwargs): class SmcSample(Sample): """Container for results from SMC-ABC. """ - def __init__(self, *args, **kwargs): - super(SmcSample, self).__init__(*args, **kwargs) - self.n_populations = len(self.populations) + def __init__(self, method_name, outputs, parameter_names, populations, *args, + **kwargs): + """ - @property - def posterior_means(self): - """Print a representation of posterior means. 
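For orientation, a hedged sketch of how the refactored ``Sample`` accessors are intended to be used (the MA2 model and the sampler settings below are placeholders, not part of this diff)::

    import elfi
    from elfi.examples import ma2

    m = ma2.get_model(seed_obs=3)
    res = elfi.Rejection(m['d'], batch_size=1000, seed=0).sample(100, quantile=0.01)

    res.summary()                   # summary is now a method, not a property
    print(res.samples_array.shape)  # columns follow res.parameter_names
    print(res.sample_means)         # OrderedDict of means (weighted if weights are set)
    print(res.discrepancies[:5])    # previously res.discrepancy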
+ Parameters + ---------- + method_name : str + outputs : dict + parameter_names : list + populations : list[Sample] + List of Sample objects + args + kwargs """ - s = self.populations[-1].samples_list - w = self.populations[-1].weights - n = self.names_list - out = '' - out += "Posterior means for final population: " - out += ', '.join(["{}: {:.3g}".format(n[jj], np.average(s[jj], weights=w, axis=0)) - for jj in range(self.n_params)]) - print(out) + super(SmcSample, self).__init__(method_name=method_name, outputs=outputs, + parameter_names=parameter_names, *args, **kwargs) + self.populations = populations + + if self.weights is None: + raise ValueError("No weights provided for the sample") @property - def posterior_means_all_populations(self): - """Print a representation of posterior means for all populations. + def n_populations(self): + return len(self.populations) - Returns - ------- - out : string + @property + def sample_means_all_populations(self): + """Return a list of sample means for all populations """ - samples = [pop.samples_list for pop in self.populations] - weights = [pop.weights for pop in self.populations] - n = self.names_list + means = [] + for i in range(self.n_populations): + means.append(self.populations[i].sample_means) + return means + + def summary_sample_means_all_populations(self): out = '' - for ii in range(self.n_populations): - s = samples[ii] - w = weights[ii] - out += "Posterior means for population {}: ".format(ii) - out += ', '.join(["{}: {:.3g}".format(n[jj], np.average(s[jj], weights=w, axis=0)) - for jj in range(self.n_params)]) + for i, means in enumerate(self.sample_means_all_populations): + out += "Sample means for population {}: ".format(i) + out += ', '.join(["{}: {:.3g}".format(k, v) for k, v in means.items()]) out += '\n' print(out) diff --git a/elfi/model/elfi_model.py b/elfi/model/elfi_model.py index 84b42419..4249ee5c 100644 --- a/elfi/model/elfi_model.py +++ b/elfi/model/elfi_model.py @@ -11,12 +11,12 @@ from elfi.model.graphical_model import GraphicalModel from elfi.model.utils import rvs_from_distribution, distance_as_discrepancy from elfi.store import OutputPool -from elfi.utils import scipy_from_str, observed_name +from elfi.utils import scipy_from_str, observed_name, random_seed __all__ = ['ElfiModel', 'ComputationContext', 'NodeReference', 'Constant', 'Operation', 'RandomVariable', 'Prior', 'Simulator', 'Summary', 'Discrepancy', 'Distance', - 'get_current_model', 'set_current_model'] + 'get_current_model', 'set_current_model', 'new_model'] """This module contains the classes for creating generative models in ELFI. The class that @@ -47,11 +47,18 @@ def set_current_model(model=None): _current_model = model +def new_model(set_current=True): + model = ElfiModel() + if set_current: + set_current_model(model) + return model + + def random_name(length=4, prefix=''): return prefix + str(uuid.uuid4().hex[0:length]) -# TODO: make this a property of algorithm that runs the inference +# TODO: move to another file class ComputationContext: """ @@ -59,14 +66,13 @@ class ComputationContext: ---------- seed : int batch_size : int - observed : dict pool : elfi.OutputPool num_submissions : int Number of submissions using this context. """ - def __init__(self, batch_size=None, seed=None, observed=None, pool=None): + def __init__(self, batch_size=None, seed=None, pool=None): """ Parameters @@ -79,12 +85,16 @@ def __init__(self, batch_size=None, seed=None, observed=None, pool=None): pool : elfi.OutputPool """ - - # Extract the seed from numpy RandomState. 
Alternative would be to use - # os.urandom(4) casted as int. - self.seed = np.random.RandomState().get_state()[1][0] if seed is None else seed self.batch_size = batch_size or 1 - self.observed = observed or {} + + # Synchronize the seed with the pool + if seed is None: + if pool is not None and pool.seed: + seed = pool.seed + else: + seed = random_seed() + + self.seed = seed # Count the number of submissions from this context self.num_submissions = 0 @@ -98,41 +108,39 @@ def pool(self): @pool.setter def pool(self, pool): if pool is not None: - pool.init_context(self) + if not pool.context_set: + pool.set_context(self) + if pool.seed != self.seed: + raise ValueError('Pool seed differs from the given seed!') self._pool = pool def callback(self, batch, batch_index): - if self.pool: + if self.pool is not None: self.pool.add_batch(batch, batch_index) def copy(self): return copy.copy(self) -# TODO: make a `elfi.new_model` function and move the `set_current` functionality to there class ElfiModel(GraphicalModel): """A generative model for LFI """ - def __init__(self, name=None, source_net=None, computation_context=None, - set_current=True): + def __init__(self, name=None, observed=None, source_net=None): """ Parameters ---------- name : str, optional + observed : dict, optional + Observed data with node names as keys. source_net : nx.DiGraph, optional - computation_context : elfi.ComputationContext, optional set_current : bool, optional Sets this model as the current ELFI model """ super(ElfiModel, self).__init__(source_net) self.name = name or "model_{}".format(random_name()) - # TODO: remove computation context from model - self.computation_context = computation_context or ComputationContext() - - if set_current: - set_current_model(self) + self.observed = observed or {} @property def name(self): @@ -144,6 +152,25 @@ def name(self, name): """Sets the name of the model""" self.source_net.graph['name'] = name + @property + def observed(self): + """The observed data for the nodes in a dictionary.""" + return self.source_net.graph['observed'] + + @observed.setter + def observed(self, observed): + """Set the observed data of the model + + Parameters + ---------- + observed : dict + + """ + if not isinstance(observed, dict): + raise ValueError("Observed data must be given in a dictionary with the node" + "name as the key") + self.source_net.graph['observed'] = observed + def generate(self, batch_size=1, outputs=None, with_values=None): """Generates a batch of outputs using the global seed. 
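A small illustrative sketch of the relocated observed data and of ``generate`` (the MA2 example model is used only as a stand-in)::

    import elfi
    from elfi.examples import ma2

    m = ma2.get_model(seed_obs=5)

    # Observed data now lives in the graph attributes of source_net
    assert m.observed is m.source_net.graph['observed']
    print(list(m.observed.keys()))

    # generate() builds a temporary ComputationContext that uses the global seed
    batch = m.generate(batch_size=10, outputs=['MA2', 't1'])
    print(batch['MA2'].shape, batch['t1'].shape)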
@@ -165,7 +192,7 @@ def generate(self, batch_size=1, outputs=None, with_values=None): if not isinstance(outputs, list): raise ValueError('Outputs must be a list of node names') - context = self.computation_context.copy() + context = ComputationContext() # Use the global random_state context.seed = 'global' context.batch_size = batch_size @@ -225,11 +252,6 @@ def remove_node(self, name): self.observed.pop(name) super(ElfiModel, self).remove_node(name) - @property - def observed(self): - """The observed data for the nodes in a dictionary.""" - return self.computation_context.observed - @property def parameter_names(self): """A list of model parameter names in an alphabetical order.""" @@ -270,8 +292,7 @@ def copy(self): ElfiModel """ - kopy = super(ElfiModel, self).copy(set_current=False) - kopy.computation_context = self.computation_context.copy() + kopy = super(ElfiModel, self).copy() kopy.name = "{}_copy_{}".format(self.name, random_name()) return kopy @@ -546,7 +567,7 @@ def __init__(self, *parents, state, observed=None, **kwargs): # Set the observed value if observed is not None: - self.model.computation_context.observed[self.name] = observed + self.model.observed[self.name] = observed @property def observed(self): diff --git a/elfi/model/graphical_model.py b/elfi/model/graphical_model.py index 046453c0..6ff000b3 100644 --- a/elfi/model/graphical_model.py +++ b/elfi/model/graphical_model.py @@ -107,8 +107,8 @@ def nodes(self): """Returns a list of nodes""" return self.source_net.nodes() - def copy(self, *args, **kwargs): - kopy = self.__class__(*args, **kwargs) + def copy(self): + kopy = self.__class__() # Copy the source net kopy.source_net = nx.DiGraph(self.source_net) return kopy diff --git a/elfi/store.py b/elfi/store.py index ca2decf0..36439c84 100644 --- a/elfi/store.py +++ b/elfi/store.py @@ -1,6 +1,7 @@ import os import io import shutil +import pickle import numpy as np import numpy.lib.format as npformat @@ -17,6 +18,7 @@ class OutputPool: compatible store. """ + def __init__(self, outputs=None): """ @@ -28,23 +30,33 @@ def __init__(self, outputs=None): Parameters ---------- - outputs : list - list of node names which to store. `OutputPool` will create a regular - dictionary as a store for those nodes. + outputs : list, dict, optional + list of node names which to store or a dictionary with existing stores. The + stores are created on demand. Returns ------- instance : OutputPool """ - self.output_stores = dict() - outputs = outputs or {} - for output in outputs: - self.output_stores[output] = dict() + if outputs is None: + stores = {} + elif isinstance(outputs, dict): + stores = outputs + else: + stores = dict.fromkeys(outputs) + + self.stores = stores + + # Context information self.batch_size = None self.seed = None - def init_context(self, context): + @property + def context_set(self): + return self.seed is not None and self.batch_size is not None + + def set_context(self, context): """Sets the context of the pool for identifying the batch size and seed for which these results are computed. @@ -56,11 +68,14 @@ def init_context(self, context): ------- None """ + if self.context_set: + raise ValueError('Context is already set') + self.batch_size = context.batch_size self.seed = context.seed def get_batch(self, batch_index, outputs=None): - """Returns a batch from the stores. + """Returns a batch from the stores of the pool. 
Parameters ---------- @@ -72,33 +87,42 @@ def get_batch(self, batch_index, outputs=None): ------- batch : dict """ + outputs = outputs or self.outputs batch = dict() for output in outputs: - store = self.output_stores[output] + store = self.stores[output] + if store is None: + continue if batch_index in store: batch[output] = store[batch_index] return batch - def __getitem__(self, node): - return self.output_stores[node] - def add_batch(self, batch, batch_index): """Adds the outputs from the batch to their stores.""" - for node, store in self.output_stores.items(): - if node not in batch: + for node, values in batch.items(): + if node not in self.stores: continue - # Do not add again. With the same pool the results should be the same. + store = self._get_store_for(node) + + # Do not add again. The output should be the same. if batch_index in store: continue - store[batch_index] = batch[node] + + store[batch_index] = values def remove_batch(self, batch_index): - """Removes the batch from all stores.""" - for store in self.output_stores.values(): + """Removes the batch from all the stores.""" + for store in self.stores.values(): if batch_index in store: del store[batch_index] + def has_store(self, name): + return name in self.stores + + def get_store(self, name): + return self.stores[name] + def add_store(self, name, store=None): """Adds a store object for a node with name `name`. @@ -112,8 +136,11 @@ def add_store(self, name, store=None): None """ - store = store or {} - self.output_stores[name] = store + if name in self.stores and self.stores[name] is not None: + raise ValueError("Store for '{}' already exists".format(name)) + + store = store or self._make_store_for(name) + self[name] = store def remove_store(self, name): """Removes a store from the pool @@ -128,34 +155,57 @@ def remove_store(self, name): store The removed store """ - store = self.output_stores.pop(name) + store = self.stores.pop(name) return store - def __setitem__(self, node, store): - self.output_stores[node] = store + def _get_store_for(self, name): + if self.stores[name] is None: + self.stores[name] = self._make_store_for(name) + return self.stores[name] - def __contains__(self, node): - return node in self.output_stores + def _make_store_for(self, name): + """Make a default store for a node + + All the default stores will be created through this method. + """ + return {} + + def __len__(self): + """Largest batch index in any of the stores""" + l = 0 + for output, store in self.stores.items(): + if store is None: + continue + l = max(l, len(store)) + return l + + def __getitem__(self, batch_index): + """Return the batch""" + return self.get_batch(batch_index) + + def __setitem__(self, batch_index, batch): + return self.add_batch(batch, batch_index) + + def __contains__(self, batch_index): + return len(self) > batch_index def clear(self): """Removes all data from the stores""" - for store in self.output_stores.values(): + for store in self.stores.values(): store.clear() @property def outputs(self): - return self.output_stores.keys() + return list(self.stores.keys()) # TODO: Make it easier to load ArrayPool with just a name. # we could store the context to the pool folder, and drop the use of a seed in the # folder name -# TODO: Extend to general arrays. -# This probably requires using a mask class ArrayPool(OutputPool): - """Store node outputs to arrays. + """Store node outputs to .npy arrays. 
- The default medium for output data is a numpy binary `.npy` file, that stores array + The default store for output data is a numpy binary `.npy` file, that stores array data. Separate files will be created for different nodes. Notes @@ -163,12 +213,10 @@ class ArrayPool(OutputPool): Internally the `elfi.ArrayPool` will create an `elfi.store.BatchArrayStore' object wrapping a `NpyPersistedArray` for each output. The `elfi.store.NpyPersistedArray` - object is responsible for managing the `npy` file. - - One can use also any other type of array with `elfi.store.BatchArrayStore`. The only - requirement is that the array supports Python list indexing to access the data.""" + object is responsible for managing the `.npy` file. + """ - def __init__(self, outputs, name='arraypool', basepath=None): + def __init__(self, outputs, name=None, path=None): """ Parameters @@ -177,9 +225,9 @@ def __init__(self, outputs, name='arraypool', basepath=None): name of nodes whose output to store to a numpy .npy file. name : str Name of the pool. This will be part of the path where the data are stored. - basepath : str + path : str Path to directory under which `elfi.ArrayPool` will place its folders and - files. Default is ~/.elfi + files. Default is ./pools, where . is the current working directory. Returns ------- @@ -187,55 +235,96 @@ def __init__(self, outputs, name='arraypool', basepath=None): """ super(ArrayPool, self).__init__(outputs) + if name is not None: + # TODO: load the pool with this name + pass self.name = name - self.basepath = basepath or os.path.join(os.path.expanduser('~'), '.elfi') - os.makedirs(self.basepath, exist_ok=True) - - def init_context(self, context): - super(ArrayPool, self).init_context(context) - - os.makedirs(self.path) - - # Create the arrays and replace the output dicts with arrays - for output in self.outputs: - filename = os.path.join(self.path, output) - array = NpyPersistedArray(filename) - self.output_stores[output] = BatchArrayStore(array, self.batch_size) + self.path = path or self._default_path() + os.makedirs(self.path, exist_ok=True) @property - def path(self): + def arraypath(self): """Path to where the array files are stored. Returns ------- path : str """ - if self.seed is None: - raise ValueError('Pool must be initialized with a context (pool.init_context)') - return os.path.join(self.basepath, self.name, str(self.seed)) + if self.name is None: + return None + return self._arraypath(self.name, self.path) + + def _make_store_for(self, name): + if not self.context_set: + raise ValueError('Arraypool has no context set') + if self.name is None: + self.name = 'arraypool_{}'.format(self.seed) + os.makedirs(self.arraypath) + + filename = os.path.join(self.arraypath, name) + array = NpyPersistedArray(filename) + return BatchArrayStore(array, self.batch_size) def delete(self): """Removes the folder and all the data in this pool.""" - try: - path = self.path - except: - # Pool was not initialized + if self.arraypath is None: return + self.close() - shutil.rmtree(path) + shutil.rmtree(self.arraypath) def close(self): - """Closes the array files of the stores.""" - for store in self.output_stores.values(): + """Closes the array files of the stores and store the pool to the disk. + + You can reopen the pool with ArrayPool.open. 
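A short usage sketch of the close/open cycle, mirroring the pattern exercised in ``tests/unit/test_store.py`` (the model, node names and sizes are placeholders)::

    import elfi
    from elfi.examples import ma2
    from elfi.store import ArrayPool

    m = ma2.get_model(seed_obs=7)
    pool = ArrayPool(['MA2', 'S1'])

    rej = elfi.Rejection(m['d'], batch_size=100, pool=pool)
    rej.sample(50, n_sim=1000)

    pool.close()                      # flushes the .npy files and pickles the pool
    pool = ArrayPool.open(pool.name)  # reopen later; default location is ./pools
    print(len(pool))                  # number of stored batches, here 10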
+ """ + for store in self.stores.values(): if hasattr(store, 'array') and hasattr(store.array, 'close'): store.array.close() + try: + filename = os.path.join(self.arraypath, self._pkl_name()) + pickle.dump(self, open(filename, "wb" ) ) + except: + raise ValueError('Pickling of the pool object failed. Please check that your ' + 'stores and data are pickelable. Arrays were stored to disk ' + 'succesfully.') def flush(self): """Flushes all array files of the stores.""" - for store in self.output_stores.values(): + for store in self.stores.values(): if hasattr(store, 'array') and hasattr(store.array, 'flush'): store.array.flush() + @classmethod + def open(cls, name, path=None): + """Open a closed ArrayPool from disk + + Parameters + ---------- + name : str + path : str, optional + + Returns + ------- + + """ + path = path or cls._default_path() + filename = os.path.join(cls._arraypath(name, path), cls._pkl_name()) + return pickle.load(open(filename, "rb" )) + + @classmethod + def _pkl_name(cls): + return cls.__name__.lower() + '.pkl' + + @classmethod + def _arraypath(cls, name, path): + return os.path.join(path, name) + + @classmethod + def _default_path(cls): + return os.path.join(os.getcwd(), 'pools') + + class BatchStore: """Stores batches for a single node""" @@ -418,10 +507,10 @@ def size(self): def append(self, array): """Append data from array to self.""" - if self.fs is None: - raise ValueError('Array has been deleted') + if self.closed: + raise ValueError('Array is not opened.') - if self.header_length is None: + if not self.initialized: self._init_from_array(array) if array.shape[1:] != self.shape[1:]: @@ -514,7 +603,7 @@ def truncate(self, length=0): self.fs.truncate() def close(self): - if self.header_length: + if self.initialized: self._write_header_data() self.fs.close() @@ -523,7 +612,7 @@ def clear(self): def delete(self): """Removes the file and invalidates this array""" - if not self.fs: + if self.deleted: return name = self.fs.name self.close() @@ -571,3 +660,25 @@ def _write_header_data(self): # Flag bytes off as they are now written self._header_bytes_to_write = None + + @property + def deleted(self): + return self.fs is None + + @property + def closed(self): + return self.deleted or self.fs.closed + + @property + def initialized(self): + return (not self.closed) and (self.header_length is not None) + + def __getstate__(self): + if not self.fs.closed: + self.flush() + return {'name': self.name} + + def __setstate__(self, state): + name = state.pop('name') + self.__init__(name) + diff --git a/elfi/utils.py b/elfi/utils.py index c2c621c1..7d35200e 100644 --- a/elfi/utils.py +++ b/elfi/utils.py @@ -1,3 +1,5 @@ +import uuid + import scipy.stats as ss import numpy as np import networkx as nx @@ -18,6 +20,16 @@ def scipy_from_str(name): return getattr(ss, name) +def random_seed(): + # Extract the seed from numpy RandomState. Alternative would be to use + # os.urandom(4) casted as int. 
+ return np.random.RandomState().get_state()[1][0] + + +def random_name(length=4, prefix=''): + return prefix + str(uuid.uuid4().hex[0:length]) + + def observed_name(name): return "_{}_observed".format(name) @@ -83,4 +95,4 @@ def get_sub_seed(random_state, sub_seed_index, high=2**31): seen.update(sub_seeds) n_unique = len(seen) - return sub_seeds[-1] \ No newline at end of file + return sub_seeds[-1] diff --git a/tests/conftest.py b/tests/conftest.py index 8d111f24..9d775dc5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,6 @@ import logging import time import os -import sys import numpy as np import pytest @@ -70,7 +69,8 @@ def use_logging(): @pytest.fixture() def skip_travis(): if "TRAVIS" in os.environ and os.environ['TRAVIS'] == "true": - pytest.skip("Skipping this test in Travis CI due to very slow run-time. Tested locally!") + pytest.skip("Skipping this test in Travis CI due to very slow run-time. Tested " + "locally!") """Model fixtures""" diff --git a/tests/functional/test_consistency.py b/tests/functional/test_consistency.py index 5e6b39b4..a32a3a16 100644 --- a/tests/functional/test_consistency.py +++ b/tests/functional/test_consistency.py @@ -90,7 +90,8 @@ def test_bo(ma2): assert np.array_equal(res.x_min, res_same.x_min) -@pytest.mark.usefixtures('with_all_clients') +# TODO: skipped in travis due to NUTS initialization failing too often. Should be fixed. +@pytest.mark.usefixtures('with_all_clients', 'skip_travis') def test_bolfi(ma2): bs = 2 n_samples = 4 diff --git a/tests/functional/test_custom_outputs.py b/tests/functional/test_custom_outputs.py index 75549850..58b004d3 100644 --- a/tests/functional/test_custom_outputs.py +++ b/tests/functional/test_custom_outputs.py @@ -41,7 +41,7 @@ def test_dict_output(): obs = simulator([.2, .8]) - elfi.ElfiModel() + elfi.new_model() p = elfi.Prior('dirichlet', [2, 2]) sim = elfi.Simulator(vsim, p, observed=obs) S = elfi.Summary(vsum, sim) @@ -53,7 +53,7 @@ def test_dict_output(): mean = np.mean(sample.samples['p'], axis=0) # Crude test - assert mean[1] - mean[0] > .2 + assert mean[1] > mean[0] def test_list_output(): @@ -72,7 +72,7 @@ def test_list_output(): obs = lsimulator([.2, .8]) - elfi.ElfiModel() + elfi.new_model() p = elfi.Prior('dirichlet', [2, 2]) sim = elfi.Simulator(vsim, p, observed=obs) S = elfi.Summary(vsum, sim) @@ -84,4 +84,4 @@ def test_list_output(): mean = np.mean(sample.samples['p'], axis=0) # Crude test - assert mean[1] - mean[0] > .2 + assert mean[1] > mean[0] diff --git a/tests/functional/test_inference.py b/tests/functional/test_inference.py index dee8da00..162d5db6 100644 --- a/tests/functional/test_inference.py +++ b/tests/functional/test_inference.py @@ -57,7 +57,7 @@ def test_rejection_with_quantile(): check_inference_with_informative_data(res.samples, N, true_params) # Check that there are no repeating values indicating a seeding problem - assert len(np.unique(res.discrepancy)) == N + assert len(np.unique(res.discrepancies)) == N assert res.accept_rate == quantile assert res.n_sim == int(N/quantile) @@ -76,7 +76,7 @@ def test_rejection_with_threshold(): assert res.threshold <= t # Test that we got unique samples (no repeating of batches). 
- assert len(np.unique(res.discrepancy)) == N + assert len(np.unique(res.discrepancies)) == N @pytest.mark.usefixtures('with_all_clients') diff --git a/tests/functional/test_post_processing.py b/tests/functional/test_post_processing.py index f5e47031..35060856 100644 --- a/tests/functional/test_post_processing.py +++ b/tests/functional/test_post_processing.py @@ -38,7 +38,7 @@ def test_single_parameter_linear_adjustment(): sigma1 = (1/sigma0**2 + n/sigma**2)**(-0.5) # Model - m = elfi.ElfiModel(set_current=False) + m = elfi.ElfiModel() elfi.Prior('norm', mu0, sigma0, model=m, name='mu') elfi.Simulator(sim_fn, m['mu'], observed=y_obs, name='Gauss') elfi.Summary(lambda x: x.mean(axis=1), m['Gauss'], name='S1') @@ -74,7 +74,7 @@ def test_nonfinite_values(): sigma1 = (1/sigma0**2 + n/sigma**2)**(-0.5) # Model - m = elfi.ElfiModel(set_current=False) + m = elfi.ElfiModel() elfi.Prior('norm', mu0, sigma0, model=m, name='mu') elfi.Simulator(sim_fn, m['mu'], observed=y_obs, name='Gauss') elfi.Summary(lambda x: x.mean(axis=1), m['Gauss'], name='S1') diff --git a/tests/functional/test_serialization.py b/tests/functional/test_serialization.py index c0b4fa17..ac840cd9 100644 --- a/tests/functional/test_serialization.py +++ b/tests/functional/test_serialization.py @@ -2,6 +2,7 @@ import numpy as np +from elfi.model.elfi_model import ComputationContext from elfi.examples import ma2 from elfi.client import ClientBase from elfi.executor import Executor @@ -26,7 +27,7 @@ def test_pickle_ma2(): def test_pickle_ma2_compiled_and_loaded(ma2): compiled = ClientBase.compile(ma2.source_net, ['d']) - loaded = ClientBase.load_data(compiled, ma2.computation_context, 0) + loaded = ClientBase.load_data(compiled, ComputationContext(), 0) np.random.seed(0) result = Executor.execute(loaded) diff --git a/tests/unit/test_bo.py b/tests/unit/test_bo.py index e6b10620..45287f07 100644 --- a/tests/unit/test_bo.py +++ b/tests/unit/test_bo.py @@ -40,7 +40,7 @@ def test_BO(ma2): assert bo.n_precomputed_evidence == n_init assert bo.n_initial_evidence == n_init - assert np.array_equal(bo.target_model._gp.X[:n_init, 0], res_init.samples_list[0]) + assert np.array_equal(bo.target_model._gp.X[:n_init, 0], res_init.samples_array[:,0]) @pytest.mark.usefixtures('with_all_clients') diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index ca0bbd24..a5ff59dd 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -10,8 +10,8 @@ def test_batch_handler(simple_model): m = simple_model - m.computation_context = elfi.ComputationContext(seed=123, batch_size=10) - batches = elfi.client.BatchHandler(m, 'k2') + computation_context = elfi.ComputationContext(seed=123, batch_size=10) + batches = elfi.client.BatchHandler(m, computation_context, 'k2') batches.submit() out0, i0 = batches.wait_next() diff --git a/tests/unit/test_methods.py b/tests/unit/test_methods.py index 089ab0a3..6e12c2e4 100644 --- a/tests/unit/test_methods.py +++ b/tests/unit/test_methods.py @@ -15,8 +15,8 @@ def test_no_model_parameters(simple_model): @pytest.mark.usefixtures('with_all_clients') -def test_smc_prior_use(ma2): - thresholds = [.5] +def test_smc(ma2): + thresholds = [.5, .2] N = 1000 smc = elfi.SMC(ma2['d'], batch_size=20000) res = smc.sample(N, thresholds=thresholds) @@ -24,8 +24,21 @@ def test_smc_prior_use(ma2): # Test that the density is uniform assert np.allclose(dens, dens[0]) + # Test that weighted mean is computed + w = res.weights + assert w is not None -# very superficial test to compensate for test_inference.test_BOLFI not 
being run on Travis + samples = res.samples_array + means = res.sample_means_array + assert np.allclose(means, np.average(samples, 0, w)) + assert not np.allclose(means, np.mean(samples, 0)) + + # Test result summaries + res.summary() + res.summary_sample_means_all_populations() + + +# A superficial test to compensate for test_inference.test_BOLFI not being run on Travis @pytest.mark.usefixtures('with_all_clients') def test_BOLFI_short(ma2, distribution_test): @@ -51,8 +64,8 @@ def test_BOLFI_short(ma2, distribution_test): n_samples = 10 n_chains = 2 res_sampling = bolfi.sample(n_samples, n_chains=n_chains) - assert len(res_sampling.samples_list) == 2 - assert len(res_sampling.samples_list[0]) == n_samples//2 * n_chains + assert res_sampling.samples_array.shape[1] == 2 + assert len(res_sampling.samples_array) == n_samples//2 * n_chains # check the cached predictions for RBF x = np.random.random((1, 2)) diff --git a/tests/unit/test_results.py b/tests/unit/test_results.py index 72e488e4..086f33e1 100644 --- a/tests/unit/test_results.py +++ b/tests/unit/test_results.py @@ -1,16 +1,15 @@ import pytest from elfi.methods.results import * -from elfi.methods.posteriors import BolfiPosterior -def test_Result(): +def test_sample(): n_samples = 10 parameter_names = ['a', 'b'] distance_name = 'dist' samples = [np.random.random(n_samples), np.random.random(n_samples), np.random.random(n_samples)] outputs = dict(zip(parameter_names + [distance_name], samples)) - result = Sample(method_name="TestRes", + sample = Sample(method_name="TestRes", outputs=outputs, parameter_names=parameter_names, discrepancy_name=distance_name, @@ -19,22 +18,26 @@ def test_Result(): n_sim=0, ) - assert result.method_name == "TestRes" - assert hasattr(result, 'samples') - assert result.n_samples == n_samples - assert result.n_params == len(parameter_names) + assert sample.method_name == "TestRes" + assert hasattr(sample, 'samples') + assert sample.n_samples == n_samples + assert sample.dim == len(parameter_names) - assert np.allclose(samples[:-1], result.samples_list) - assert np.allclose(samples[-1], result.discrepancy) + assert np.allclose(samples[0], sample.samples_array[:,0]) + assert np.allclose(samples[1], sample.samples_array[:,1]) + assert np.allclose(samples[-1], sample.discrepancies) - assert hasattr(result, 'something') - assert result.something_else == 'y' + assert hasattr(sample, 'something') + assert sample.something_else == 'y' with pytest.raises(AttributeError): - result.not_here + sample.not_here + # Test summary + sample.summary() -def test_ResultBOLFI(): + +def test_bolfi_sample(): n_chains = 3 n_iters = 10 warmup = 5 @@ -55,7 +58,7 @@ def test_ResultBOLFI(): assert hasattr(result, 'chains') assert hasattr(result, 'outputs') assert result.n_samples == n_chains * (n_iters - warmup) - assert result.n_params == len(parameter_names) + assert result.dim == len(parameter_names) # verify that chains are merged correctly s0 = np.concatenate([chains[i, warmup:, 0] for i in range(n_chains)]) @@ -65,7 +68,3 @@ def test_ResultBOLFI(): assert hasattr(result, 'something') assert result.something_else == 'y' - - -def test_bolfi_posterior(ma2): - pass \ No newline at end of file diff --git a/tests/unit/test_store.py b/tests/unit/test_store.py index 031b59cf..30e587d3 100644 --- a/tests/unit/test_store.py +++ b/tests/unit/test_store.py @@ -1,4 +1,5 @@ import os +import pickle import numpy as np @@ -6,13 +7,13 @@ from elfi.store import OutputPool, NpyPersistedArray, ArrayPool -# TODO: npy_persisted_array rewriting of data. 
- - def test_npy_persisted_array(): filename = 'test.npy' original = np.random.rand(3, 2) + append = np.random.rand(10, 2) + ones = np.ones((10,2)) + append2 = np.random.rand(23, 2) arr = NpyPersistedArray(filename, truncate=True) arr.append(original) @@ -23,19 +24,29 @@ def test_npy_persisted_array(): # Test appending and reading arr = NpyPersistedArray(filename) - append = np.random.rand(100, 2) arr.append(append) arr.flush() loaded = np.load(filename) assert np.array_equal(np.r_[original, append], loaded) - append2 = np.random.rand(23, 2) arr.append(append2) assert np.array_equal(np.r_[original, append, append2], arr[:]) - arr.close() + arr.flush() loaded = np.load(filename) assert np.array_equal(np.r_[original, append, append2], loaded) + # Test rewriting + arr[3:13, :] = ones + arr.close() + loaded = np.load(filename) + assert np.array_equal(np.r_[original, ones, append2], loaded) + + # Test pickling + arr = NpyPersistedArray(filename) + serialized = pickle.dumps(arr) + arr = pickle.loads(serialized) + assert np.array_equal(np.r_[original, ones, append2], arr[:]) + # Test truncate method arr = NpyPersistedArray(filename) arr.truncate(len(original)) @@ -60,23 +71,36 @@ def test_array_pool(ma2): bs = 100 total = 1000 rej_pool = elfi.Rejection(ma2['d'], batch_size=bs, pool=pool) - rej_pool.sample(N, n_sim=total) + means = rej_pool.sample(N, n_sim=total).sample_means_array - assert len(pool['MA2']) == total/bs - assert len(pool['S1']) == total/bs - assert not 't1' in pool + assert len(pool.stores['MA2']) == total/bs + assert len(pool.stores['S1']) == total/bs + assert len(pool) == total/bs + assert not 't1' in pool.stores # Test against in memory pool with using batches pool2 = OutputPool(['MA2', 'S1']) rej = elfi.Rejection(ma2['d'], batch_size=bs, pool=pool2, seed=pool.seed) rej.sample(N, n_sim=total) for bi in range(int(total/bs)): - assert np.array_equal(pool['S1'][bi], pool2['S1'][bi]) + assert np.array_equal(pool.stores['S1'][bi], pool2.stores['S1'][bi]) # Test running the inference again rej_pool.sample(N, n_sim=total) + # Test using the same pool with another sampler + rej_pool_new = elfi.Rejection(ma2['d'], batch_size=bs, pool=pool) + assert len(pool) == total/bs + assert np.array_equal(means, rej_pool_new.sample(N, n_sim=total).sample_means_array) + + # Test closing and opening the pool + pool.close() + pool = ArrayPool.open(pool.name) + assert len(pool) == total/bs + + # Test removing the pool pool.delete() - assert not os.path.exists(pool.path) + assert not os.path.exists(pool.arraypath) + diff --git a/tests/unit/test_tools.py b/tests/unit/test_tools.py index 5e144787..6500c7a7 100644 --- a/tests/unit/test_tools.py +++ b/tests/unit/test_tools.py @@ -49,6 +49,7 @@ def test_vectorized_pickling(): def test_external_operation(): # Note that the test string has intentionally not uniform formatting with spaces + elfi.new_model() op = elfi.tools.external_operation('echo 1 {0} 4 5 6 {seed}') constant = elfi.Constant(123) simulator = elfi.Simulator(op, constant) From eb48b9b095e2200e1f7acbc6f448ff39f88c4145 Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Fri, 14 Jul 2017 09:23:47 +0300 Subject: [PATCH 4/9] Fix (#205) --- CHANGELOG.rst | 4 +++ elfi/model/elfi_model.py | 64 +++++++++++++++++++++++++++------------- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2f90d889..730f7007 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,10 @@ Changelog ========== +dev +--- + +- Fix elfi.Prior and NoneType error #203 dev --- diff 
--git a/elfi/model/elfi_model.py b/elfi/model/elfi_model.py index 4249ee5c..656c72d4 100644 --- a/elfi/model/elfi_model.py +++ b/elfi/model/elfi_model.py @@ -1,3 +1,4 @@ +import logging import copy import inspect import re @@ -13,6 +14,8 @@ from elfi.store import OutputPool from elfi.utils import scipy_from_str, observed_name, random_seed +logger = logging.getLogger(__name__) + __all__ = ['ElfiModel', 'ComputationContext', 'NodeReference', 'Constant', 'Operation', 'RandomVariable', 'Prior', 'Simulator', 'Summary', 'Discrepancy', 'Distance', @@ -479,33 +482,52 @@ def _give_name(self, name, model): name = self._new_name(name[:-1], model) return name + try: + name = self._inspect_name() + except: + logger.warning("Automatic name inspection failed, using a random name " + "instead. This may be caused by using an interactive Python " + "shell. You can provide a name parameter e.g. " + "elfi.Prior('uniform', name='nodename') to suppress this " + "warning.") + name = None + + if name is None or model.has_node(name): + name = self._new_name(model=model) + + return name + + def _inspect_name(self): + """Magic method in trying to infer the name from the code. + + Does not work in interactive python shell.""" + # Test if context info is available and try to give the same name as the variable # Please note that this is only a convenience method which is not guaranteed to # work in all cases. If you require a specific name, pass the name argument. frame = inspect.currentframe() - if frame: - # Frames are available - # Take the callers frame - frame = frame.f_back.f_back + if frame is None: + return None + + # Frames are available + # Take the callers frame + frame = frame.f_back.f_back.f_back + info = inspect.getframeinfo(frame, 1) + + # Skip super calls to find the assignment frame + while re.match('\s*super\(', info.code_context[0]): + frame = frame.f_back info = inspect.getframeinfo(frame, 1) - # Skip super calls to find the assignment frame - while re.match('\s*super\(', info.code_context[0]): - frame = frame.f_back - info = inspect.getframeinfo(frame, 1) - - # Match simple direct assignment with the class name, no commas or semicolons - # Also do not accept a name starting with an underscore - rex = '\s*([^\W_][\w]*)\s*=\s*\w?[\w\.]*{}\('.format(self.__class__.__name__) - match = re.match(rex, info.code_context[0]) - if match: - name = match.groups()[0] - # Return the same name as the assgined reference - if not model.has_node(name): - return name - - # Inspecting the name failed, return a random name - return self._new_name(model=model) + # Match simple direct assignment with the class name, no commas or semicolons + # Also do not accept a name starting with an underscore + rex = '\s*([^\W_][\w]*)\s*=\s*\w?[\w\.]*{}\('.format(self.__class__.__name__) + match = re.match(rex, info.code_context[0]) + if match: + name = match.groups()[0] + return name + else: + return None def _new_name(self, basename='', model=None): model = model or self.model From 273fe4886f352a9ef63769a1e98e423bc7918278 Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Tue, 18 Jul 2017 11:07:36 +0300 Subject: [PATCH 5/9] Further pool improvements (#206) * WIP * Further refactorings to pools and stores * Small changes * Allow the pool folder to be moved * Few last changes * Review changes --- elfi/methods/utils.py | 14 +- elfi/model/elfi_model.py | 73 +++--- elfi/store.py | 491 +++++++++++++++++++++++++-------------- tests/unit/test_store.py | 36 ++- 4 files changed, 390 insertions(+), 224 deletions(-) diff --git 
a/elfi/methods/utils.py b/elfi/methods/utils.py index b1516d29..600e04cf 100644 --- a/elfi/methods/utils.py +++ b/elfi/methods/utils.py @@ -260,8 +260,6 @@ def __init__(self, model): self.dim = len(self.parameter_names) self.client = Client() - self.context = ComputationContext() - # Prepare nets for the pdf methods self._pdf_node = augmenter.add_pdf_nodes(model, log=False)[0] self._logpdf_node = augmenter.add_pdf_nodes(model, log=True)[0] @@ -272,11 +270,9 @@ def __init__(self, model): def rvs(self, size=None, random_state=None): random_state = random_state or np.random + context = ComputationContext(size or 1, get_sub_seed(random_state, 0)) - self.context.batch_size = size or 1 - self.context.seed = get_sub_seed(random_state, 0) - - loaded_net = self.client.load_data(self._rvs_net, self.context, batch_index=0) + loaded_net = self.client.load_data(self._rvs_net, context, batch_index=0) batch = self.client.compute(loaded_net) rvs = np.column_stack([batch[p] for p in self.parameter_names]) @@ -304,8 +300,10 @@ def _evaluate_pdf(self, x, log=False): x = x.reshape((-1, self.dim)) batch = self._to_batch(x) - self.context.batch_size = len(x) - loaded_net = self.client.load_data(net, self.context, batch_index=0) + # TODO: we could add a seed value that would load a "random state" instance + # throwing an error if it is used, for instance seed="not used". + context = ComputationContext(len(x), seed=0) + loaded_net = self.client.load_data(net, context, batch_index=0) # Override for k, v in batch.items(): loaded_net.node[k] = {'output': v} diff --git a/elfi/model/elfi_model.py b/elfi/model/elfi_model.py index 656c72d4..2bcb08f8 100644 --- a/elfi/model/elfi_model.py +++ b/elfi/model/elfi_model.py @@ -4,8 +4,8 @@ import re import uuid from functools import partial +import logging -import numpy as np import scipy.spatial import elfi.client @@ -22,6 +22,9 @@ 'get_current_model', 'set_current_model', 'new_model'] +logger = logging.getLogger(__name__) + + """This module contains the classes for creating generative models in ELFI. The class that describes the generative model is named `ElfiModel`.""" @@ -50,8 +53,8 @@ def set_current_model(model=None): _current_model = model -def new_model(set_current=True): - model = ElfiModel() +def new_model(name=None, set_current=True): + model = ElfiModel(name=name) if set_current: set_current_model(model) return model @@ -63,7 +66,7 @@ def random_name(length=4, prefix=''): # TODO: move to another file class ComputationContext: - """ + """Container object for key components for consistent computation results. Attributes ---------- @@ -73,6 +76,9 @@ class ComputationContext: num_submissions : int Number of submissions using this context. + Notes + ----- + The attributes are immutable. """ def __init__(self, batch_size=None, seed=None, pool=None): @@ -82,47 +88,49 @@ def __init__(self, batch_size=None, seed=None, pool=None): ---------- batch_size : int seed : int, None, 'global' - When None generates a random integer seed. When `'global'` uses the global numpy random state. Only - recommended for debugging - observed : dict + When None generates a random integer seed. When `'global'` uses the global + numpy random state. 
Only recommended for debugging pool : elfi.OutputPool """ - self.batch_size = batch_size or 1 - # Synchronize the seed with the pool - if seed is None: - if pool is not None and pool.seed: + # Check pool context + if pool is not None and pool.has_context: + if batch_size is None: + batch_size = pool.batch_size + elif batch_size != pool.batch_size: + raise ValueError('Pool batch_size differs from the given batch_size!') + + if seed is None: seed = pool.seed - else: - seed = random_seed() + elif seed != pool.seed: + raise ValueError('Pool seed differs from the given seed!') - self.seed = seed + self._batch_size = batch_size or 1 + self._seed = random_seed() if seed is None else seed + self._pool = pool # Count the number of submissions from this context self.num_submissions = 0 - # Must be the last one to be set because uses this context in initialization - self.pool = pool + + if pool is not None and not pool.has_context: + self._pool.set_context(self) @property def pool(self): return self._pool - @pool.setter - def pool(self, pool): - if pool is not None: - if not pool.context_set: - pool.set_context(self) - if pool.seed != self.seed: - raise ValueError('Pool seed differs from the given seed!') - self._pool = pool + @property + def batch_size(self): + return self._batch_size - def callback(self, batch, batch_index): - if self.pool is not None: - self.pool.add_batch(batch, batch_index) + @property + def seed(self): + return self._seed - def copy(self): - return copy.copy(self) + def callback(self, batch, batch_index): + if self._pool is not None: + self._pool.add_batch(batch, batch_index) class ElfiModel(GraphicalModel): @@ -195,14 +203,11 @@ def generate(self, batch_size=1, outputs=None, with_values=None): if not isinstance(outputs, list): raise ValueError('Outputs must be a list of node names') - context = ComputationContext() - # Use the global random_state - context.seed = 'global' - context.batch_size = batch_size + pool = None if with_values is not None: pool = OutputPool(with_values.keys()) pool.add_batch(with_values, 0) - context.pool = pool + context = ComputationContext(batch_size, seed='global', pool=pool) client = elfi.client.get_client() compiled_net = client.compile(self.source_net, outputs) diff --git a/elfi/store.py b/elfi/store.py index 36439c84..1c511a58 100644 --- a/elfi/store.py +++ b/elfi/store.py @@ -1,3 +1,4 @@ +import logging import os import io import shutil @@ -6,6 +7,11 @@ import numpy as np import numpy.lib.format as npformat +logger = logging.getLogger(__name__) + + +_default_prefix = 'pools' + class OutputPool: """Store node outputs to dictionary-like stores. @@ -14,12 +20,18 @@ class OutputPool: Notes ----- - See the `elfi.store.BatchStore` interface if you wish to implement your own ELFI - compatible store. + Saving the store requires that all the stores are pickleable. + + Arbitrary objects that support simple array indexing can be used as stores by using + the `elfi.store.ArrayObjectStore` class. + + See the `elfi.store.StoreBase` interfaces if you wish to implement your own ELFI + compatible store. Basically any object that fulfills the Pythons dictionary + api will work as a store in the pool. """ - def __init__(self, outputs=None): + def __init__(self, outputs=None, name=None, prefix=None): """ Depending on the algorithm, some of these values may be reused @@ -33,6 +45,11 @@ def __init__(self, outputs=None): outputs : list, dict, optional list of node names which to store or a dictionary with existing stores. The stores are created on demand. 
+ name : str, optional + Name of the pool. Used to open a saved pool from disk. + prefix : str, optional + Path to directory under which `elfi.ArrayPool` will place its folder. + Default is a relative path ./pools. Returns ------- @@ -52,13 +69,28 @@ def __init__(self, outputs=None): self.batch_size = None self.seed = None + self.name = name + self.prefix = prefix or _default_prefix + if self.path and os.path.exists(self.path): + raise ValueError("A pool with this name already exists in {}. You can use " + "OutputPool.open() to open it.".format(self.prefix)) + @property - def context_set(self): + def output_names(self): + return list(self.stores.keys()) + + @property + def has_context(self): return self.seed is not None and self.batch_size is not None def set_context(self, context): - """Sets the context of the pool for identifying the batch size and seed for which - these results are computed. + """Sets the context of the pool. + + The pool needs to know the batch_size and the seed. + + Notes + ----- + Also sets the name of the pool if not set already. Parameters ---------- @@ -68,19 +100,22 @@ def set_context(self, context): ------- None """ - if self.context_set: + if self.has_context: raise ValueError('Context is already set') self.batch_size = context.batch_size self.seed = context.seed - def get_batch(self, batch_index, outputs=None): + if self.name is None: + self.name = "{}_{}".format(self.__class__.__name__.lower(), self.seed) + + def get_batch(self, batch_index, output_names=None): """Returns a batch from the stores of the pool. Parameters ---------- batch_index : int - outputs : list + output_names : list which outputs to include to the batch Returns @@ -88,9 +123,9 @@ def get_batch(self, batch_index, outputs=None): batch : dict """ - outputs = outputs or self.outputs + output_names = output_names or self.output_names batch = dict() - for output in outputs: + for output in output_names: store = self.stores[output] if store is None: continue @@ -117,53 +152,53 @@ def remove_batch(self, batch_index): if batch_index in store: del store[batch_index] - def has_store(self, name): - return name in self.stores + def has_store(self, node): + return node in self.stores - def get_store(self, name): - return self.stores[name] + def get_store(self, node): + return self.stores[node] - def add_store(self, name, store=None): - """Adds a store object for a node with name `name`. + def add_store(self, node, store=None): + """Adds a store object for the node. 
Parameters ---------- - name : str - store : dict, BatchStore + node : str + store : dict, StoreBase, optional Returns ------- None """ - if name in self.stores and self.stores[name] is not None: - raise ValueError("Store for '{}' already exists".format(name)) + if node in self.stores and self.stores[node] is not None: + raise ValueError("Store for '{}' already exists".format(node)) - store = store or self._make_store_for(name) - self[name] = store + store = store if store is not None else self._make_store_for(node) + self.stores[node] = store - def remove_store(self, name): + def remove_store(self, node): """Removes a store from the pool Parameters ---------- - name : str - Store name + node : str Returns ------- store The removed store """ - store = self.stores.pop(name) + store = self.stores.pop(node) return store - def _get_store_for(self, name): - if self.stores[name] is None: - self.stores[name] = self._make_store_for(name) - return self.stores[name] + def _get_store_for(self, node): + """Gets or makes a store.""" + if self.stores[node] is None: + self.stores[node] = self._make_store_for(node) + return self.stores[node] - def _make_store_for(self, name): + def _make_store_for(self, node): """Make a default store for a node All the default stores will be created through this method. @@ -194,140 +229,186 @@ def clear(self): for store in self.stores.values(): store.clear() - @property - def outputs(self): - return list(self.stores.keys()) + def save(self): + """Save the pool to disk. + This will use pickle to store the pool under self.path. + """ + if not self.has_context: + raise ValueError("Pool context is not set, cannot save. Please see the " + "set_context method.") -# TODO: Make it easier to load ArrayPool with just a name. -# we could store the context to the pool folder, and drop the use of a seed in the -# folder name -class ArrayPool(OutputPool): - """Store node outputs to .npy arrays. + os.makedirs(self.path, exist_ok=True) - The default store for output data is a numpy binary `.npy` file, that stores array - data. Separate files will be created for different nodes. + # Pickle the stores separately + for node, store in self.stores.items(): + filename = os.path.join(self.path, node + '.pkl') + try: + pickle.dump(store, open(filename, 'wb')) + except: + raise IOError('Failed to pickle the store for node {}, please check that ' + 'it is pickleable or remove it before saving.'.format(node)) + + # Save the pool itself with stores replaced with Nones + stores = self.stores + self.stores = dict.fromkeys(stores.keys()) + filename = os.path.join(self.path, self._get_pkl_name()) + pickle.dump(self, open(filename, "wb")) + # Restore the original to the object + self.stores = stores - Notes - ----- + def close(self): + """Save and close the stores that support it. - Internally the `elfi.ArrayPool` will create an `elfi.store.BatchArrayStore' object - wrapping a `NpyPersistedArray` for each output. The `elfi.store.NpyPersistedArray` - object is responsible for managing the `.npy` file. - """ + The pool will not be usable afterwards.""" + self.save() + + for store in self.stores.values(): + if hasattr(store, 'close'): + store.close() + + def flush(self): + """Flushes all data from the stores. - def __init__(self, outputs, name=None, path=None): + If the store does not support flushing, does nothing. 
""" + for store in self.stores.values(): + if hasattr(store, 'flush'): + store.flush() + + def delete(self): + """Remove all persisted data from disk.""" + for store in self.stores.values(): + if hasattr(store, 'close'): + store.close() + + if self.path is None: + return + elif not os.path.exists(self.path): + return + + shutil.rmtree(self.path) + + @classmethod + def open(cls, name, prefix=None): + """Open a closed or saved ArrayPool from disk. Parameters ---------- - outputs : list - name of nodes whose output to store to a numpy .npy file. name : str - Name of the pool. This will be part of the path where the data are stored. - path : str - Path to directory under which `elfi.ArrayPool` will place its folders and - files. Default is ./pools, where . is the current working directory. - + prefix : str, optional + Returns ------- - instance : ArrayPool + """ - super(ArrayPool, self).__init__(outputs) + prefix = prefix or _default_prefix + path = cls._make_path(name, prefix) + filename = os.path.join(path, cls._get_pkl_name()) + + pool = pickle.load(open(filename, "rb")) + + # Load the stores. Change the working directory temporarily so that pickled stores + # can find their data dependencies even if the folder has been renamed. + cwd = os.getcwd() + os.chdir(path) + for node in list(pool.stores.keys()): + filename = node + '.pkl' + try: + store = pickle.load(open(filename, 'rb')) + except Exception as e: + logger.warning('Failed to load the store for node {}. Reason: {}' + .format(node, str(e))) + del pool.stores[node] + continue + pool.stores[node] = store + os.chdir(cwd) - if name is not None: - # TODO: load the pool with this name - pass - self.name = name - self.path = path or self._default_path() - os.makedirs(self.path, exist_ok=True) + # Update the name and prefix in case the pool folder was moved + pool.name = name + pool.prefix = prefix + return pool + + @classmethod + def _make_path(cls, name, prefix): + return os.path.join(prefix, name) @property - def arraypath(self): - """Path to where the array files are stored. - - Returns - ------- - path : str - """ + def path(self): if self.name is None: return None - return self._arraypath(self.name, self.path) - def _make_store_for(self, name): - if not self.context_set: - raise ValueError('Arraypool has no context set') - if self.name is None: - self.name = 'arraypool_{}'.format(self.seed) - os.makedirs(self.arraypath) + return self._make_path(self.name, self.prefix) - filename = os.path.join(self.arraypath, name) - array = NpyPersistedArray(filename) - return BatchArrayStore(array, self.batch_size) + @classmethod + def _get_pkl_name(cls): + return '_outputpool.pkl' - def delete(self): - """Removes the folder and all the data in this pool.""" - if self.arraypath is None: - return - self.close() - shutil.rmtree(self.arraypath) +class ArrayPool(OutputPool): + """OutputPool that uses binary .npy files as default stores. - def close(self): - """Closes the array files of the stores and store the pool to the disk. + The default store medium for output data is a NumPy binary `.npy` file for NumPy + array data. You can however also add other types of stores as well. - You can reopen the pool with ArrayPool.open. - """ - for store in self.stores.values(): - if hasattr(store, 'array') and hasattr(store.array, 'close'): - store.array.close() - try: - filename = os.path.join(self.arraypath, self._pkl_name()) - pickle.dump(self, open(filename, "wb" ) ) - except: - raise ValueError('Pickling of the pool object failed. 
Please check that your ' - 'stores and data are pickelable. Arrays were stored to disk ' - 'succesfully.') + Notes + ----- + The default store is elfi.store.NpyStore that uses NpyArrays as stores. The latter is + a wrapper over NumPy .npy binary file for array data. - def flush(self): - """Flushes all array files of the stores.""" - for store in self.stores.values(): - if hasattr(store, 'array') and hasattr(store.array, 'flush'): - store.array.flush() + """ - @classmethod - def open(cls, name, path=None): - """Open a closed ArrayPool from disk + def _make_store_for(self, node): + if not self.has_context: + raise ValueError('ArrayPool has no context set') + + # Make the directory for the array pools + os.makedirs(self.path, exist_ok=True) + + filename = os.path.join(self.path, node) + return NpyStore(filename, self.batch_size) + + def load_npy_file(self, node, n_batches=None): + """ Parameters ---------- - name : str - path : str, optional - - Returns - ------- + node : str + The node.npy file must be in self.path + n_batches : int, optional + How many batches to load. Default is as many as the array has length. """ - path = path or cls._default_path() - filename = os.path.join(cls._arraypath(name, path), cls._pkl_name()) - return pickle.load(open(filename, "rb" )) - @classmethod - def _pkl_name(cls): - return cls.__name__.lower() + '.pkl' + filename = os.path.join(self.path, node + '.npy') + if not os.path.exists(filename): + raise FileNotFoundError("Could not find file {}".format(filename)) - @classmethod - def _arraypath(cls, name, path): - return os.path.join(path, name) + self.add_store(node) + store = self.get_store(node) + + if n_batches is None: + if len(store.array) % self.batch_size != 0: + self.remove_store(node) + raise ValueError('The array length is not divisible by the batch size') + n_batches = len(store.array) // self.batch_size + else: + store.array.truncate(n_batches*self.batch_size) + + store.n_batches = n_batches - @classmethod - def _default_path(cls): - return os.path.join(os.getcwd(), 'pools') +class StoreBase: + """Base class for output stores for the pools. + Stores store the outputs of a single node in ElfiModel. This is a subset of the + Python dictionary api. -class BatchStore: - """Stores batches for a single node""" + Notes + ----- + Any dictionary like object will work directly as an ELFI store. + + """ def __getitem__(self, batch_index): raise NotImplementedError @@ -348,12 +429,29 @@ def clear(self): """Remove all batches from the store""" raise NotImplementedError + def close(self): + """Close the store. + + Optional method. Useful for closing i.e. file streams""" + pass + + def flush(self): + """Flush the store + + Optional to implement. + """ + pass + -# TODO: add mask for missing items. It should replace the use of `current_index`. +# TODO: add mask for missing items. It should replace the use of `n_batches`. # This should make it possible to also append further than directly to the end -# of current index or length of the array. -class BatchArrayStore(BatchStore): - """Helper class to use arrays as data stores in ELFI""" +# of current n_batches index. +class ArrayStore(StoreBase): + """Convert any array object to ELFI store to be used within a pool. + + This class is intended to make it easy to use objects that support array indexing + as outputs stores for nodes. 
+ """ def __init__(self, array, batch_size, n_batches=0): """ @@ -371,49 +469,37 @@ def __init__(self, array, batch_size, n_batches=0): self.batch_size = batch_size self.n_batches = n_batches - def __contains__(self, batch_index): - b = self._to_slice(batch_index).stop - return batch_index < self.n_batches and b <= len(self.array) - def __getitem__(self, batch_index): sl = self._to_slice(batch_index) return self.array[sl] def __setitem__(self, batch_index, data): + if batch_index > self.n_batches: + raise IndexError("Appending further than to the end of the store array is " + "currently not supported.") + sl = self._to_slice(batch_index) + if sl.stop > len(self.array): + raise IndexError("There is not enough space left in the store array.") - if batch_index in self: - self.array[sl] = data - elif batch_index == self.n_batches: - # Append a new batch - if sl.stop <= len(self.array): - self.array[sl] = data - elif sl.start == len(self.array) and hasattr(self.array, 'append'): - # NpyPersistedArray supports appending - self.array.append(data) - else: - raise ValueError("There is not enough space in the array") - self.n_batches += 1 - else: - raise ValueError("Appending further than the end of the array is not yet " - "supported") + self.array[sl] = data + self.n_batches += 1 + + def __contains__(self, batch_index): + return batch_index < self.n_batches def __delitem__(self, batch_index): if batch_index not in self: raise IndexError("Cannot remove, batch index {} is not in the array" .format(batch_index)) elif batch_index != self.n_batches: - raise IndexError("It is not yet possible to remove batches from the middle " - "of the array") - - if hasattr(self.array, 'truncate'): - sl = self._to_slice(batch_index) - self.array.truncate(sl.start) + raise IndexError("Removing batches from the middle of the store array is " + "currently not supported.") self.n_batches -= 1 def __len__(self): - return int(len(self.array)/self.batch_size) + return self.n_batches def _to_slice(self, batch_index): a = self.batch_size*batch_index @@ -424,8 +510,50 @@ def clear(self): self.array.clear() self.n_batches = 0 + def flush(self): + if hasattr(self.array, 'flush'): + self.array.flush() + + def close(self): + if hasattr(self.array, 'close'): + self.array.close() + -class NpyPersistedArray: +class NpyStore(ArrayStore): + """Store data to binary .npy files + + Uses the NpyArray objects as an array store. 
+ """ + + def __init__(self, filename, batch_size): + """ + + Parameters + ---------- + filename : str + Path to the file, relative or absolute + batch_size + """ + array = NpyArray(filename) + super(NpyStore, self).__init__(array, batch_size) + + def __setitem__(self, batch_index, data): + sl = self._to_slice(batch_index) + # NpyArray supports appending + if batch_index == self.n_batches and sl.start == len(self.array): + self.array.append(data) + self.n_batches += 1 + return + + super(NpyStore, self).__setitem__(batch_index, data) + + def __delitem__(self, batch_index): + super(NpyStore, self).__delitem__(batch_index) + sl = self._to_slice(batch_index) + self.array.truncate(sl.start) + + +class NpyArray: """ Notes @@ -440,12 +568,12 @@ class NpyPersistedArray: HEADER_DATA_OFFSET = 12 HEADER_DATA_SIZE_OFFSET = 8 - def __init__(self, name, array=None, truncate=False): + def __init__(self, filename, array=None, truncate=False): """ Parameters ---------- - name : str + filename : str File name array : ndarray, optional Initial array @@ -466,18 +594,18 @@ def __init__(self, name, array=None, truncate=False): # being closed on exception and would corrupt the .npy file. self._header_bytes_to_write = None - if name[-4:] != '.npy': - name += '.npy' - self.name = name + if filename[-4:] != '.npy': + filename += '.npy' + self.filename = filename self.fs = None - if truncate is False and os.path.exists(self.name): - self.fs = open(self.name, 'r+b') + if truncate is False and os.path.exists(self.filename): + self.fs = open(self.filename, 'r+b') self._init_from_file_header() else: - self.fs = open(self.name, 'w+b') + self.fs = open(self.filename, 'w+b') - if array: + if array is not None: self.append(array) self.flush() @@ -527,10 +655,15 @@ def append(self, array): self._prepare_header_data() def _init_from_file_header(self): - """Initialize the object from existing file""" + """Initialize the object from an existing file""" self.fs.seek(self.HEADER_DATA_SIZE_OFFSET) - self.shape, fortran_order, self.dtype = npformat.read_array_header_2_0( - self.fs) + try: + self.shape, fortran_order, self.dtype = \ + npformat.read_array_header_2_0(self.fs) + except ValueError: + raise ValueError('Npy file {} header is not 2.0 format. You can make the ' + 'conversion using elfi.store.NpyFile by passing the ' + 'preloaded array as an argument.'.format(self.filename)) self.header_length = self.fs.tell() if fortran_order: @@ -642,7 +775,7 @@ def _prepare_header_data(self): fill_len = self.header_length - h_bytes.tell() if fill_len < 0: raise OverflowError("File {} cannot be appended. The header is too short.". 
- format(self.name)) + format(self.filename)) elif fill_len > 0: h_bytes.write(b'\x20' * fill_len) @@ -676,9 +809,17 @@ def initialized(self): def __getstate__(self): if not self.fs.closed: self.flush() - return {'name': self.name} + return {'filename': self.filename} def __setstate__(self, state): - name = state.pop('name') - self.__init__(name) + filename = state.pop('filename') + basename = os.path.basename(filename) + if os.path.exists(filename): + self.__init__(filename) + elif os.path.exists(basename): + self.__init__(basename) + else: + self.fs = None + raise FileNotFoundError('Could not find the file {}'.format(filename)) + diff --git a/tests/unit/test_store.py b/tests/unit/test_store.py index 30e587d3..86b03d25 100644 --- a/tests/unit/test_store.py +++ b/tests/unit/test_store.py @@ -1,10 +1,11 @@ import os import pickle +import pytest import numpy as np import elfi -from elfi.store import OutputPool, NpyPersistedArray, ArrayPool +from elfi.store import OutputPool, NpyArray, ArrayPool def test_npy_persisted_array(): @@ -15,7 +16,7 @@ def test_npy_persisted_array(): ones = np.ones((10,2)) append2 = np.random.rand(23, 2) - arr = NpyPersistedArray(filename, truncate=True) + arr = NpyArray(filename, truncate=True) arr.append(original) assert np.array_equal(original, arr[:]) arr.close() @@ -23,7 +24,7 @@ def test_npy_persisted_array(): assert np.array_equal(original, loaded) # Test appending and reading - arr = NpyPersistedArray(filename) + arr = NpyArray(filename) arr.append(append) arr.flush() loaded = np.load(filename) @@ -42,13 +43,13 @@ def test_npy_persisted_array(): assert np.array_equal(np.r_[original, ones, append2], loaded) # Test pickling - arr = NpyPersistedArray(filename) + arr = NpyArray(filename) serialized = pickle.dumps(arr) arr = pickle.loads(serialized) assert np.array_equal(np.r_[original, ones, append2], arr[:]) # Test truncate method - arr = NpyPersistedArray(filename) + arr = NpyArray(filename) arr.truncate(len(original)) assert np.array_equal(original, arr[:]) arr.close() @@ -56,7 +57,7 @@ def test_npy_persisted_array(): assert np.array_equal(original, loaded) # Try that truncation in initialization works - arr = NpyPersistedArray(filename, truncate=True) + arr = NpyArray(filename, truncate=True) arr.append(append) arr.close() loaded = np.load(filename) @@ -97,10 +98,31 @@ def test_array_pool(ma2): pool.close() pool = ArrayPool.open(pool.name) assert len(pool) == total/bs + pool.close() + + # Test opening from a moved location + os.rename(pool.path, pool.path + '_move') + pool = ArrayPool.open(pool.name + '_move') + assert len(pool) == total/bs + + # Test adding a nonexistent file + with pytest.raises(FileNotFoundError): + pool.load_npy_file('test') + + # Test adding a random .npy file + r = np.random.rand(3*bs) + newfile = os.path.join(pool.path, 'test.npy') + NpyArray(newfile, r).close() + pool.load_npy_file('test') + assert len(pool.get_store('test')) == 3 + assert np.array_equal(pool[2]['test'], r[-bs:]) # Test removing the pool pool.delete() - assert not os.path.exists(pool.arraypath) + assert not os.path.exists(pool.path) + + # Remove the pool container folder + os.rmdir(pool.prefix) From 44dc2b0e71616c40a6a49546f323e679b45582cd Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Tue, 18 Jul 2017 12:05:08 +0300 Subject: [PATCH 6/9] Easier client switching (#207) * Allow client change by string * Changelog --- CHANGELOG.rst | 5 +---- elfi/client.py | 7 ++++++- tests/conftest.py | 1 - 3 files changed, 7 insertions(+), 6 deletions(-) diff --git 
a/CHANGELOG.rst b/CHANGELOG.rst index 730f7007..d4e8c5a0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,15 +5,12 @@ dev --- - Fix elfi.Prior and NoneType error #203 - -dev ---- - - Fix a bug preventing the reuse of ArrayPool data with a new inference - Added a possibility to pickle ArrayPool - Added ArrayPool.open to read a closed pool from disk - Refactored Sample and SmcSample classes - Added elfi.new_model method +- Made elfi.set_client method to accept clients as strings. 0.6 (2017-07-03) diff --git a/elfi/client.py b/elfi/client.py index 00087002..e2ad4c3c 100644 --- a/elfi/client.py +++ b/elfi/client.py @@ -1,4 +1,5 @@ import logging +import importlib from types import ModuleType from collections import OrderedDict @@ -9,7 +10,6 @@ ReduceCompiler, RandomStateCompiler from elfi.loader import ObservedLoader, AdditionalNodesLoader, RandomStateLoader, \ PoolLoader -from elfi.store import OutputPool logger = logging.getLogger(__name__) @@ -31,6 +31,11 @@ def get_client(): def set_client(client=None): """Set the current ELFI client instance.""" global _client + + if isinstance(client, str): + m = importlib.import_module('elfi.clients.{}'.format(client)) + client = m.Client() + _client = client diff --git a/tests/conftest.py b/tests/conftest.py index 9d775dc5..5edb104a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -188,6 +188,5 @@ def run(distribution, *args, rvs=None, **kwargs): if hasattr(distribution, 'gradient_logpdf'): glpdf_none, glpdf1, glpdf2 = test_non_rvs_attr('gradient_logpdf', distribution, rvs, *args, **kwargs) - return run From 045d8e2f2a7d3d6418bc2f5477174359e84a9b01 Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Tue, 18 Jul 2017 13:36:35 +0300 Subject: [PATCH 7/9] Arraystore improvements and fixes (#208) * Add tests for stores * Bug fixes from ArrayStore * Some minor refactorings * Review change * Changelog * Update CHANGELOG.rst --- CHANGELOG.rst | 9 +- elfi/store.py | 141 ++++++++++++---------- tests/functional/test_randomness.py | 3 - tests/functional/test_simulation_reuse.py | 41 ++++++- tests/unit/test_store.py | 119 ++++++++++++++++-- 5 files changed, 234 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d4e8c5a0..35b37414 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,11 +6,14 @@ dev - Fix elfi.Prior and NoneType error #203 - Fix a bug preventing the reuse of ArrayPool data with a new inference -- Added a possibility to pickle ArrayPool -- Added ArrayPool.open to read a closed pool from disk +- Added pickling for OutputPool:s +- Added OutputPool.open to read a closed pool from disk - Refactored Sample and SmcSample classes - Added elfi.new_model method -- Made elfi.set_client method to accept clients as strings. +- Made elfi.set_client method to accept clients as strings for easier client switching +- Fixed a bug in NpyArray that would lead to an inconsistent state if multiple + simultaneous instances were opened. +- Added the ability to move the pool data folder 0.6 (2017-07-03) diff --git a/elfi/store.py b/elfi/store.py index 1c511a58..47ecfa0f 100644 --- a/elfi/store.py +++ b/elfi/store.py @@ -31,6 +31,8 @@ class OutputPool: """ + _pkl_name = '_outputpool.pkl' + def __init__(self, outputs=None, name=None, prefix=None): """ @@ -240,19 +242,24 @@ def save(self): os.makedirs(self.path, exist_ok=True) + # Change the working directory so that relative paths to the pool data folder can + # be reliably used. This allows moving and renaming of the folder. 
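# Illustration (not a patch line): with the working directory switched to the
# pool folder, each store below is pickled under a bare relative filename such
# as 'node.pkl'. Together with NpyArray.__setstate__ falling back to
# os.path.basename(filename) when the original path is missing, this is what
# lets a saved pool folder be moved or renamed and still be reopened with
# ArrayPool.open(name).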
+ cwd = os.getcwd() + os.chdir(self.path) # Pickle the stores separately for node, store in self.stores.items(): - filename = os.path.join(self.path, node + '.pkl') + filename = node + '.pkl' try: pickle.dump(store, open(filename, 'wb')) except: raise IOError('Failed to pickle the store for node {}, please check that ' 'it is pickleable or remove it before saving.'.format(node)) + os.chdir(cwd) # Save the pool itself with stores replaced with Nones stores = self.stores self.stores = dict.fromkeys(stores.keys()) - filename = os.path.join(self.path, self._get_pkl_name()) + filename = os.path.join(self.path, self._pkl_name) pickle.dump(self, open(filename, "wb")) # Restore the original to the object self.stores = stores @@ -300,11 +307,12 @@ def open(cls, name, prefix=None): Returns ------- + ArrayPool """ prefix = prefix or _default_prefix path = cls._make_path(name, prefix) - filename = os.path.join(path, cls._get_pkl_name()) + filename = os.path.join(path, cls._pkl_name) pool = pickle.load(open(filename, "rb")) @@ -340,10 +348,6 @@ def path(self): return self._make_path(self.name, self.prefix) - @classmethod - def _get_pkl_name(cls): - return '_outputpool.pkl' - class ArrayPool(OutputPool): """OutputPool that uses binary .npy files as default stores. @@ -353,8 +357,9 @@ class ArrayPool(OutputPool): Notes ----- - The default store is elfi.store.NpyStore that uses NpyArrays as stores. The latter is - a wrapper over NumPy .npy binary file for array data. + The default store is implemented in elfi.store.NpyStore that uses NpyArrays as stores. + The NpyArray is a wrapper over NumPy .npy binary file for array data and supports + appending the .npy file. It uses the .npy format 2.0 files. """ @@ -368,35 +373,6 @@ def _make_store_for(self, node): filename = os.path.join(self.path, node) return NpyStore(filename, self.batch_size) - def load_npy_file(self, node, n_batches=None): - """ - - Parameters - ---------- - node : str - The node.npy file must be in self.path - n_batches : int, optional - How many batches to load. Default is as many as the array has length. - - """ - - filename = os.path.join(self.path, node + '.npy') - if not os.path.exists(filename): - raise FileNotFoundError("Could not find file {}".format(filename)) - - self.add_store(node) - store = self.get_store(node) - - if n_batches is None: - if len(store.array) % self.batch_size != 0: - self.remove_store(node) - raise ValueError('The array length is not divisible by the batch size') - n_batches = len(store.array) // self.batch_size - else: - store.array.truncate(n_batches*self.batch_size) - - store.n_batches = n_batches - class StoreBase: """Base class for output stores for the pools. @@ -451,8 +427,16 @@ class ArrayStore(StoreBase): This class is intended to make it easy to use objects that support array indexing as outputs stores for nodes. + + Attributes + ---------- + array : array-like + The array that the batches are stored to + batch_size : int + n_batches : int + How many batches are available from the underlying array. """ - def __init__(self, array, batch_size, n_batches=0): + def __init__(self, array, batch_size, n_batches=-1): """ Parameters @@ -461,10 +445,16 @@ def __init__(self, array, batch_size, n_batches=0): Any array like object supporting Python list indexing batch_size : int Size of a batch of data - n_batches : int - When using pre allocated arrays, this keeps track of the number of batches - currently stored to the array. + n_batches : int, optional + How many batches should be made available from the array. 
Default is -1 + meaning all available batches. """ + + if n_batches == -1: + if len(array) % batch_size != 0: + logger.warning("The array length is not divisible by the batch size.") + n_batches = len(array) // batch_size + self.array = array self.batch_size = batch_size self.n_batches = n_batches @@ -483,7 +473,9 @@ def __setitem__(self, batch_index, data): raise IndexError("There is not enough space left in the store array.") self.array[sl] = data - self.n_batches += 1 + + if batch_index == self.n_batches: + self.n_batches += 1 def __contains__(self, batch_index): return batch_index < self.n_batches @@ -492,11 +484,13 @@ def __delitem__(self, batch_index): if batch_index not in self: raise IndexError("Cannot remove, batch index {} is not in the array" .format(batch_index)) - elif batch_index != self.n_batches: + elif batch_index != self.n_batches - 1: raise IndexError("Removing batches from the middle of the store array is " "currently not supported.") - self.n_batches -= 1 + # Move the n_batches index down + if batch_index == self.n_batches - 1: + self.n_batches -= 1 def __len__(self): return self.n_batches @@ -518,6 +512,9 @@ def close(self): if hasattr(self.array, 'close'): self.array.close() + def __del__(self): + self.close() + class NpyStore(ArrayStore): """Store data to binary .npy files @@ -525,17 +522,20 @@ class NpyStore(ArrayStore): Uses the NpyArray objects as an array store. """ - def __init__(self, filename, batch_size): + def __init__(self, file, batch_size, n_batches=-1): """ Parameters ---------- - filename : str - Path to the file, relative or absolute + file : NpyArray or str + NpyArray object or path to the .npy file batch_size + n_batches : int, optional + How many batches to make available from the file. Default -1 indicates that + all available batches. 
""" - array = NpyArray(filename) - super(NpyStore, self).__init__(array, batch_size) + array = file if isinstance(file, NpyArray) else NpyArray(file) + super(NpyStore, self).__init__(array, batch_size, n_batches) def __setitem__(self, batch_index, data): sl = self._to_slice(batch_index) @@ -552,6 +552,9 @@ def __delitem__(self, batch_index): sl = self._to_slice(batch_index) self.array.truncate(sl.start) + def delete(self): + self.array.delete() + class NpyArray: """ @@ -598,6 +601,9 @@ def __init__(self, filename, array=None, truncate=False): filename += '.npy' self.filename = filename + if array is not None: + truncate = True + self.fs = None if truncate is False and os.path.exists(self.filename): self.fs = open(self.filename, 'r+b') @@ -610,8 +616,8 @@ def __init__(self, filename, array=None, truncate=False): self.flush() def __getitem__(self, sl): - if self.header_length is None: - raise IndexError() + if not self.initialized: + raise IndexError("NpyArray is not initialized") order = 'F' if self.fortran_order else 'C' # TODO: do not recreate if nothing has changed mmap = np.memmap(self.fs, dtype=self.dtype, shape=self.shape, @@ -619,8 +625,8 @@ def __getitem__(self, sl): return mmap[sl] def __setitem__(self, sl, value): - if self.header_length is None: - raise IndexError() + if not self.initialized: + raise IndexError("NpyArray is not initialized") order = 'F' if self.fortran_order else 'C' mmap = np.memmap(self.fs, dtype=self.dtype, shape=self.shape, offset=self.header_length, order=order) @@ -631,6 +637,7 @@ def __len__(self): @property def size(self): + """Number of items in the array""" return np.prod(self.shape) def append(self, array): @@ -639,15 +646,16 @@ def append(self, array): raise ValueError('Array is not opened.') if not self.initialized: - self._init_from_array(array) + self.init_from_array(array) if array.shape[1:] != self.shape[1:]: - raise ValueError("Appended array is of different shape") + raise ValueError("Appended array is of different shape.") elif array.dtype != self.dtype: - raise ValueError("Appended array is of different dtype") + raise ValueError("Appended array is of different dtype.") # Append new data - self.fs.seek(0, 2) + pos = self.header_length + self.size*self.itemsize + self.fs.seek(pos) self.fs.write(array.tobytes('C')) self.shape = (self.shape[0] + len(array),) + self.shape[1:] @@ -674,7 +682,7 @@ def _init_from_file_header(self): shape = (0,) + self.shape[1:] self.itemsize = np.empty(shape=shape, dtype=self.dtype).itemsize - def _init_from_array(self, array): + def init_from_array(self, array): """Initialize the object from an array. Sets the the header_length so large that it is possible to append to the array. @@ -685,6 +693,10 @@ def _init_from_array(self, array): Contains the oversized header bytes """ + + if self.initialized: + raise ValueError("The array has been initialized already!") + self.shape = (0,) + array.shape[1:] self.dtype = array.dtype self.itemsize = array.itemsize @@ -722,15 +734,16 @@ def truncate(self, length=0): ------- """ - if self.fs is None: - raise ValueError('Array has been deleted') - elif self.fs.closed: - raise ValueError('Array has been closed') + if not self.initialized: + raise ValueError('The array must be initialized before it can be truncated. 
' + 'Please see init_from_array.') + + if self.closed: + raise ValueError('The array has been closed.') # Reset length self.shape = (length,) + self.shape[1:] self._prepare_header_data() - self._write_header_data() self.fs.seek(self.header_length + self.size*self.itemsize) self.fs.truncate() diff --git a/tests/functional/test_randomness.py b/tests/functional/test_randomness.py index 53fd3285..38598fd5 100644 --- a/tests/functional/test_randomness.py +++ b/tests/functional/test_randomness.py @@ -7,9 +7,6 @@ from elfi.utils import get_sub_seed - - - def test_randomness(simple_model): k1 = simple_model['k1'] diff --git a/tests/functional/test_simulation_reuse.py b/tests/functional/test_simulation_reuse.py index b413eea1..c7eb9c53 100644 --- a/tests/functional/test_simulation_reuse.py +++ b/tests/functional/test_simulation_reuse.py @@ -1,11 +1,14 @@ import pytest import time +import os + +import numpy as np import elfi @pytest.mark.parametrize('sleep_model', [.2], indirect=['sleep_model']) -def test_pool(sleep_model): +def test_pool_usage(sleep_model): # Add nodes to the pool pool = elfi.OutputPool(outputs=sleep_model.parameter_names + ['slept', 'summary', 'd']) @@ -45,14 +48,50 @@ def test_pool(sleep_model): assert td < 1.2 +def test_pool_restarts(ma2): + pool = elfi.ArrayPool(['t1', 'd'], name='test') + rej = elfi.Rejection(ma2, 'd', batch_size=10, pool=pool, seed=123) + + rej.sample(1, n_sim=30) + pool.save() + + # Do not save the pool... + rej = elfi.Rejection(ma2, 'd', batch_size=10, pool=pool) + rej.set_objective(3, n_sim=60) + while not rej.finished: + rej.iterate() + # ...but just flush the array content + pool.get_store('t1').array.fs.flush() + pool.get_store('d').array.fs.flush() + + assert(len(pool)==6) + assert(len(pool.stores['t1'].array)==60) + pool2 = elfi.ArrayPool.open('test') + assert(len(pool2)==3) + assert(len(pool2.stores['t1'].array)==30) + rej = elfi.Rejection(ma2, 'd', batch_size=10, pool=pool2) + s9pool = rej.sample(3, n_sim=90) + pool2.save() + pool2 = elfi.ArrayPool.open('test') + rej = elfi.Rejection(ma2, 'd', batch_size=10, pool=pool2) + s9pool_loaded = rej.sample(3, n_sim=90) + rej = elfi.Rejection(ma2, 'd', batch_size=10, seed=123) + s9 = rej.sample(3, n_sim=90) + assert np.array_equal(s9pool.samples['t1'], s9.samples['t1']) + assert np.array_equal(s9pool.discrepancies, s9.discrepancies) + assert np.array_equal(s9pool.samples['t1'], s9pool_loaded.samples['t1']) + assert np.array_equal(s9pool.discrepancies, s9pool_loaded.discrepancies) + pool.delete() + pool2.delete() + os.rmdir(pool.prefix) diff --git a/tests/unit/test_store.py b/tests/unit/test_store.py index 86b03d25..b508cec7 100644 --- a/tests/unit/test_store.py +++ b/tests/unit/test_store.py @@ -5,10 +5,10 @@ import numpy as np import elfi -from elfi.store import OutputPool, NpyArray, ArrayPool +from elfi.store import OutputPool, NpyArray, ArrayPool, ArrayStore, NpyStore -def test_npy_persisted_array(): +def test_npy_array(): filename = 'test.npy' original = np.random.rand(3, 2) @@ -30,12 +30,17 @@ def test_npy_persisted_array(): loaded = np.load(filename) assert np.array_equal(np.r_[original, append], loaded) + # Test further appending arr.append(append2) assert np.array_equal(np.r_[original, append, append2], arr[:]) arr.flush() loaded = np.load(filename) assert np.array_equal(np.r_[original, append, append2], loaded) + # Test that writing over the array fails + with pytest.raises(Exception): + arr[len(loaded):len(loaded)+10, :] = ones + # Test rewriting arr[3:13, :] = ones arr.close() @@ -66,6 +71,33 @@ 
def test_npy_persisted_array(): os.remove(filename) +def test_npy_array_multiple_instances(): + original = np.random.rand(3, 2) + append = np.random.rand(10, 2) + append_clone = np.random.rand(10, 2) + + filename = 'test.npy' + + # Test appending and reading + arr = NpyArray(filename, array=original) + arr.flush() + arr.append(append) + assert(len(arr) == 13) + + arr.fs.flush() + + # Make a second instance and a simultaneous append + arr_clone = NpyArray(filename) + arr_clone.append(append_clone) + assert len(arr_clone) == 13 + assert np.array_equal(arr_clone[:], np.r_[original, append_clone]) + + arr.close() + arr_clone.close() + + os.remove(filename) + + def test_array_pool(ma2): pool = ArrayPool(['MA2', 'S1']) N = 100 @@ -79,6 +111,8 @@ def test_array_pool(ma2): assert len(pool) == total/bs assert not 't1' in pool.stores + batch2 = pool[2] + # Test against in memory pool with using batches pool2 = OutputPool(['MA2', 'S1']) rej = elfi.Rejection(ma2['d'], batch_size=bs, pool=pool2, seed=pool.seed) @@ -104,16 +138,13 @@ def test_array_pool(ma2): os.rename(pool.path, pool.path + '_move') pool = ArrayPool.open(pool.name + '_move') assert len(pool) == total/bs - - # Test adding a nonexistent file - with pytest.raises(FileNotFoundError): - pool.load_npy_file('test') + assert np.array_equal(pool[2]['S1'], batch2['S1']) # Test adding a random .npy file r = np.random.rand(3*bs) newfile = os.path.join(pool.path, 'test.npy') - NpyArray(newfile, r).close() - pool.load_npy_file('test') + arr = NpyArray(newfile, r) + pool.add_store('test', ArrayStore(arr, bs)) assert len(pool.get_store('test')) == 3 assert np.array_equal(pool[2]['test'], r[-bs:]) @@ -125,4 +156,76 @@ def test_array_pool(ma2): os.rmdir(pool.prefix) +def run_basic_store_tests(store, content): + """ + + Parameters + ---------- + store : StoreBase + content : nd.array + + Returns + ------- + + """ + bs = store.batch_size + shape = content.shape[1:] + batch = np.random.rand(bs, *shape) + l = len(content)//bs + + assert len(store) == l + + assert np.array_equal(store[1], content[bs:2*bs]) + + store[1] = batch + + assert len(store) == l + assert np.array_equal(store[1], batch) + + del store[l-1] + + assert len(store) == l-1 + + store[l-1] = batch + assert len(store) == l + + store.clear() + assert len(store) == 0 + + # Return the original condition + for i in range(l): + store[i] = content[i*bs:(i+1)*bs] + + assert len(store) == l + + return store + + +def test_array_store(): + arr = np.random.rand(40,2) + store = ArrayStore(arr, batch_size=10, n_batches=3) + + with pytest.raises(IndexError): + store[4] = np.zeros((10,2)) + + run_basic_store_tests(store, arr[:30]) + + +def test_npy_store(): + filename = 'test' + arr = np.random.rand(40,2) + NpyArray(filename, arr).close() + store = NpyStore(filename, batch_size=10, n_batches=4) + + run_basic_store_tests(store, arr) + + batch = np.random.rand(10, 2) + store[4] = batch + store[5] = 2*batch + + assert np.array_equal(store[5], 2*batch) + + with pytest.raises(IndexError): + store[7] = 3*batch + store.delete() From 4f72585d5c98abe971f0bf8d7704c17388d00584 Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Thu, 20 Jul 2017 12:07:58 +0300 Subject: [PATCH 8/9] Prepare version 0.6.1 (#211) * Version 0.6.1 refactorings * Visualization improvements * CHANGELOG --- CHANGELOG.rst | 2 + docs/installation.rst | 5 ++- elfi/methods/parameter_inference.py | 25 +++++------ elfi/methods/results.py | 68 ++++++++++++++++------------- elfi/visualization/visualization.py | 9 ++-- scripts/MA2_run.py | 7 +-- 
tests/unit/test_methods.py | 5 ++- 7 files changed, 64 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 35b37414..e0c73168 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,8 @@ dev - Fixed a bug in NpyArray that would lead to an inconsistent state if multiple simultaneous instances were opened. - Added the ability to move the pool data folder +- Sample.summary is now a method instead of a property +- SmcSample methods takes the keyword argument 'all' to show results of all populations 0.6 (2017-07-03) diff --git a/docs/installation.rst b/docs/installation.rst index fc650b3d..f9816dd5 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -73,13 +73,14 @@ Or download the development `tarball`_: Note that for development it is recommended to base your work on the `dev` branch instead of `master`. -Once you have a copy of the source, you can install it with: +Once you have a copy of the source, go to the folder and type: .. code-block:: console pip install -e . -This will install ELFI along with its default requirements. +This will install ELFI along with its default requirements. Note that the dot in the end +means the current folder. .. _Github repo: https://github.com/elfi-dev/elfi .. _tarball: https://github.com/elfi-dev/elfi/tarball/dev diff --git a/elfi/methods/parameter_inference.py b/elfi/methods/parameter_inference.py index d4eeea2c..98ed0fca 100644 --- a/elfi/methods/parameter_inference.py +++ b/elfi/methods/parameter_inference.py @@ -172,7 +172,6 @@ def update(self, batch, batch_index): ------- None """ - logger.info('Received batch %d' % batch_index) self.state['n_batches'] += 1 self.state['n_sim'] += self.batch_size @@ -278,6 +277,7 @@ def iterate(self): # Handle the next ready batch in succession batch, batch_index = self.batches.wait_next() + logger.info('Received batch %d' % batch_index) self.update(batch, batch_index) @property @@ -314,6 +314,7 @@ def _extract_result_kwargs(self): 'parameter_names': self.parameter_names, 'seed': self.seed, 'n_sim': self.state['n_sim'], + 'n_batches': self.state['n_batches'] } @staticmethod @@ -559,6 +560,10 @@ def _update_objective_n_batches(self): logger.debug('Estimated objective n_batches=%d' % self.objective['n_batches']) def plot_state(self, **options): + """Plot the current state of the inference algorithm. + + This feature is still experimental and only supports 1d or 2d cases. 
+ """ displays = [] if options.get('interactive'): from IPython import display @@ -604,14 +609,16 @@ def extract_result(self): ------- SmcSample """ + # Extract information from the population pop = self._extract_population() return SmcSample(outputs=pop.outputs, populations=self._populations.copy() + [pop], weights=pop.weights, - n_batches=pop.n_batches, + threshold=pop.threshold, **self._extract_result_kwargs()) def update(self, batch, batch_index): + super(SMC, self).update(batch, batch_index) self._rejection.update(batch, batch_index) if self._rejection.finished: @@ -621,7 +628,6 @@ def update(self, batch, batch_index): self.state['round'] += 1 self._init_new_round() - self._update_state() self._update_objective() def prepare_new_batch(self, batch_index): @@ -663,7 +669,6 @@ def _extract_population(self): w, cov = self._compute_weights_and_cov(sample) sample.weights = w sample.meta['cov'] = cov - sample.meta['n_batches'] = self._rejection.state['n_batches'] return sample def _compute_weights_and_cov(self, pop): @@ -691,16 +696,6 @@ def _compute_weights_and_cov(self, pop): return w, cov - def _update_state(self): - """Updates n_sim, threshold, and accept_rate - """ - s = self.state - s['n_batches'] += 1 - s['n_sim'] += self.batch_size - # TODO: use overall estimates - s['threshold'] = self._rejection.state['threshold'] - s['accept_rate'] = self._rejection.state['accept_rate'] - def _update_objective(self): """Updates the objective n_batches""" n_batches = sum([pop.n_batches for pop in self._populations]) @@ -944,7 +939,7 @@ def _report_batch(self, batch_index, params, distances): def plot_state(self, **options): """Plot the GP surface - Currently supports only 2D cases. + This feature is still experimental and currently supports only 2D cases. """ f = plt.gcf() diff --git a/elfi/methods/results.py b/elfi/methods/results.py index 08c062a8..4d59f0b2 100644 --- a/elfi/methods/results.py +++ b/elfi/methods/results.py @@ -138,9 +138,9 @@ def summary(self): if hasattr(self, 'threshold'): desc += "Threshold: {:.3g}\n".format(self.threshold) print(desc, end='') - self.summary_sample_means() + self.sample_means_summary() - def summary_sample_means(self): + def sample_means_summary(self): """Print a representation of posterior means. 
""" s = "Sample means: " @@ -221,24 +221,27 @@ def __init__(self, method_name, outputs, parameter_names, populations, *args, def n_populations(self): return len(self.populations) - @property - def sample_means_all_populations(self): - """Return a list of sample means for all populations - """ - means = [] - for i in range(self.n_populations): - means.append(self.populations[i].sample_means) - return means + def summary(self, all=False): + super(SmcSample, self).summary() + + if all: + for i, pop in enumerate(self.populations): + print('\nPopulation {}:'.format(i)) + pop.summary() + + def sample_means_summary(self, all=False): + if all is False: + super(SmcSample, self).sample_means_summary() + return - def summary_sample_means_all_populations(self): out = '' - for i, means in enumerate(self.sample_means_all_populations): + for i, pop in enumerate(self.populations): out += "Sample means for population {}: ".format(i) - out += ', '.join(["{}: {:.3g}".format(k, v) for k, v in means.items()]) + out += ', '.join(["{}: {:.3g}".format(k, v) for k, v in pop.sample_means.items()]) out += '\n' print(out) - def plot_marginals_all_populations(self, selector=None, bins=20, axes=None, **kwargs): + def plot_marginals(self, selector=None, bins=20, axes=None, all=False, **kwargs): """Plot marginal distributions for parameters for all populations. Parameters @@ -248,17 +251,19 @@ def plot_marginals_all_populations(self, selector=None, bins=20, axes=None, **kw bins : int, optional Number of bins in histograms. axes : one or an iterable of plt.Axes, optional + all : bool, optional + Plot the marginals of all populations """ - samples = [pop.samples_list for pop in self.populations] + if all is False: + super(SmcSample, self).plot_marginals() + return + fontsize = kwargs.pop('fontsize', 13) - for ii in range(self.n_populations): - s = OrderedDict() - for jj, n in enumerate(self.names_list): - s[n] = samples[ii][jj] - ax = vis.plot_marginals(s, selector, bins, axes, **kwargs) - plt.suptitle("Population {}".format(ii), fontsize=fontsize) - - def plot_pairs_all_populations(self, selector=None, bins=20, axes=None, **kwargs): + for i, pop in enumerate(self.populations): + pop.plot_marginals(selector=selector, bins=bins, axes=axes) + plt.suptitle("Population {}".format(i), fontsize=fontsize) + + def plot_pairs(self, selector=None, bins=20, axes=None, all=False, **kwargs): """Plot pairwise relationships as a matrix with marginals on the diagonal for all populations. The y-axis of marginal histograms are scaled. @@ -270,15 +275,18 @@ def plot_pairs_all_populations(self, selector=None, bins=20, axes=None, **kwargs bins : int, optional Number of bins in histograms. 
axes : one or an iterable of plt.Axes, optional + all : bool, optional + Plot for all populations """ - samples = self.samples_history + [self.samples_list] + + if all is False: + super(SmcSample, self).plot_pairs(selector=selector, bins=bins, axes=axes, **kwargs) + return + fontsize = kwargs.pop('fontsize', 13) - for ii in range(self.n_populations): - s = OrderedDict() - for jj, n in enumerate(self.names_list): - s[n] = samples[ii][jj] - ax = vis.plot_pairs(s, selector, bins, axes, **kwargs) - plt.suptitle("Population {}".format(ii), fontsize=fontsize) + for i, pop in enumerate(self.populations): + pop.plot_pairs(selector=selector, bins=bins, axes=axes) + plt.suptitle("Population {}".format(i), fontsize=fontsize) class BolfiSample(Sample): diff --git a/elfi/visualization/visualization.py b/elfi/visualization/visualization.py index fa844355..82818946 100644 --- a/elfi/visualization/visualization.py +++ b/elfi/visualization/visualization.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt from collections import OrderedDict -from elfi.model.elfi_model import ElfiModel, NodeReference +from elfi.model.elfi_model import ElfiModel, NodeReference, Constant def nx_draw(G, internal=False, param_names=False, filename=None, format=None): @@ -44,8 +44,11 @@ def nx_draw(G, internal=False, param_names=False, filename=None, format=None): dot = Digraph(format=format) + hidden = set() + for n, state in G.nodes_iter(data=True): - if not internal and n[0] == '_': + if not internal and n[0] == '_' and state.get('_class') == Constant: + hidden.add(n) continue _format = {'shape': 'circle', 'fillcolor': 'gray80', 'style': 'solid'} if state.get('_observable'): @@ -54,7 +57,7 @@ def nx_draw(G, internal=False, param_names=False, filename=None, format=None): # add edges to graph for u, v, label in G.edges_iter(data='param', default=''): - if not internal and (u[0] == '_' or v[0] == '_'): + if not internal and u in hidden: continue label = label if param_names else '' diff --git a/scripts/MA2_run.py b/scripts/MA2_run.py index a55c0a56..924b1ef4 100644 --- a/scripts/MA2_run.py +++ b/scripts/MA2_run.py @@ -1,5 +1,3 @@ -import matplotlib.pyplot as plt - import elfi from elfi.examples import ma2 @@ -12,8 +10,5 @@ result = rej.sample(1000, quantile=0.01) # show summary of results on stdout -result.summary +result.summary() -# save a figure of results -# result.plot_pairs() -# plt.savefig('ma2.png') diff --git a/tests/unit/test_methods.py b/tests/unit/test_methods.py index 6e12c2e4..4f880db1 100644 --- a/tests/unit/test_methods.py +++ b/tests/unit/test_methods.py @@ -35,7 +35,10 @@ def test_smc(ma2): # Test result summaries res.summary() - res.summary_sample_means_all_populations() + res.summary(all=True) + + res.sample_means_summary() + res.sample_means_summary(all=True) # A superficial test to compensate for test_inference.test_BOLFI not being run on Travis From 7baa4093a8f7b9f2b87afc1b2832b27a74527edf Mon Sep 17 00:00:00 2001 From: Jarno Lintusaari Date: Fri, 21 Jul 2017 10:50:54 +0300 Subject: [PATCH 9/9] Notebook doc updates (#212) * Notebook doc updates * CHANGELOG * Update documentation, README and CHANGELOG * Rebase * Change __init__ version * review changes --- CHANGELOG.rst | 6 +- Makefile | 12 +- README.md | 7 +- docs/quickstart.rst | 31 +-- docs/usage/BOLFI.rst | 114 +++++++---- docs/usage/external.rst | 98 ++++----- docs/usage/parallelization.rst | 171 +++++++++------- docs/usage/tutorial.rst | 353 +++++++++++++++++++++------------ elfi/__init__.py | 2 +- 9 files changed, 481 insertions(+), 313 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e0c73168..625ed2ef 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Changelog ========== -dev ---- +0.6.1 (2017-07-21) +------------------ - Fix elfi.Prior and NoneType error #203 - Fix a bug preventing the reuse of ArrayPool data with a new inference @@ -16,7 +16,7 @@ dev - Added the ability to move the pool data folder - Sample.summary is now a method instead of a property - SmcSample methods takes the keyword argument 'all' to show results of all populations - +- Added a section about iterative advancing to documentation 0.6 (2017-07-03) ---------------- diff --git a/Makefile b/Makefile index ce7f881d..7695e67b 100644 --- a/Makefile +++ b/Makefile @@ -69,23 +69,23 @@ docs: ## generate Sphinx HTML documentation, including API docs $(MAKE) -C docs html # $(BROWSER) docs/_build/html/index.html -CONTENT_URL := http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/ +CONTENT_URL := http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/ notebook-docs: ## Conver notebooks to rst docs. Assumes you have them in `notebooks` directory. jupyter nbconvert --to rst ../notebooks/quickstart.ipynb --output-dir docs - sed -i '' 's|\(quickstart_files/quickstart.*\.\)|'${CONTENT_URL}'\1|g' docs/quickstart.rst + sed -i 's|\(quickstart_files/quickstart.*\.\)|'${CONTENT_URL}'\1|g' docs/quickstart.rst jupyter nbconvert --to rst ../notebooks/tutorial.ipynb --output-dir docs/usage - sed -i '' 's|\(tutorial_files/tutorial.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/tutorial.rst + sed -i 's|\(tutorial_files/tutorial.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/tutorial.rst jupyter nbconvert --to rst ../notebooks/BOLFI.ipynb --output-dir docs/usage - sed -i '' 's|\(BOLFI_files/BOLFI.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/BOLFI.rst + sed -i 's|\(BOLFI_files/BOLFI.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/BOLFI.rst jupyter nbconvert --to rst ../notebooks/parallelization.ipynb --output-dir docs/usage - sed -i '' 's|\(parallelization_files/parallelization.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/parallelization.rst + sed -i 's|\(parallelization_files/parallelization.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/parallelization.rst jupyter nbconvert --to rst ../notebooks/non_python_operations.ipynb --output-dir docs/usage --output=external - sed -i '' 's|\(external_files/external.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/external.rst + sed -i 's|\(external_files/external.*\.\)|'${CONTENT_URL}usage/'\1|g' docs/usage/external.rst # release: clean ## package and upload a release # python setup.py sdist upload diff --git a/README.md b/README.md index 6c327565..e42d73a2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -**Version 0.6 released!** See the CHANGELOG and [notebooks](https://github.com/elfi-dev/notebooks). +**Version 0.6.1 released!** See the CHANGELOG and [notebooks](https://github.com/elfi-dev/notebooks). ELFI - Engine for Likelihood-Free Inference =========================================== @@ -35,7 +35,7 @@ is preferable. Installation ------------ -ELFI requires and is tested with Python 3.5-3.6. You can install ELFI by typing in your terminal: +ELFI requires Python 3.5 or greater. You can install ELFI by typing in your terminal: ``` pip install elfi @@ -78,5 +78,4 @@ Resolving these may sometimes go wrong: - If you receive an error about `yaml.load`, install `pyyaml`. - On OS X with Anaconda virtual environment say `conda install python.app` and then use `pythonw` instead of `python`. 
-- Note that ELFI currently supports Python 3.5-3.6 only, although 3.x may work as well, -so try `pip3 install elfi`. +- Note that ELFI requires Python 3.5 or greater so try `pip3 install elfi`. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 2536b0de..2efbe7a9 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -7,7 +7,7 @@ First ensure you have Python 3.5 (or greater) and ELFI. After installation you can start using ELFI: -.. code:: ipython3 +.. code:: python import elfi @@ -15,7 +15,7 @@ ELFI includes an easy to use generative modeling syntax, where the generative model is specified as a directed acyclic graph (DAG). Let’s create two prior nodes: -.. code:: ipython3 +.. code:: python mu = elfi.Prior('uniform', -2, 4) sigma = elfi.Prior('uniform', 1, 4) @@ -30,7 +30,7 @@ summary statistics for the data. As an example, lets define the simulator as 30 draws from a Gaussian distribution with a given mean and standard deviation. Let's use mean and variance as our summaries: -.. code:: ipython3 +.. code:: python import scipy.stats as ss import numpy as np @@ -48,7 +48,7 @@ standard deviation. Let's use mean and variance as our summaries: Let’s now assume we have some observed data ``y0`` (here we just create some with the simulator): -.. code:: ipython3 +.. code:: python # Set the generating parameters that we will try to infer mean0 = 1 @@ -73,7 +73,7 @@ Now we have all the components needed. Let’s complete our model by adding the simulator, the observed data, summaries and a distance to our model: -.. code:: ipython3 +.. code:: python # Add the simulator node and observed data to the model sim = elfi.Simulator(simulator, mu, sigma, observed=y0) @@ -85,23 +85,30 @@ model: # Specify distance as euclidean between summary vectors (S1, S2) from simulated and # observed data d = elfi.Distance('euclidean', S1, S2) - + +If you have ``graphviz`` installed to your system, you can also +visualize the model: + +.. code:: python + # Plot the complete model (requires graphviz) elfi.draw(d) -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/quickstart_files/quickstart_9_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/quickstart_files/quickstart_11_0.svg + +.. Note:: The automatic naming of nodes may not work in all environments e.g. in interactive Python shells. You can alternatively provide a name argument for the nodes, e.g. ``S1 = elfi.Summary(mean, sim, name='S1')``. We can try to infer the true generating parameters ``mean0`` and ``std0`` above with any of ELFI’s inference methods. Let’s use ABC Rejection sampling and sample 1000 samples from the approximate posterior using threshold value 0.5: -.. code:: ipython3 +.. code:: python rej = elfi.Rejection(d, batch_size=10000, seed=30052017) res = rej.sample(1000, threshold=.5) @@ -111,16 +118,16 @@ posterior using threshold value 0.5: .. parsed-literal:: Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 120000 Threshold: 0.492 - Posterior means: mu: 0.748, sigma: 3.1 + Sample means: mu: 0.748, sigma: 3.1 Let's plot also the marginal distributions for the parameters: -.. code:: ipython3 +.. code:: python import matplotlib.pyplot as plt res.plot_marginals() @@ -128,5 +135,5 @@ Let's plot also the marginal distributions for the parameters: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/quickstart_files/quickstart_13_0.png +.. 
image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/quickstart_files/quickstart_16_0.png diff --git a/docs/usage/BOLFI.rst b/docs/usage/BOLFI.rst index 95ed4092..e00f756f 100644 --- a/docs/usage/BOLFI.rst +++ b/docs/usage/BOLFI.rst @@ -20,7 +20,7 @@ by several orders of magnitude. This tutorial demonstrates how to use BOLFI to do LFI in ELFI. -.. code:: ipython3 +.. code:: python import numpy as np import scipy.stats @@ -43,7 +43,7 @@ Although BOLFI is best used with complicated simulators, for demonstration purposes we will use the familiar MA2 model introduced in the basic tutorial, and load it from ready-made examples: -.. code:: ipython3 +.. code:: python from elfi.examples import ma2 model = ma2.get_model(seed_obs=seed) @@ -52,7 +52,7 @@ the basic tutorial, and load it from ready-made examples: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/BOLFI_files/BOLFI_5_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/BOLFI_files/BOLFI_5_0.svg @@ -67,7 +67,7 @@ negative or even -Inf distances occurring especially if it is likely that there can be exact matches between simulated and observed data.) In ELFI such transformed node can be created easily: -.. code:: ipython3 +.. code:: python log_d = elfi.Operation(np.log, model['d']) @@ -88,7 +88,7 @@ before starting to optimize the acquisition of points, optimized, and ``acq_noise_var`` which defines the diagonal covariance of noise added to the acquired points. -.. code:: ipython3 +.. code:: python bolfi = elfi.BOLFI(log_d, batch_size=5, initial_evidence=20, update_interval=10, bounds={'t1':(-2, 2), 't2':(-1, 1)}, acq_noise_var=[0.1, 0.1], seed=seed) @@ -102,7 +102,7 @@ the relationship between parameter values and the resulting discrepancies. We'll request only 100 evidence points (including the ``initial_evidence`` defined above). -.. code:: ipython3 +.. code:: python %time post = bolfi.fit(n_evidence=100) @@ -110,13 +110,53 @@ discrepancies. We'll request only 100 evidence points (including the .. parsed-literal:: INFO:elfi.methods.parameter_inference:BOLFI: Fitting the surrogate model... 
+ INFO:elfi.methods.parameter_inference:Submitting batch 0 + INFO:elfi.methods.parameter_inference:Received batch 0 + INFO:elfi.methods.parameter_inference:Submitting batch 1 + INFO:elfi.methods.parameter_inference:Received batch 1 + INFO:elfi.methods.parameter_inference:Submitting batch 2 + INFO:elfi.methods.parameter_inference:Received batch 2 + INFO:elfi.methods.parameter_inference:Submitting batch 3 + INFO:elfi.methods.parameter_inference:Received batch 3 + INFO:elfi.methods.parameter_inference:Submitting batch 4 + INFO:elfi.methods.parameter_inference:Received batch 4 + INFO:elfi.methods.parameter_inference:Submitting batch 5 + INFO:elfi.methods.parameter_inference:Received batch 5 + INFO:elfi.methods.parameter_inference:Submitting batch 6 + INFO:elfi.methods.parameter_inference:Received batch 6 + INFO:elfi.methods.parameter_inference:Submitting batch 7 + INFO:elfi.methods.parameter_inference:Received batch 7 + INFO:elfi.methods.parameter_inference:Submitting batch 8 + INFO:elfi.methods.parameter_inference:Received batch 8 + INFO:elfi.methods.parameter_inference:Submitting batch 9 + INFO:elfi.methods.parameter_inference:Received batch 9 + INFO:elfi.methods.parameter_inference:Submitting batch 10 + INFO:elfi.methods.parameter_inference:Received batch 10 + INFO:elfi.methods.parameter_inference:Submitting batch 11 + INFO:elfi.methods.parameter_inference:Received batch 11 + INFO:elfi.methods.parameter_inference:Submitting batch 12 + INFO:elfi.methods.parameter_inference:Received batch 12 + INFO:elfi.methods.parameter_inference:Submitting batch 13 + INFO:elfi.methods.parameter_inference:Received batch 13 + INFO:elfi.methods.parameter_inference:Submitting batch 14 + INFO:elfi.methods.parameter_inference:Received batch 14 + INFO:elfi.methods.parameter_inference:Submitting batch 15 + INFO:elfi.methods.parameter_inference:Received batch 15 + INFO:elfi.methods.parameter_inference:Submitting batch 16 + INFO:elfi.methods.parameter_inference:Received batch 16 + INFO:elfi.methods.parameter_inference:Submitting batch 17 + INFO:elfi.methods.parameter_inference:Received batch 17 + INFO:elfi.methods.parameter_inference:Submitting batch 18 + INFO:elfi.methods.parameter_inference:Received batch 18 + INFO:elfi.methods.parameter_inference:Submitting batch 19 + INFO:elfi.methods.parameter_inference:Received batch 19 INFO:elfi.methods.posteriors:Using optimized minimum value (-1.4121) of the GP discrepancy mean function as a threshold .. parsed-literal:: - CPU times: user 13.2 s, sys: 139 ms, total: 13.3 s - Wall time: 7.09 s + CPU times: user 1min 25s, sys: 2.03 s, total: 1min 27s + Wall time: 12.2 s (More on the returned ``BolfiPosterior`` object @@ -129,7 +169,7 @@ for scenarios where the simulator takes a lot of time to run. The fitted ``target_model`` uses the GPy library, and can be investigated further: -.. code:: ipython3 +.. code:: python bolfi.target_model @@ -140,20 +180,20 @@ investigated further: Name : GP regression - Objective : 92.664837723526 + Objective : 92.66483764086814 Number of Parameters : 4 Number of Optimization Parameters : 4 Updates : True Parameters: GP_regression.  
| value | constraints | priors - sum.rbf.variance  | 0.326569075912 | +ve | Ga(0.096, 1) - sum.rbf.lengthscale  | 0.552572833732 | +ve | Ga(1.3, 1) - sum.bias.variance  | 0.0878317664626 | +ve | Ga(0.024, 1) - Gaussian_noise.variance | 0.21318627419 | +ve | + sum.rbf.variance  | 0.326569075885 | +ve | Ga(0.096, 1) + sum.rbf.lengthscale  | 0.552572835397 | +ve | Ga(1.3, 1) + sum.bias.variance  | 0.0878317673385 | +ve | Ga(0.024, 1) + Gaussian_noise.variance | 0.213186273967 | +ve | -.. code:: ipython3 +.. code:: python bolfi.plot_state(); @@ -161,23 +201,23 @@ investigated further: .. parsed-literal:: - + -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/BOLFI_files/BOLFI_15_1.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/BOLFI_files/BOLFI_15_1.png It may be useful to see the acquired parameter values and the resulting discrepancies: -.. code:: ipython3 +.. code:: python bolfi.plot_discrepancy(); -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/BOLFI_files/BOLFI_17_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/BOLFI_files/BOLFI_17_0.png There could be an unnecessarily high number of points at parameter @@ -198,20 +238,20 @@ details). The ``fit`` method accepts a threshold parameter; if none is given, ELFI will use the minimum value of discrepancy estimate mean. Afterwards, one may request for a posterior with a different threshold: -.. code:: ipython3 +.. code:: python post2 = bolfi.extract_posterior(-1.) One can visualize a posterior directly (remember that the priors form a triangle): -.. code:: ipython3 +.. code:: python post.plot(logpdf=True) -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/BOLFI_files/BOLFI_23_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/BOLFI_files/BOLFI_23_0.png Sampling @@ -223,7 +263,7 @@ are spent in adaptation/warmup. Note that depending on the smoothness of the GP approximation, the number of priors, their gradients etc., this may be slow. -.. code:: ipython3 +.. code:: python %time result_BOLFI = bolfi.sample(1000, info_freq=1000) @@ -233,13 +273,13 @@ may be slow. INFO:elfi.methods.posteriors:Using optimized minimum value (-1.4121) of the GP discrepancy mean function as a threshold INFO:elfi.methods.mcmc:NUTS: Performing 1000 iterations with 500 adaptation steps. INFO:elfi.methods.mcmc:NUTS: Adaptation/warmup finished. Sampling... - INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.422. After warmup 80 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. + INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.423. After warmup 79 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. INFO:elfi.methods.mcmc:NUTS: Performing 1000 iterations with 500 adaptation steps. INFO:elfi.methods.mcmc:NUTS: Adaptation/warmup finished. Sampling... - INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.414. After warmup 85 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. + INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.427. After warmup 80 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. INFO:elfi.methods.mcmc:NUTS: Performing 1000 iterations with 500 adaptation steps. INFO:elfi.methods.mcmc:NUTS: Adaptation/warmup finished. Sampling... - INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.408. 
After warmup 73 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. + INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.435. After warmup 74 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. INFO:elfi.methods.mcmc:NUTS: Performing 1000 iterations with 500 adaptation steps. INFO:elfi.methods.mcmc:NUTS: Adaptation/warmup finished. Sampling... INFO:elfi.methods.mcmc:NUTS: Acceptance ratio: 0.404. After warmup 74 proposals were outside of the region allowed by priors and rejected, decreasing acceptance ratio. @@ -248,10 +288,10 @@ may be slow. .. parsed-literal:: 4 chains of 1000 iterations acquired. Effective sample size and Rhat for each parameter: - t1 1848.12533825 0.999883608451 - t2 2060.13369699 0.999774254928 - CPU times: user 1min 27s, sys: 1.21 s, total: 1min 28s - Wall time: 46.6 s + t1 1719.09995679 1.00101719174 + t2 1786.71042938 1.00178507347 + CPU times: user 3min 8s, sys: 2.71 s, total: 3min 11s + Wall time: 47.1 s The sampling algorithms may be fine-tuned with some parameters. The @@ -276,7 +316,7 @@ example of a difficult model for the NUTS algorithm. Now we finally have a ``Sample`` object again, which has several convenience methods: -.. code:: ipython3 +.. code:: python result_BOLFI @@ -286,30 +326,30 @@ convenience methods: .. parsed-literal:: Method: BOLFI - Number of posterior samples: 2000 + Number of samples: 2000 Number of simulations: 100 Threshold: -1.41 - Posterior means: t1: 0.564, t2: 0.28 + Sample means: t1: 0.577, t2: 0.27 -.. code:: ipython3 +.. code:: python result_BOLFI.plot_traces(); -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/BOLFI_files/BOLFI_29_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/BOLFI_files/BOLFI_29_0.png The black vertical lines indicate the end of warmup, which by default is half of the number of iterations. -.. code:: ipython3 +.. code:: python result_BOLFI.plot_marginals(); -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/BOLFI_files/BOLFI_31_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/BOLFI_files/BOLFI_31_0.png diff --git a/docs/usage/external.rst b/docs/usage/external.rst index 007b9557..9d6ab916 100644 --- a/docs/usage/external.rst +++ b/docs/usage/external.rst @@ -16,7 +16,7 @@ briefly demonstrates how to do this in three common scenarios: Let's begin by importing some libraries that we will be using: -.. code:: ipython3 +.. code:: python import os import numpy as np @@ -45,7 +45,7 @@ use ``elfi.tools.external_operation`` tool to wrap executables as a Python callables (function). Let's first investigate how it works with a simple shell ``echo`` command: -.. code:: ipython3 +.. code:: python # Make an external command. {0} {1} are positional arguments and {seed} a keyword argument `seed`. command = 'echo {0} {1} {seed}' @@ -70,7 +70,7 @@ Currently ``echo_sim`` only accepts scalar arguments. In order to work in ELFI, ``echo_sim`` needs to be vectorized so that we can pass to it a vector of arguments. ELFI provides a handy tool for this as well: -.. code:: ipython3 +.. code:: python # Vectorize it with elfi tools echo_sim_vec = elfi.tools.vectorize(echo_sim) @@ -88,9 +88,9 @@ vector of arguments. ELFI provides a handy tool for this as well: .. 
parsed-literal:: - array([[ 1.78154613e+00, 0.00000000e+00, 8.49425160e+08], - [ 1.48064044e+00, 0.00000000e+00, 8.49425160e+08], - [ 1.94733396e+00, 0.00000000e+00, 8.49425160e+08]]) + array([[ 4.16252911e-01, 0.00000000e+00, 3.08758378e+08], + [ 9.83011677e-01, 0.00000000e+00, 3.08758378e+08], + [ 8.22756066e-01, 0.00000000e+00, 3.08758378e+08]]) @@ -143,7 +143,7 @@ efficiently. We will now reproduce Figure 6(a) in `*Lintusaari at al 2016* `__ *[2]* with ELFI. Let's start by defining some constants: -.. code:: ipython3 +.. code:: python # Fixed model parameters delta = 0 @@ -156,7 +156,7 @@ start by defining some constants: Let's build the beginning of a new model for the birth rate :math:`\alpha` as the only unknown -.. code:: ipython3 +.. code:: python m = elfi.ElfiModel(name='bdm') elfi.Prior('uniform', .005, 2, model=m, name='alpha') @@ -170,7 +170,7 @@ Let's build the beginning of a new model for the birth rate -.. code:: ipython3 +.. code:: python # Get the BDM source directory sources_path = elfi.examples.bdm.get_sources_path() @@ -185,12 +185,14 @@ Let's build the beginning of a new model for the birth rate .. parsed-literal:: - g++ bdm.cpp --std=c++0x -O -Wall -o bdm + make: Entering directory '/l/lintusj1/elfi-dev/notebooks/resources/cpp' + g++ bdm.cpp --std=c++0x -O -Wall -o bdm + make: Leaving directory '/l/lintusj1/elfi-dev/notebooks/resources/cpp' .. note:: The source code for the BDM simulator comes with ELFI. You can get the directory with `elfi.examples.bdm.get_source_directory()`. Under unix-like systems it can be compiled with just typing `make` to console in the source directory. For windows systems, you need to have some C++ compiler available to compile it. -.. code:: ipython3 +.. code:: python # Test the executable (assuming we have the executable `bdm` in the working directory) sim = elfi.tools.external_operation('./bdm {0} {1} {2} {3} --seed {seed} --mode 1') @@ -218,7 +220,7 @@ efficient would be to write a native Python module with C++ but it's beyond the scope of this article. So let's work through files which is a fairly common situation especially with existing software. -.. code:: ipython3 +.. code:: python # Assuming we have the executable `bdm` in the working directory command = './bdm {filename} --seed {seed} --mode 1 > {output_filename}' @@ -270,7 +272,7 @@ informative filenames, we ask ELFI to provide the operation some meta information. That will be available under the ``meta`` keyword (see the ``prepare_inputs`` function above): -.. code:: ipython3 +.. code:: python # Create the simulator bdm_node = elfi.Simulator(bdm, m['alpha'], delta, tau, N, observed=y_obs, name='sim') @@ -284,11 +286,11 @@ information. That will be available under the ``meta`` keyword (see the -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/external_files/external_21_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/external_files/external_21_0.svg -.. code:: ipython3 +.. code:: python # Test it data = bdm_node.generate(3) @@ -297,9 +299,9 @@ information. That will be available under the ``meta`` keyword (see the .. parsed-literal:: - [[12 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0] - [18 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] - [19 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] + [[ 8 3 1 1 1 1 1 2 1 1 0 0 0 0 0 0 0 0 0 0] + [ 8 1 1 1 1 1 1 3 1 1 1 0 0 0 0 0 0 0 0 0] + [14 2 1 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] Completing the BDM model @@ -310,7 +312,7 @@ We are now ready to finish up the BDM model. 
To reproduce Figure 6(a) in *[2]*, let's add different summaries and discrepancies to the model and run the inference for each of them: -.. code:: ipython3 +.. code:: python def T1(clusters): clusters = np.atleast_2d(clusters) @@ -338,18 +340,18 @@ run the inference for each of them: -.. code:: ipython3 +.. code:: python elfi.draw(m) -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/external_files/external_25_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/external_files/external_25_0.svg -.. code:: ipython3 +.. code:: python # Save parameter and simulation results in memory to speed up the later inference pool = elfi.OutputPool(['alpha', 'sim']) @@ -368,15 +370,15 @@ run the inference for each of them: .. parsed-literal:: - CPU times: user 2.95 s, sys: 96.3 ms, total: 3.05 s - Wall time: 5.05 s - CPU times: user 30.4 ms, sys: 1.7 ms, total: 32.1 ms - Wall time: 31.9 ms - CPU times: user 33.8 ms, sys: 728 µs, total: 34.5 ms - Wall time: 34.4 ms + CPU times: user 3.86 s, sys: 52 ms, total: 3.92 s + Wall time: 4.81 s + CPU times: user 32 ms, sys: 4 ms, total: 36 ms + Wall time: 36.4 ms + CPU times: user 40 ms, sys: 0 ns, total: 40 ms + Wall time: 36.9 ms -.. code:: ipython3 +.. code:: python # Load a precomputed posterior based on an analytic solution (see Lintusaari et al 2016) matdata = sio.loadmat('./resources/bdm.mat') @@ -407,7 +409,7 @@ run the inference for each of them: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/external_files/external_27_1.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/external_files/external_27_1.png Interfacing with R @@ -424,7 +426,7 @@ Here we demonstrate how to calculate the summary statistics used in the ELFI tutorial (autocovariances) using R's ``acf`` function for the MA2 model. -.. code:: ipython3 +.. code:: python import rpy2.robjects as robj from rpy2.robjects import numpy2ri as np2ri @@ -439,7 +441,7 @@ model. Let's create a Python function that wraps the R commands (please see the documentation of `rpy2 `__ for details): -.. code:: ipython3 +.. code:: python robj.r(''' # create a function `f` @@ -457,7 +459,7 @@ documentation of `rpy2 `__ for details): ans = apply(x, 1, f, lag=lag) return np.atleast_1d(ans) -.. code:: ipython3 +.. code:: python # Test it autocovR(np.array([[1,2,3,4], [4,5,6,7]]), 1) @@ -473,7 +475,7 @@ documentation of `rpy2 `__ for details): Load a ready made MA2 model: -.. code:: ipython3 +.. code:: python ma2 = elfi.examples.ma2.get_model(seed_obs=4) elfi.draw(ma2) @@ -481,13 +483,13 @@ Load a ready made MA2 model: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/external_files/external_36_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/external_files/external_36_0.svg Replace the summaries S1 and S2 with our R autocovariance function. -.. code:: ipython3 +.. code:: python # Replace with R autocov S1 = elfi.Summary(autocovR, ma2['MA2'], 1) @@ -505,10 +507,10 @@ Replace the summaries S1 and S2 with our R autocovariance function. .. parsed-literal:: Method: Rejection - Number of posterior samples: 100 + Number of samples: 100 Number of simulations: 10000 Threshold: 0.111 - Posterior means: t1: 0.599, t2: 0.177 + Sample means: t1: 0.599, t2: 0.177 @@ -521,20 +523,20 @@ MATLAB function using the official `MATLAB Python cd API `__. (Tested with MATLAB 2016b.) -.. code:: ipython3 +.. 
code:: python import matlab.engine A MATLAB session needs to be started (and stopped) separately: -.. code:: ipython3 +.. code:: python eng = matlab.engine.start_matlab() # takes a while... Similarly as with R, we have to write a piece of code to interface between MATLAB and Python: -.. code:: ipython3 +.. code:: python def euclidean_M(x, y): # MATLAB array initialized with Python's list @@ -547,7 +549,7 @@ between MATLAB and Python: d = np.atleast_1d(dM).reshape(-1) return d -.. code:: ipython3 +.. code:: python # Test it euclidean_M(np.array([[1,2,3], [6,7,8], [2,2,3]]), np.array([2,2,2])) @@ -563,7 +565,7 @@ between MATLAB and Python: Load a ready made MA2 model: -.. code:: ipython3 +.. code:: python ma2M = elfi.examples.ma2.get_model(seed_obs=4) elfi.draw(ma2M) @@ -571,13 +573,13 @@ Load a ready made MA2 model: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/external_files/external_48_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/external_files/external_48_0.svg Replace the summaries S1 and S2 with our R autocovariance function. -.. code:: ipython3 +.. code:: python # Replace with Matlab distance implementation d = elfi.Distance(euclidean_M, ma2M['S1'], ma2M['S2']) @@ -593,16 +595,16 @@ Replace the summaries S1 and S2 with our R autocovariance function. .. parsed-literal:: Method: Rejection - Number of posterior samples: 100 + Number of samples: 100 Number of simulations: 10000 Threshold: 0.113 - Posterior means: t1: 0.602, t2: 0.178 + Sample means: t1: 0.602, t2: 0.178 Finally, don't forget to quit the MATLAB session: -.. code:: ipython3 +.. code:: python eng.quit() diff --git a/docs/usage/parallelization.rst b/docs/usage/parallelization.rst index 7159a0c8..66f28e17 100644 --- a/docs/usage/parallelization.rst +++ b/docs/usage/parallelization.rst @@ -17,120 +17,139 @@ inference via different clients. Currently ELFI includes three clients: `ipyparallel `__ based client that can parallelize from multiple cores up to a distributed cluster. -A client is activated by importing the respective ELFI module. +A client is activated by importing the respective ELFI module or by +giving the name of the client to ``elfi.set_client``. -This tutorial shows how to activate and use the ``ipyparallel`` client -with ELFI. For local parallelization, the ``multiprocessing`` client is -simpler to use. +This tutorial shows how to activate and use the ``multiprocessing`` or +``ipyparallel`` client with ELFI. The ``ipyparallel`` client supports +parallelization from local computer up to a cluster environment. For +local parallelization however, the ``multiprocessing`` client is simpler +to use. Let's begin by importing ELFI and our example MA2 model from the +tutorial. -Activating parallelization --------------------------- - -To activate the ``ipyparallel`` client in ELFI you just need to import -it: - -.. code:: ipython3 +.. code:: python import elfi - # This activates the parallelization with ipyparallel - import elfi.clients.ipyparallel + from elfi.examples import ma2 -Starting a local ipcluster --------------------------- +Let's get the model and plot it (requires graphviz) -Before you can actually run things in parallel you also need to start an -``ipyparallel`` cluster. Below is an example of how to start a local -cluster to the background using 4 CPU cores: +.. code:: python -.. 
code:: ipython3 + model = ma2.get_model() + elfi.draw(model) - !ipcluster start -n 4 --daemon - - # This is here just to ensure that ipcluster has enough time to start properly before continuing - import time - time.sleep(10) -.. note:: The exclamation mark above is a Jupyter syntax for executing shell commands. You can run the same command in your terminal without the exclamation mark. -.. tip:: Please see the [ipyparallel documentation](https://ipyparallel.readthedocs.io/en/latest/intro.html#getting-started) for more information and details for setting up and using ipyparallel clusters. -Running parallel inference --------------------------- +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/parallelization_files/parallelization_5_0.svg -We will run parallel inference for the MA2 model introduced in the basic -tutorial. A ready made model can be imported from ``elfi.examples``: -.. code:: ipython3 - from elfi.examples import ma2 - model = ma2.get_model() - - elfi.draw(model) +Multiprocessing client +---------------------- + +The multiprocessing client allows you to easily use the cores available +in your computer. You can activate it simply by +.. code:: python + elfi.set_client('multiprocessing') +Any inference instance created after you have set the new client will +automatically use it to perform the computations. Let's try it with our +MA2 example model from the tutorial. When running the next command, take +a look at the system monitor of your operating system; it should show +that all of your cores are doing heavy computation simultaneously. -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/parallelization_files/parallelization_9_0.svg +.. code:: python + rej = elfi.Rejection(model, 'd', batch_size=10000, seed=20170530) + %time result = rej.sample(5000, n_sim=int(1e6)) # 1 million simulations -Otherwise everything should be familiar, and ELFI handles everything for -you regarding the parallelization. +.. parsed-literal:: -.. code:: ipython3 + CPU times: user 272 ms, sys: 28 ms, total: 300 ms + Wall time: 2.41 s - rej = elfi.Rejection(model, 'd', batch_size=10000, seed=20170530) -When running the next command, take a look at the system monitor of your -operating system; it should show 4 (or whatever number you gave the -``ipcluster start`` command) Python processes doing heavy computation -simultaneously. +And that is it. The result object is also just like in the basic case: -.. code:: ipython3 +.. code:: python - %time result = rej.sample(5000, n_sim=int(5e6)) # 5 million simulations + # Print the summary + result.summary() + + import matplotlib.pyplot as plt + result.plot_pairs(); + plt.show() .. parsed-literal:: - CPU times: user 3.59 s, sys: 417 ms, total: 4 s - Wall time: 20.9 s + Method: Rejection + Number of samples: 5000 + Number of simulations: 1000000 + Threshold: 0.0817 + Sample means: t1: 0.68, t2: 0.133 -The ``Sample`` object is also just like in the basic case: -.. code:: ipython3 +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/parallelization_files/parallelization_11_1.png - result.summary +Ipyparallel client +------------------ -.. parsed-literal:: +The ``ipyparallel`` client allows you to parallelize the computations to +cluster environments. To use the ``ipyparallel`` client, you first have +to create an ``ipyparallel`` cluster. 
Below is an example of how to +start a local cluster to the background using 4 CPU cores: - Method: Rejection - Number of posterior samples: 5000 - Number of simulations: 5000000 - Threshold: 0.0336 - Posterior means: t1: 0.493, t2: 0.0332 +.. code:: python + !ipcluster start -n 4 --daemon + + # This is here just to ensure that ipcluster has enough time to start properly before continuing + import time + time.sleep(10) -.. code:: ipython3 +.. note:: The exclamation mark above is a Jupyter syntax for executing shell commands. You can run the same command in your terminal without the exclamation mark. - import matplotlib.pyplot as plt - result.plot_pairs() - plt.show() +.. tip:: Please see the ipyparallel documentation (https://ipyparallel.readthedocs.io/en/latest/intro.html#getting-started) for more information and details for setting up and using ipyparallel clusters in different environments. + +Running parallel inference with ipyparallel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After the cluster has been set up, we can proceed as usual. ELFI will +take care of the parallelization from now on: + +.. code:: python + # Let's start using the ipyparallel client + elfi.set_client('ipyparallel') + + rej = elfi.Rejection(model, 'd', batch_size=10000, seed=20170530) + %time result = rej.sample(5000, n_sim=int(5e6)) # 5 million simulations -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/parallelization_files/parallelization_16_0.png +.. parsed-literal:: + + CPU times: user 3.16 s, sys: 184 ms, total: 3.35 s + Wall time: 13.4 s To summarize, the only thing that needed to be changed from the basic -scenario was enabling the ``ipyparallel`` client. +scenario was creating the ``ipyparallel`` cluster and enabling the +``ipyparallel`` client. -Working interactively ---------------------- +Working interactively with ipyparallel +-------------------------------------- -All imports and definitions must be visible to all ``ipyparallel`` +If you are using the ``ipyparallel`` client from an interactive +environment (e.g. jupyter notebook) there are some things to take care +of. All imports and definitions must be visible to all ``ipyparallel`` engines. You can ensure this by writing a script file that has all the definitions in it. In a distributed setting, this file must be present in all remote workers running an ``ipyparallel`` engine. @@ -145,7 +164,7 @@ documentation `__. In interactive sessions, you can change the model with built-in functionality without problems: -.. code:: ipython3 +.. code:: python d2 = elfi.Distance('cityblock', model['S1'], model['S2'], p=1) @@ -155,7 +174,7 @@ functionality without problems: But let's say you want to use your very own distance function in a jupyter notebook: -.. code:: ipython3 +.. code:: python def my_distance(x, y): # Note that interactively defined functions must use full module names, e.g. numpy instead of np @@ -169,7 +188,7 @@ This function definition is not automatically visible for the engines run in different processes and will not see interactively defined objects and functions. The below would therefore fail: -.. code:: ipython3 +.. code:: python # This will fail if you try it! # result3 = rej3.sample(1000, quantile=0.01) @@ -179,7 +198,7 @@ the scopes of the engines from interactive sessions. Because ``my_distance`` also uses ``numpy``, that must be imported in the engines as well: -.. code:: ipython3 +.. 
code:: python # Get the ipyparallel client ipyclient = elfi.get_client().ipp_client @@ -199,7 +218,7 @@ engines as well: The above may look a bit cumbersome, but now this works: -.. code:: ipython3 +.. code:: python rej3.sample(1000, quantile=0.01) # now this works @@ -209,10 +228,10 @@ The above may look a bit cumbersome, but now this works: .. parsed-literal:: Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 100000 - Threshold: 0.0117 - Posterior means: t1: 0.492, t2: 0.0389 + Threshold: 0.0136 + Sample means: t1: 0.676, t2: 0.129 @@ -224,13 +243,13 @@ engines. Remember to stop the ipcluster when done ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code:: ipython3 +.. code:: python !ipcluster stop .. parsed-literal:: - 2017-06-21 16:06:24.007 [IPClusterStop] Stopping cluster [pid=94248] with [signal=] + 2017-07-19 16:20:58.662 [IPClusterStop] Stopping cluster [pid=21020] with [signal=] diff --git a/docs/usage/tutorial.rst b/docs/usage/tutorial.rst index 1b147e85..37c3a82e 100644 --- a/docs/usage/tutorial.rst +++ b/docs/usage/tutorial.rst @@ -12,8 +12,10 @@ save results for later use and run different inference algorithms. Let's begin by importing libraries that we will use and specify some settings. -.. code:: ipython3 +.. code:: python + import time + import numpy as np import scipy.stats import matplotlib @@ -22,11 +24,9 @@ settings. %matplotlib inline %precision 2 - import logging - logging.basicConfig(level=logging.INFO) - - # Set an arbitrary global seed to keep the randomly generated quantities the same - np.random.seed(20170530) + # Set an arbitrary seed and a global random state to keep the randomly generated quantities the same between runs + seed = 20170530 + np.random.seed(seed) Inference with ELFI: case MA(2) model ------------------------------------- @@ -52,7 +52,7 @@ In this tutorial, our task is to infer the parameters :math:`y` that originate from an MA(2) process. Let's define the MA(2) simulator as a Python function: -.. code:: ipython3 +.. code:: python def MA2(t1, t2, n_obs=100, batch_size=1, random_state=None): # Make inputs 2d arrays for numpy broadcasting with w @@ -88,9 +88,9 @@ length ``batch_size`` and the method returns a 2d array with the simulations on the rows. Notice that for convenience, the funtion also works with scalars that are first converted to vectors. -.. note:: there is a built-in tool (`elfi.tools.vectorize`) in ELFI to vectorize operations that are not vectorized. It is basically a for loop wrapper. +.. note:: There is a built-in tool (`elfi.tools.vectorize`) in ELFI to vectorize operations that are not vectorized. It is basically a for loop wrapper. -.. Important:: in order to guarantee a consistent state of pseudo-random number generation, the simulator must have `random_state` as a keyword argument for reading in a `numpy.RandomState` object. +.. Important:: In order to guarantee a consistent state of pseudo-random number generation, the simulator must have `random_state` as a keyword argument for reading in a `numpy.RandomState` object. Let's now use this simulator to create toy observations. We will use parameter values :math:`\theta_1=0.6, \theta_2=0.2` as in `*Marin et al. @@ -98,7 +98,7 @@ parameter values :math:`\theta_1=0.6, \theta_2=0.2` as in `*Marin et al. and then try to infer these parameter values back based on the toy observed data alone. -.. code:: ipython3 +.. code:: python # true parameters t1_true = 0.6 @@ -116,7 +116,7 @@ observed data alone. -.. 
image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_10_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_10_0.png Approximate Bayesian Computation @@ -165,7 +165,7 @@ conveniently. Often the target of the generative model is a distance between the simulated and observed data. To start creating our model, we will first import ELFI: -.. code:: ipython3 +.. code:: python import elfi @@ -176,7 +176,7 @@ available in ``scipy.stats`` (for custom priors, see `below <#Custom-priors>`__). For simplicity, let's start by assuming that both parameters follow ``Uniform(0, 2)``. -.. code:: ipython3 +.. code:: python # a node is defined by giving a distribution from scipy.stats together with any arguments (here 0 and 2) t1 = elfi.Prior(scipy.stats.uniform, 0, 2) @@ -189,7 +189,7 @@ and give the priors to it as arguments. This means that the parameters for the simulations will be drawn from the priors. Because we have the observed data available for this node, we provide it here as well: -.. code:: ipython3 +.. code:: python Y = elfi.Simulator(MA2, t1, t2, observed=y_obs) @@ -206,7 +206,7 @@ Here, we will apply the intuition arising from the definition of the MA(2) process, and use the autocovariances with lags 1 and 2 as the summary statistics: -.. code:: ipython3 +.. code:: python def autocov(x, lag=1): C = np.mean(x[:,lag:] * x[:,:-lag], axis=1) @@ -216,7 +216,7 @@ As is familiar by now, a ``Summary`` node is defined by giving the autocovariance function and the simulated data (which includes the observed as well): -.. code:: ipython3 +.. code:: python S1 = elfi.Summary(autocov, Y) S2 = elfi.Summary(autocov, Y, 2) # the optional keyword lag is given the value 2 @@ -225,7 +225,7 @@ Here, we choose the discrepancy as the common Euclidean L2-distance. ELFI can use many common distances directly from ``scipy.spatial.distance`` like this: -.. code:: ipython3 +.. code:: python # Finish the model with the final node that calculates the squared distance (S1_sim-S1_obs)**2 + (S2_sim-S2_obs)**2 d = elfi.Distance('euclidean', S1, S2) @@ -238,14 +238,14 @@ distance/discrepancy functions as well (see the documentation for Now that the inference model is defined, ELFI can visualize the model as a DAG. -.. code:: ipython3 +.. code:: python elfi.draw(d) # just give it a node in the model, or the model itself (d.model) -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_27_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_27_0.svg @@ -280,7 +280,7 @@ internal book-keeping of pseudo-random number generation. Also the ``size`` keyword is needed (which in the simple cases is the same as the ``batch_size`` in the simulator definition). -.. code:: ipython3 +.. code:: python # define prior for t1 as in Marin et al., 2012 with t1 in range [-b, b] class CustomPrior_t1(elfi.Distribution): @@ -299,7 +299,7 @@ internal book-keeping of pseudo-random number generation. Also the These indeed sample from a triangle: -.. code:: ipython3 +.. code:: python t1_1000 = CustomPrior_t1.rvs(2, 1000) t2_1000 = CustomPrior_t2.rvs(t1_1000, 1, 1000) @@ -308,12 +308,12 @@ These indeed sample from a triangle: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_33_0.png +.. 
image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_33_0.png Let's change the earlier priors to the new ones in the inference model: -.. code:: ipython3 +.. code:: python t1.become(elfi.Prior(CustomPrior_t1, 2)) t2.become(elfi.Prior(CustomPrior_t2, t1, 1)) @@ -323,7 +323,7 @@ Let's change the earlier priors to the new ones in the inference model: -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_35_0.svg +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_35_0.svg @@ -351,9 +351,8 @@ Another optional keyword is the seed. This ensures that the outcome will be always the same for the same data and model. If you leave it out, a random seed will be taken. -.. code:: ipython3 +.. code:: python - seed = 20170530 rej = elfi.Rejection(d, batch_size=10000, seed=seed) .. note:: In Python, doing many calculations with a single function call can potentially save a lot of CPU time, depending on the operation. For example, here we draw 10000 samples from `t1`, pass them as input to `t2`, draw 10000 samples from `t2`, and then use these both to run 10000 simulations and so forth. All this is done in one passing through the graph and hence the overall number of function calls is reduced 10000-fold. However, this does not mean that batches should be as big as possible, since you may run out of memory, the fraction of time spent in function call overhead becomes insignificant, and many algorithms operate in multiples of `batch_size`. Furthermore, the `batch_size` is a crucial element for efficient parallelization (see the notebook on parallelization). @@ -372,19 +371,19 @@ visualization on so that if you run this on a notebook you will see the posterior forming from a prior distribution. In this case most of the time is spent in drawing. -.. code:: ipython3 +.. code:: python N = 1000 vis = dict(xlim=[-2,2], ylim=[-1,1]) # You can give the sample method a `vis` keyword to see an animation how the prior transforms towards the - # posterior with a decreasing threshold (interactive visualization will slow it down a bit though). + # posterior with a decreasing threshold. %time result = rej.sample(N, quantile=0.01, vis=vis) -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_42_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_42_0.png @@ -395,8 +394,8 @@ time is spent in drawing. .. parsed-literal:: - CPU times: user 2.2 s, sys: 182 ms, total: 2.38 s - Wall time: 2.39 s + CPU times: user 2.59 s, sys: 100 ms, total: 2.69 s + Wall time: 2.68 s The ``sample`` method returns a ``Sample`` object, which contains @@ -407,7 +406,7 @@ in the model). For rejection sampling, other attributes include e.g. the ``threshold``, which is the threshold value resulting in the requested quantile. -.. code:: ipython3 +.. code:: python result.samples['t1'].mean() @@ -422,18 +421,18 @@ quantile. The ``Sample`` object includes a convenient ``summary`` method: -.. code:: ipython3 +.. code:: python - result.summary + result.summary() .. 
parsed-literal:: Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 100000 Threshold: 0.117 - Posterior means: t1: 0.556, t2: 0.219 + Sample means: t1: 0.556, t2: 0.219 Rejection sampling can also be performed with using a threshold or total @@ -442,7 +441,7 @@ draws from the prior for which the generated distance is below the threshold will be accepted as samples. Note that the simulator will run as long as it takes to generate the requested number of samples. -.. code:: ipython3 +.. code:: python %time result2 = rej.sample(N, threshold=0.2) .. parsed-literal:: - CPU times: user 215 ms, sys: 51.8 ms, total: 267 ms - Wall time: 269 ms + CPU times: user 248 ms, sys: 12 ms, total: 260 ms + Wall time: 255 ms Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 40000 Threshold: 0.185 - Posterior means: t1: 0.555, t2: 0.223 + Sample means: t1: 0.555, t2: 0.223 + + +Iterative advancing +------------------- + +Often it may not be practical to wait until the end before investigating +the results. There may be time constraints or one may wish to check the +results at certain intervals. For this, ELFI provides an iterative +approach to advance the inference. First one sets the objective of the +inference and then calls the ``iterate`` method. + +Below is an example of how to run the inference until the objective has +been reached or a maximum of one second of time has been used. + +.. code:: python + + # Request for 1M simulations. + rej.set_objective(1000, n_sim=1000000) + # We only have 1 sec of time and we are unsure if we will be finished by that time. + # So let's simulate as many as we can. + + time0 = time.time() + time1 = time0 + 1 + while not rej.finished and time.time() < time1: + rej.iterate() + # One could investigate the rej.state or rej.extract_result() here + # to make more complicated stopping criteria + + # Extract and print the result as it stands. It will show us how many simulations were generated. + print(rej.extract_result()) + + +.. parsed-literal:: + + Method: Rejection + Number of samples: 1000 + Number of simulations: 150000 + Threshold: 0.0968 + Sample means: t1: 0.561, t2: 0.217 + + + +.. code:: python + + # We will see that it was not finished in 1 sec + rej.finished + + + + +.. parsed-literal:: + + False + + +We could continue from this stage simply by calling the +``iterate`` method again. The ``extract_result`` method will always give a proper +result even if the objective was not reached. + +Next we will look into how to store all the data that was generated so +far. This allows us to e.g. save the data to disk and continue the next +day, or modify the model and reuse some of the earlier data if +applicable. Storing simulated data ---------------------- @@ -471,7 +534,7 @@ storing all outputs of any node in the model (not just the accepted samples). Let's save all outputs for ``t1``, ``t2``, ``S1`` and ``S2`` in our model: -.. code:: ipython3 +.. code:: python pool = elfi.OutputPool(['t1', 't2', 'S1', 'S2']) rej = elfi.Rejection(d, pool=pool) .. parsed-literal:: - CPU times: user 6.14 s, sys: 102 ms, total: 6.24 s - Wall time: 6.38 s + CPU times: user 9.95 s, sys: 0 ns, total: 9.95 s + Wall time: 9.95 s ..
parsed-literal:: Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 1000000 - Threshold: 0.036 - Posterior means: t1: 0.56, t2: 0.227 + Threshold: 0.0362 + Sample means: t1: 0.554, t2: 0.233 @@ -503,7 +566,7 @@ to resimulate them. Above we saved the summaries to the pool, so we can change the distance node of the model without having to resimulate anything. Let's do that. -.. code:: ipython3 +.. code:: python # Replace the current distance with a cityblock (manhattan) distance and recreate the inference d.become(elfi.Distance('cityblock', S1, S2, p=1)) @@ -515,8 +578,8 @@ anything. Let's do that. .. parsed-literal:: - CPU times: user 848 ms, sys: 12.1 ms, total: 860 ms - Wall time: 895 ms + CPU times: user 700 ms, sys: 0 ns, total: 700 ms + Wall time: 699 ms @@ -524,10 +587,10 @@ anything. Let's do that. .. parsed-literal:: Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 1000000 - Threshold: 0.0447 - Posterior means: t1: 0.56, t2: 0.227 + Threshold: 0.0453 + Sample means: t1: 0.555, t2: 0.235 @@ -537,7 +600,7 @@ considered simulations stayed the same. We can also continue the inference by increasing the total number of simulations and only have to simulate the new ones: -.. code:: ipython3 +.. code:: python %time result5 = rej.sample(N, n_sim=1200000) result5 @@ -545,8 +608,8 @@ simulations and only have to simulate the new ones: .. parsed-literal:: - CPU times: user 1.96 s, sys: 29.4 ms, total: 1.99 s - Wall time: 2.02 s + CPU times: user 2.07 s, sys: 16 ms, total: 2.09 s + Wall time: 2.09 s @@ -554,54 +617,51 @@ simulations and only have to simulate the new ones: .. parsed-literal:: Method: Rejection - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 1200000 - Threshold: 0.0409 - Posterior means: t1: 0.56, t2: 0.23 + Threshold: 0.0421 + Sample means: t1: 0.554, t2: 0.239 Above the results were saved into a python dictionary. If you store a -lot of data to dictionaries, you will eventually run out of memory. -Instead you can save the outputs to standard numpy .npy files: +lot of data to dictionaries, you will eventually run out of memory. ELFI +provides an alternative pool that, by default, saves the outputs to +standard numpy .npy files: -.. code:: ipython3 +.. code:: python - arraypool = elfi.store.ArrayPool(['t1', 't2', 'Y', 'd'], basepath='./output') + arraypool = elfi.ArrayPool(['t1', 't2', 'Y', 'd']) rej = elfi.Rejection(d, pool=arraypool) %time result5 = rej.sample(100, threshold=0.3) .. parsed-literal:: - CPU times: user 25.6 ms, sys: 2.58 ms, total: 28.2 ms - Wall time: 29.3 ms + CPU times: user 40 ms, sys: 0 ns, total: 40 ms + Wall time: 37.1 ms This stores the simulated data in binary ``npy`` format under ``arraypool.path``, and can be loaded with ``np.load``. -.. code:: ipython3 +.. code:: python - # Let's flush the outputs to disk (alternatively you can close the pool) so that we can read them - # while we still have the arraypool open. + # Let's flush the outputs to disk (alternatively you can save or close the pool) so that we can read the .npy files. arraypool.flush() import os - os.listdir(arraypool.path) - - + print('Files in', arraypool.path, 'are', os.listdir(arraypool.path)) .. parsed-literal:: - ['d.npy', 't1.npy', 't2.npy', 'Y.npy'] - + Files in pools/arraypool_3615052699 are ['Y.npy', 't2.npy', 'd.npy', 't1.npy'] Now lets load all the parameters ``t1`` that were generated with numpy: -.. code:: ipython3 +.. 
code:: python

    np.load(arraypool.path + '/t1.npy')




.. parsed-literal::

-    array([-0.51, 0.09, 0.72, ..., -1.23, 0.02, -0.66])
+    array([-0.82, -0.03, 0.27, ..., 1.03, 0.44, -0.56])
+
+
+
+We can also close (or save) the whole pool if we wish to continue later:
+
+.. code:: python
+
+    arraypool.close()
+    name = arraypool.name
+    print(name)
+
+
+.. parsed-literal::
+
+    arraypool_3615052699
+
+
+And open it up later to continue where we left off. We can open it
+using its name:
+
+.. code:: python
+
+    arraypool = elfi.ArrayPool.open(name)
+    print('This pool has', len(arraypool), 'batches')
+
+    # This would give the contents of the first batch
+    # arraypool[0]
+
+
+.. parsed-literal::
+
+    This pool has 3 batches
+

You can delete the files with:

-.. code:: ipython3
+.. code:: python

    arraypool.delete()

    # verify the deletion
    try:
        os.listdir(arraypool.path)

    except FileNotFoundError:
-        print("No such file or directory")
+        print("The directory is removed")


.. parsed-literal::

-    No such file or directory
+    The directory is removed


Visualizing the results
-----------------------
@@ -642,24 +733,24 @@ are convenience methods to plotting functions defined under

For example one can plot the marginal distributions:

-.. code:: ipython3
+.. code:: python

    result.plot_marginals();



-.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_66_0.png
+.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_74_0.png


Often "pairwise relationships" are more informative:

-.. code:: ipython3
+.. code:: python

    result.plot_pairs();



-.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_68_0.png
+.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_76_0.png


Note that if working in a non-interactive environment, you can use e.g.

@@ -681,7 +772,7 @@ used custom priors, so we have to specify a ``pdf`` function by
ourselves. If we used standard priors, this step would not be needed.
Let's modify the prior distribution classes:

-.. code:: ipython3
+.. code:: python

    # define prior for t1 as in Marin et al., 2012 with t1 in range [-b, b]
    class CustomPrior_t1(elfi.Distribution):
@@ -722,7 +813,7 @@ Run SMC ABC

In ELFI, one can setup a SMC ABC sampler just like the Rejection
sampler:

-.. code:: ipython3
+.. code:: python

    smc = elfi.SMC(d, batch_size=10000, seed=seed)

For sampling, one has to define the number of output samples, the
number of populations and a *schedule* i.e. a list of quantiles to use
for each population. In essence, a population is just refined rejection
sampling.

-.. code:: ipython3
+.. code:: python

    N = 1000
    schedule = [0.7, 0.2, 0.05]
@@ -739,87 +830,100 @@ population. In essence, a population is just refined rejection sampling.
    %time result_smc = smc.sample(N, schedule)


.. parsed-literal::

-    INFO:elfi.methods.parameter_inference:---------------- Starting round 0 ----------------
-    INFO:elfi.methods.parameter_inference:---------------- Starting round 1 ----------------
-    INFO:elfi.methods.parameter_inference:---------------- Starting round 2 ----------------
-
-
-.. parsed-literal::
-
-    CPU times: user 1.36 s, sys: 241 ms, total: 1.6 s
-    Wall time: 1.62 s
+    CPU times: user 6.46 s, sys: 132 ms, total: 6.59 s
+    Wall time: 1.83 s


We can have summaries and plots of the results just like above:

-.. code:: ipython3
+.. code:: python

-    result_smc.summary
+    result_smc.summary(all=True)


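+The output of the ``summary`` call above follows below. As a related
+aside, the per-population samples can also be inspected directly; here is
+a minimal sketch using only the ``populations`` and ``samples``
+attributes that the custom plotting example later in this tutorial relies
+on. Note that these are plain, unweighted means, so they may differ
+slightly from the means reported in the summaries:
+
+.. code:: python
+
+    import numpy as np
+
+    # Unweighted means of each SMC population, computed directly from the
+    # stored sample arrays.
+    for i, pop in enumerate(result_smc.populations):
+        print('Population {}: t1 {:.3f}, t2 {:.3f}'.format(
+            i, np.mean(pop.samples['t1']), np.mean(pop.samples['t2'])))
+
..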
parsed-literal:: Method: SMC - Number of posterior samples: 1000 + Number of samples: 1000 Number of simulations: 190000 - Threshold: 0.0492 - Posterior means for final population: t1: 0.552, t2: 0.205 + Threshold: 0.0491 + Sample means: t1: 0.556, t2: 0.22 + + Population 0: + Method: Rejection within SMC-ABC + Number of samples: 1000 + Number of simulations: 10000 + Threshold: 0.488 + Sample means: t1: 0.547, t2: 0.232 + + Population 1: + Method: Rejection within SMC-ABC + Number of samples: 1000 + Number of simulations: 20000 + Threshold: 0.185 + Sample means: t1: 0.556, t2: 0.236 + + Population 2: + Method: Rejection within SMC-ABC + Number of samples: 1000 + Number of simulations: 160000 + Threshold: 0.0491 + Sample means: t1: 0.556, t2: 0.22 -The ``Sample`` object returned by the SMC-ABC sampling contains also -some methods for investigating the evolution of populations, e.g.: +Or just the means: -.. code:: ipython3 +.. code:: python - result_smc.posterior_means_all_populations + result_smc.sample_means_summary(all=True) .. parsed-literal:: - Posterior means for population 0: t1: 0.547, t2: 0.232 - Posterior means for population 1: t1: 0.559, t2: 0.23 - Posterior means for population 2: t1: 0.552, t2: 0.205 + Sample means for population 0: t1: 0.547, t2: 0.232 + Sample means for population 1: t1: 0.556, t2: 0.236 + Sample means for population 2: t1: 0.556, t2: 0.22 -.. code:: ipython3 +.. code:: python - result_smc.plot_marginals_all_populations(bins=25, figsize=(8, 2), fontsize=12) + result_smc.plot_marginals(all=True, bins=25, figsize=(8, 2), fontsize=12) -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_81_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_89_0.png -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_81_1.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_89_1.png -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_81_2.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_89_2.png Obviously one still has direct access to the samples as well, which allows custom plotting: -.. code:: ipython3 +.. code:: python n_populations = len(schedule) fig, ax = plt.subplots(ncols=n_populations, sharex=True, sharey=True, figsize=(16,6)) - samples = [pop.samples_list for pop in result_smc.populations] - for ii in range(n_populations): - s = samples[ii] - ax[ii].scatter(s[0], s[1], s=5, edgecolor='none'); - ax[ii].set_title("Population {}".format(ii)); - ax[ii].plot([0, 2, -2, 0], [-1, 1, 1, -1], 'b') - ax[0].set_xlabel(result_smc.names_list[0]); - ax[0].set_ylabel(result_smc.names_list[1]); + + for i, pop in enumerate(result_smc.populations): + s = pop.samples + ax[i].scatter(s['t1'], s['t2'], s=5, edgecolor='none'); + ax[i].set_title("Population {}".format(i)); + ax[i].plot([0, 2, -2, 0], [-1, 1, 1, -1], 'b') + ax[i].set_xlabel('t1'); + ax[0].set_ylabel('t2'); ax[0].set_xlim([-2, 2]) ax[0].set_ylim([-1, 1]); -.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6/usage/tutorial_files/tutorial_83_0.png +.. image:: http://research.cs.aalto.fi/pml/software/elfi/docs/0.6.1/usage/tutorial_files/tutorial_91_0.png It can be seen that the populations iteratively concentrate more and @@ -827,11 +931,8 @@ more around the true parameter values. 
Note that for the later populations some of the samples lie outside allowed region. This is due to the SMC algorithm sampling near previous -samples, with *near* meaning a Gaussian distribution centered around -previous samples with variance as twice the weighted empirical variance. -However, the outliers carry zero weight, and have no effect on the -estimates. +samples. However, the outliers carry zero weight and have no effect on +the estimates. That's it! See the other documentation for more advanced topics on e.g. BOLFI, external simulators and parallelization. - diff --git a/elfi/__init__.py b/elfi/__init__.py index e7dbea7f..cd5bdf29 100644 --- a/elfi/__init__.py +++ b/elfi/__init__.py @@ -17,4 +17,4 @@ __email__ = 'elfi-support@hiit.fi' # make sure __version_ is on the last non-empty line (read by setup.py) -__version__ = '0.6.0' +__version__ = '0.6.1'