Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization #92

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,033 changes: 1,033 additions & 0 deletions notebooks/0.2-stijnvanhoey-refactor-optimization.ipynb

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions src/covid19model/data/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,27 +54,30 @@ def get_COVID19_SEIRD_parameters():
-----------
parameters = get_COVID19_SEIRD_parameters()
"""
abs_dir = os.path.dirname(__file__)
rel_dir = os.path.join(abs_dir, "../../../data/raw/model_parameters")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you do a merge of master, as I think this already changed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I did update this morning and merged with the adjustments Jenna did earlier.


# Initialize parameters dictionary
parameters = {}

# Assign Nc_total from the Polymod study to the parameters dictionary
initN, Nc_home, Nc_work, Nc_schools, Nc_transport, Nc_leisure, Nc_others, Nc_total = polymod.get_interaction_matrices()
parameters['Nc'] = Nc_total

# Verity_etal
df = pd.read_csv("../../data/raw/model_parameters/verity_etal.csv", sep=',',header='infer')
df = pd.read_csv(os.path.join(rel_dir, "verity_etal.csv"), sep=',',header='infer')
parameters['h'] = np.array(df.loc[:,'symptomatic_hospitalized'].astype(float).tolist())/100
parameters['icu'] = np.array(df.loc[:,'hospitalized_ICU'].astype(float).tolist())/100
parameters['c'] = 1-parameters['icu']
parameters['m0'] = np.array(df.loc[:,'CFR'].astype(float).tolist())/100/parameters['h']/parameters['icu']

# Wu_etal
df_asymp = pd.read_csv("../../data/raw/model_parameters/wu_etal.csv", sep=',',header='infer')
# Abrams_etal
df_asymp = pd.read_csv(os.path.join(rel_dir, "wu_etal.csv"), sep=',',header='infer')
parameters['a'] = np.array(df_asymp.loc[:,'fraction asymptomatic'].astype(float).tolist())
parameters['m'] = 1-parameters['a']

# Other parameters
df_other_pars = pd.read_csv("../../data/raw/model_parameters/others.csv", sep=',',header='infer')
df_other_pars = pd.read_csv(os.path.join(rel_dir, "others.csv"), sep=',',header='infer')
parameters.update(df_other_pars.T.to_dict()[0])

# Fitted parameters
Expand Down
28 changes: 15 additions & 13 deletions src/covid19model/data/polymod.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np

def get_interaction_matrices():
"""Extract interaction matrices and demographic data from `data/raw/polymod` folder
"""Extract interaction matrices and demographic data from `data/raw/polymod` folder

This function returns the total number of individuals in ten year age bins in the Belgian population and the interaction matrices Nc at home, at work, in schools, on public transport, during leisure activities and during other activities.

Expand All @@ -15,15 +15,15 @@ def get_interaction_matrices():
Nc_home : np.array (9x9)
number of daily contacts at home of individuals in age group X with individuals in age group Y
Nc_work : np.array (9x9)
number of daily contacts in the workplace of individuals in age group X with individuals in age group Y
number of daily contacts in the workplace of individuals in age group X with individuals in age group Y
Nc_schools : np.array (9x9)
number of daily contacts in schools of individuals in age group X with individuals in age group Y
number of daily contacts in schools of individuals in age group X with individuals in age group Y
Nc_transport : np.array (9x9)
number of daily contacts on public transport of individuals in age group X with individuals in age group Y
number of daily contacts on public transport of individuals in age group X with individuals in age group Y
Nc_leisure : np.array (9x9)
number of daily contacts during leisure activities of individuals in age group X with individuals in age group Y
Nc_others : np.array (9x9)
number of daily contacts in other places of individuals in age group X with individuals in age group Y
number of daily contacts in other places of individuals in age group X with individuals in age group Y
Nc_total : np.array (9x9)
total number of daily contacts of individuals in age group X with individuals in age group Y, calculated as the sum of all the above interaction

Expand All @@ -36,15 +36,17 @@ def get_interaction_matrices():
-----------
initN, Nc_home, Nc_work, Nc_schools, Nc_transport, Nc_leisure, Nc_others, Nc_total = get_interaction_matrices()
"""
abs_dir = os.path.dirname(__file__)
rel_dir = os.path.join(abs_dir, "../../../data/raw/polymod/interaction_matrices/Belgium")

# Data source
Nc_home = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELhome.txt", dtype='f', delimiter='\t')
Nc_work = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELwork.txt", dtype='f', delimiter='\t')
Nc_schools = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELschools.txt", dtype='f', delimiter='\t')
Nc_transport = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELtransport.txt", dtype='f', delimiter='\t')
Nc_leisure = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELleisure.txt", dtype='f', delimiter='\t')
Nc_others = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELothers.txt", dtype='f', delimiter='\t')
Nc_total = np.loadtxt("../../data/raw/polymod/interaction_matrices/Belgium/BELtotal.txt", dtype='f', delimiter='\t')
initN = np.loadtxt("../../data/raw/polymod/demographic/BELagedist_10year.txt", dtype='f', delimiter='\t')
Nc_home = np.loadtxt(os.path.join(rel_dir, "BELhome.txt"), dtype='f', delimiter='\t')
Nc_work = np.loadtxt(os.path.join(rel_dir, "BELwork.txt"), dtype='f', delimiter='\t')
Nc_schools = np.loadtxt(os.path.join(rel_dir, "BELschools.txt"), dtype='f', delimiter='\t')
Nc_transport = np.loadtxt(os.path.join(rel_dir, "BELtransport.txt"), dtype='f', delimiter='\t')
Nc_leisure = np.loadtxt(os.path.join(rel_dir, "BELleisure.txt"), dtype='f', delimiter='\t')
Nc_others = np.loadtxt(os.path.join(rel_dir, "BELothers.txt"), dtype='f', delimiter='\t')
Nc_total = np.loadtxt(os.path.join(rel_dir, "BELtotal.txt"), dtype='f', delimiter='\t')
initN = np.loadtxt(os.path.join(abs_dir, "../../../data/raw/polymod/demographic/BELagedist_10year.txt"), dtype='f', delimiter='\t')

return initN, Nc_home, Nc_work, Nc_schools, Nc_transport, Nc_leisure, Nc_others, Nc_total
77 changes: 74 additions & 3 deletions src/covid19model/optimization/objective_fcns.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,76 @@
import numpy as np


def sse(theta, model, data, model_parameters, lag_time=None):
"""Calculate the sum of squared errors from data and model instance

The function assumes the number of values in the theta array corresponds
to the number of defined parameters, but can be extended with a lag_time
parameters to make this adjustable as well by an optimization algorithm.


Parameters
----------
theta : np.array
Array with N (if lag_time is defined) or N+1 (if lag_time is not
defined) values.
model : covidmodel.model instance
Covid model instance
data : xarray.DataArray
Xarray DataArray with time as a dimension and the name corresponding
to an existing model output state.
model_parameters : list of str
Parameter names of the model parameters to adjust in order to get the
SSE.
lag_time : int
Warming up period before comparing the data with the model output.
e.g. if 40; comparison of data only starts after 40 days.


Notes
-----
Assumes daily time step(!) # TODO: need to generalize this

Examples
--------
>>> data_to_fit = xr.DataArray([ 54, 79, 100, 131, 165, 228, 290],
coords={"time": range(1, 8)}, name="ICU",
dims=['time'])
>>> sse([1.6, 0.025, 42], sir_model, data_to_fit, ['sigma', 'beta'], lag_time=None)
>>> # previous is equivalent to
>>> sse([1.6, 0.025], sir_model, data_to_fit, ['sigma', 'beta'], lag_time=42)
>>> # but the latter will use a fixed lag_time,
>>> # whereas the first one can be used inside optimization
"""

if data.name not in model.state_names:
raise Exception("Data variable to fit is not available as model output")

# define new parameters in model
for i, parameter in enumerate(model_parameters):
model.parameters.update({parameter: theta[i]})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering if we should introduce a concept of "cloning" a model, to avoid changing the original one when setting parameters like this for an optimization run

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I propose to define this in a separate issue/PR to provide a more generic solution.


# extract additional parameter # TODO - check alternatives for generalisation here
if not lag_time:
lag_time = int(round(theta[-1]))
else:
lag_time = int(round(lag_time))

# run model
time = len(data.time) + lag_time # at least this length
output = model.sim(time)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could also run with [-lag_time, time] ? (or does the model not work with negative time?)

Copy link
Collaborator

@twallema twallema Jun 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The solve_ivp from SciPy requires the user to define the times at which the model output should be returned, they can be negative without any problem. So assigning the time as [ -lag_time, time] in the model output xarray, and assigning the time as [0 1 2 ... 7] in the data xarray should allow for pairing without using values as suggested below. However, this does not really abstract the lag_time. I'm also wondering if this approach will work if the times do not match, f.i. when using a discrete-time model with step-length 0.235, some form of extrapolation will be necessary to match outputs. I've seen xarray can be linked to SciPy interpolate: see http://xarray.pydata.org/en/stable/generated/xarray.DataArray.interp.html

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default will be to only align those values where the time values match exactly, but you can indeed make this more flexible (eg http://xarray.pydata.org/en/stable/generated/xarray.DataArray.reindex.html also allows a simple "nearest" or a certain tolerance that should be allowed)


# extract the variable of interest
subset_output = output.sum(dim="stratification")[data.name]
# adjust to fix lag time to extract start of comparison
output_to_compare = subset_output.sel(time=slice(lag_time, lag_time - 1 + len(data)))

# calculate sse -> we could maybe enable more function options on this level?
sse = np.sum((data.values - output_to_compare.values)**2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check that the "time" actually matched? (with the provided data) So if you adjust the time of the model output with lag_time, this calculation could be done with the xarray object (so without .values), and then this alignment on the time dimension will be done automatically


return sse


def SSE(thetas,BaseModel,data,states,parNames,weights,checkpoints=None):

"""
Expand Down Expand Up @@ -30,7 +101,7 @@ def SSE(thetas,BaseModel,data,states,parNames,weights,checkpoints=None):
-----------
SSE = SSE(model,thetas,data,parNames,positions,weights)
"""

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# assign estimates to correct variable
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -46,7 +117,7 @@ def SSE(thetas,BaseModel,data,states,parNames,weights,checkpoints=None):
# Run simulation
# ~~~~~~~~~~~~~~
# number of dataseries
n = len(data)
n = len(data)
# Compute simulation time
data_length =[]
for i in range(n):
Expand Down Expand Up @@ -129,7 +200,7 @@ def MLE(thetas,BaseModel,data,states,parNames,checkpoints=None):
# Run simulation
# ~~~~~~~~~~~~~~
# number of dataseries
n = len(data)
n = len(data)
# Compute simulation time
data_length =[]
for i in range(n):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,59 @@
from covid19model.optimization import objective_fcns
from covid19model.optimization import pso


def fit(func, model, data, model_parameters, bounds,
disp=True, maxiter=30, popsize=10):
"""Fit a model using a defined SSE function

Parameters
----------
func : objective function
The arguments ``model``, ``data``
and ``model_parameters`` should be the
other arguments of the optimization func
model : covidmodel.model instance
Covid19model instance
data : xarray.DataArray
Xarray DataArray with time as a dimension and the name corresponding
to an existing model output state.
model_parameters : list of str
Parameter names of the model parameters to adjust in order to get the
SSE.
bounds : list of (min, max)
For each parameter (model parameter or others), define the min and
max boundaries as tuples. If func has non-model parameters to
optimize as well, len(bounds) > len(model_parameters).
disp: boolean
display the pso output stream
maxiter: float or int
maximum number of pso iterations
popsize: float or int
population size of particle swarm
increasing this variable lowers the chance of finding local minima but slows down calculations
"""
# TODO - add checks on inputs,...

original_parameters = model.parameters.copy()

(theta_hat, obj_fun_val, pars_final_swarm,
obj_fun_val_final_swarm) = pso.optim(func,
bounds,
args=(model, data, model_parameters),
swarmsize=popsize,
maxiter=maxiter,
processes=mp.cpu_count()-1,
minfunc=1e-9,
minstep=1e-9,
debug=True,
particle_output=True)

# reset the model parameters to original values
model.parameters = original_parameters

return theta_hat


def fit_pso(BaseModel,data,parNames,states,bounds,checkpoints=None,disp=True,maxiter=30,popsize=10):
"""
A function to compute the mimimum of the absolute value of the maximum likelihood estimator using a particle swarm optimization
Expand All @@ -12,7 +65,7 @@ def fit_pso(BaseModel,data,parNames,states,bounds,checkpoints=None,disp=True,max
BaseModel: model object
correctly initialised model to be fitted to the dataset
data: array
list containing dataseries
list containing dataseries
parNames: array
list containing the names of the parameters to be fitted
states: array
Expand All @@ -38,7 +91,7 @@ def fit_pso(BaseModel,data,parNames,states,bounds,checkpoints=None,disp=True,max

Notes
-----------
Use all available cores minus one by default (optimal number of processors for 2-,4- or 6-core PC's with an OS).
Use all available cores minus one by default (optimal number of processors for 2-,4- or 6-core PC's with an OS).

Example use
-----------
Expand Down