diff --git a/R/py_int/covid_run.R b/R/py_int/covid_run.R index 3233a47c..32ba4535 100644 --- a/R/py_int/covid_run.R +++ b/R/py_int/covid_run.R @@ -47,6 +47,11 @@ run_status <- function(pop, asymp_rate = 0.7, output_switch = TRUE, rank_assign = FALSE, + local_outbreak_timestep = 0, + local_outbreak = FALSE, + msoa_infect="E02004152", + number_people_local=100, + local_prob_increase=0.75, overweight_sympt_mplier = 1.46, overweight = 1, obesity_30 = 1, @@ -127,6 +132,15 @@ run_status <- function(pop, print("probabilities calculated") + if(local_outbreak == TRUE & timestep == local_outbreak_timestep){ + print("Local outbreak - super spreader event!") + df_prob <- local_outbreak(df=df_prob, + msoa_infect=msoa_infect, + number_people=number_people_local, + risk_prob=local_prob_increase) + } + + if(timestep > 1){ df_ass <- case_assign(df = df_prob, tmp.dir=tmp.dir, diff --git a/R/py_int/data/gam_cases.rda b/R/py_int/data/gam_cases.rda new file mode 100644 index 00000000..c5bf22e0 Binary files /dev/null and b/R/py_int/data/gam_cases.rda differ diff --git a/R/py_int/data/msoas.rda b/R/py_int/data/msoas.rda new file mode 100644 index 00000000..ae135b13 Binary files /dev/null and b/R/py_int/data/msoas.rda differ diff --git a/build/lib/microsim/QUANTRampAPI2.py b/build/lib/microsim/QUANTRampAPI2.py new file mode 100644 index 00000000..7f3b5f4f --- /dev/null +++ b/build/lib/microsim/QUANTRampAPI2.py @@ -0,0 +1,356 @@ +""" +Public interface to the data. +Contains accessor functions for getting probabilities of trips from MSOA or IZ origins to +primary schools, secondary schools and retail locations. +Further information is contained in each function description. +Changes have been made to read in the data in a format suitable for RAMP UA. +""" + +import pandas as pd +import pickle +import os +import sys +import numpy as np + +################################################################################ +# Utilities +################################################################################ + + +""" +Load a numpy matrix from a file +""" +def loadMatrix(filename): + with open(filename,'rb') as f: + matrix = pickle.load(f) + return matrix + +################################################################################ +# Globals +################################################################################ + +os.chdir("..") +quant_dir = 'data/QUANT_RAMP/model-runs/' +dfPrimaryPopulation = pd.read_csv(os.path.join(quant_dir,'primaryPopulation.csv')) +dfPrimaryZones = pd.read_csv(os.path.join(quant_dir,'primaryZones.csv')) +primary_probPij = loadMatrix(os.path.join(quant_dir,'primaryProbPij.bin')) +dfSecondaryPopulation = pd.read_csv(os.path.join(quant_dir,'secondaryPopulation.csv')) +dfSecondaryZones = pd.read_csv(os.path.join(quant_dir,'secondaryZones.csv')) +secondary_probPij = loadMatrix(os.path.join(quant_dir,'secondaryProbPij.bin')) +dfRetailPointsPopulation = pd.read_csv(os.path.join(quant_dir,'retailpointsPopulation.csv')) +dfRetailPointsZones = pd.read_csv(os.path.join(quant_dir,'retailpointsZones.csv')) +retailpoints_probSij = loadMatrix(os.path.join(quant_dir,'retailpointsProbSij.bin')) +dfHospitalPopulation = pd.read_csv(os.path.join(quant_dir,'hospitalPopulation.csv')) +dfHospitalZones = pd.read_csv(os.path.join(quant_dir,'hospitalZones.csv')) +hospital_probHij = loadMatrix(os.path.join(quant_dir,'hospitalProbHij.bin')) + + +################################################################################ +# Interface +################################################################################ + + +"""
+getProbablePrimarySchoolsByMSOAIZ +Given an MSOA area code (England and Wales) or an Intermediate Zone (IZ) 2001 code (Scotland), return +a list of all the surrounding primary schools whose probability of being visited by the MSOA_IZ is +greater than or equal to the threshold. +School ids are taken from the Edubase list of URNs +NOTE: code identical to the secondary school version, only with switched lookup tables +@param msoa_iz An MSOA code (England/Wales e.g. E02000001) or an IZ2001 code (Scotland e.g. S02000001) +@param threshold Probability threshold e.g. 0.5 means return all possible schools with probability>=0.5 +@returns a list of probabilities in the same order as the venues +""" +def getProbablePrimarySchoolsByMSOAIZ(msoa_iz,threshold): + result = [] + zonei = int(dfPrimaryPopulation.loc[dfPrimaryPopulation['msoaiz'] == msoa_iz,'zonei']) + m,n = primary_probPij.shape + for j in range(n): + p = primary_probPij[zonei,j] + if p>=threshold: + row2 = dfPrimaryZones.loc[dfPrimaryZones['zonei'] == j] #yes, zonei==j is correct, they're always called 'zonei' + id = row2['URN'].values[0] + result.append(p) + #end if + #end for + return result + +################################################################################ + +""" +getProbableSecondarySchoolsByMSOAIZ +Given an MSOA area code (England and Wales) or an Intermediate Zone (IZ) 2001 code (Scotland), return +a list of all the surrounding secondary schools whose probability of being visited by the MSOA_IZ is +greater than or equal to the threshold. +School ids are taken from the Edubase list of URNs +NOTE: code identical to the primary school version, only with switched lookup tables +@param msoa_iz An MSOA code (England/Wales e.g. E02000001) or an IZ2001 code (Scotland e.g. S02000001) +@param threshold Probability threshold e.g. 0.5 means return all possible schools with probability>=0.5 +@returns a list of probabilities in the same order as the venues +""" +def getProbableSecondarySchoolsByMSOAIZ(msoa_iz,threshold): + result = [] + zonei = int(dfSecondaryPopulation.loc[dfSecondaryPopulation['msoaiz'] == msoa_iz, 'zonei']) + m,n = secondary_probPij.shape + for j in range(n): + p = secondary_probPij[zonei,j] + if p>=threshold: + row2 = dfSecondaryZones.loc[dfSecondaryZones['zonei'] == j] #yes, zonei==j is correct, they're always called 'zonei' + id = row2['URN'].values[0] + result.append(p) + #end if + #end for + return result + +################################################################################ + +""" +getProbableRetailByMSOAIZ +Given an MSOA area code (England and Wales) or an Intermediate Zone (IZ) 2001 code (Scotland), return +a list of all the surrounding retail points whose probability of being visited by the MSOA_IZ is +greater than or equal to the threshold. +Retail ids are from ???? +@param msoa_iz An MSOA code (England/Wales e.g. E02000001) or an IZ2001 code (Scotland e.g. S02000001) +@param threshold Probability threshold e.g.
0.5 means return all possible retail points with probability>=0.5 +@returns a list of probabilities in the same order as the venues +""" +def getProbableRetailByMSOAIZ(msoa_iz,threshold): + result = [] + zonei = int(dfRetailPointsPopulation.loc[dfRetailPointsPopulation['msoaiz'] == msoa_iz, 'zonei']) + m,n = retailpoints_probSij.shape + for j in range(n): + p = retailpoints_probSij[zonei,j] + if p>=threshold: + row2 = dfRetailPointsZones.loc[dfRetailPointsZones['zonei'] == j] #yes, zonei==j is correct, they're always called 'zonei' + id = row2['id'].values[0] + result.append(p) + #end if + #end for + return result + +################################################################################ + +""" +getProbableHospitalByMSOAIZ +Given an MSOA area code (England and Wales) or an Intermediate Zone (IZ) 2001 code (Scotland), return +a list of all the surrounding hospitals whose probability of being visited by the MSOA_IZ is +greater than or equal to the threshold. +Hospital ids are taken from the NHS England export of "location" - see hospitalZones for ids and names (and east/north) +NOTE: code identical to the primary school version, only with switched lookup tables +@param msoa_iz An MSOA code (England/Wales e.g. E02000001) or an IZ2001 code (Scotland e.g. S02000001) +@param threshold Probability threshold e.g. 0.5 means return all possible hospital points with probability>=0.5 +@returns a list of probabilities in the same order as the venues +""" + +def getProbableHospitalByMSOAIZ(msoa_iz,threshold): + result = [] + zonei = int(dfHospitalPopulation.loc[dfHospitalPopulation['msoaiz'] == msoa_iz, 'zonei']) + m,n = hospital_probHij.shape + for j in range(n): + p = hospital_probHij[zonei,j] + if p>=threshold: + row2 = dfHospitalZones.loc[dfHospitalZones['zonei'] == j] #yes, zonei==j is correct, they're always called 'zonei' + id = row2['id'].values[0] + result.append(p) + #end if + #end for + return result + +################################################################################ + + + +################################################################################ +# Prepare RAMP UA compatible data +################################################################################ + +def get_flows(venue, msoa_list, threshold, thresholdtype): + + # get all probabilities so they sum to at least threshold value + dic = {} # appending to dictionary is faster than dataframe + for m in msoa_list: + print(m) + # get all probabilities for this MSOA (threshold set to 0) + if venue == "PrimarySchool": + result_tmp = getProbablePrimarySchoolsByMSOAIZ(m,0) + elif venue == "SecondarySchool": + result_tmp = getProbableSecondarySchoolsByMSOAIZ(m,0) + elif venue == "Retail": + result_tmp = getProbableRetailByMSOAIZ(m,0) + else: + sys.exit("unknown venue type") + # keep only values that sum to at least the specified threshold + sort_index = np.argsort(result_tmp) # index from lowest to highest value + result = [0.0] * len(result_tmp) # initialise + i = len(result_tmp)-1 # start with last of sorted (highest prob) + if thresholdtype == "prob": + sum_p = 0 # initialise + while sum_p < threshold: + result[sort_index[i]] = result_tmp[sort_index[i]] + sum_p = sum_p + result_tmp[sort_index[i]] + #print(sum_p) + i = i - 1 + elif thresholdtype == "nr": + for t in range(0,threshold): + result[sort_index[i]] = result_tmp[sort_index[i]] + i = i - 1 + else: + sys.exit("unknown threshold type") + dic[m] = result + + # now turn this into a dataframe
with the right columns etc compatible with _flows variable + nr_venues = len(dic[msoa_list[0]]) + col_names = [] + for n in range(0,nr_venues): + col_names.append(f"Loc_{n}") + df = pd.DataFrame.from_dict(dic,orient='index') + df.columns = col_names + df.insert(loc=0, column='Area_ID', value=[*range(1, len(msoa_list)+1, 1)]) + df.insert(loc=1, column='Area_Code', value=df.index) + df.reset_index(drop=True, inplace=True) + return df + + + + + + + + + +# def get_flows_test(venue, msoa_list, threshold, thresholdtype): + +# # get all probabilities so they sum to at least threshold value +# dic = {} # appending to dictionary is faster than dataframe +# for m in msoa_list: +# print(m) +# # get all probabilities for this MSOA (threshold set to 0) +# if venue == "PrimarySchool": +# result_tmp = getProbablePrimarySchoolsByMSOAIZ(m,0) +# elif venue == "SecondarySchool": +# result_tmp = getProbableSecondarySchoolsByMSOAIZ(m,0) +# elif venue == "Retail": +# result_tmp = getProbableRetailByMSOAIZ(m,0) +# else: +# sys.exit("unknown venue type") +# # keep only values that sum to at least the specified threshold +# sort_index = np.argsort(result_tmp) # index from lowest to highest value +# result = [0.0] * len(result_tmp) # initialise +# i = len(result_tmp)-1 # start with last of sorted (highest prob) +# if thresholdtype == "prob": +# sum_p = 0 # initialise +# while sum_p < threshold: +# result[sort_index[i]] = result_tmp[sort_index[i]] +# sum_p = sum_p + result_tmp[sort_index[i]] +# #print(sum_p) +# i = i - 1 +# elif thresholdtype == "nr": +# for t in range(0,threshold): +# result[sort_index[i]] = result_tmp[sort_index[i]] +# i = i - 1 +# else: +# sys.exit("unknown threshold type") +# dic[m] = result + +# # now turn this into a dataframe with the right columns etc compatible with _flows variable +# nr_venues = len(dic[msoa_list[0]]) +# col_names = [] +# for n in range(0,nr_venues): +# col_names.append(f"Loc_{n}") +# df = pd.DataFrame.from_dict(dic,orient='index') +# df.columns = col_names + +# # optional: check +# if thresholdtype == "prob": # check nr of venues +# test = pd.Series(np.count_nonzero(df, axis=1)) +# elif thresholdtype == "nr": # check total probability +# test = df.sum(axis=1) + +# df.insert(loc=0, column='Area_ID', value=[*range(1, len(msoa_list)+1, 1)]) +# df.insert(loc=1, column='Area_Code', value=df.index) +# df.reset_index(drop=True, inplace=True) +# return df, test + + + +# # testing only +# import geopandas as gpd +# import random +# import matplotlib.pyplot as plt +# # load in shapefile with England MSOAs +# sh_file = os.path.join("C:\\Users\Toshiba\git_repos\RAMP-UA\devon_data","MSOAS_shp","bcc21fa2-48d2-42ca-b7b7-0d978761069f2020412-1-12serld.j1f7i.shp") +# map_df = gpd.read_file(sh_file) +# # rename column to get ready for merging +# msoas_england = map_df.msoa11cd[0:6791].tolist() +# msoas_england = random.sample(msoas_england, 100) + + +# # to call, use something like: +# msoa_list = msoas_england #['E02002559', 'E02002560'] + +# threshold = 5 # explain 20% +# thresholdtype = "nr" +# venue = "SecondarySchool" #PrimarySchool, SecondarySchool, Retail +# df_sec_nr,test_sec_nr = get_flows_test(venue, msoa_list,threshold,thresholdtype) + +# plt.hist(test_sec_nr) +# plt.title(f'Secondary Schools probability top {threshold} venues') +# plt.ylabel('Nr MSOAs') +# plt.xlabel('Summed probability venues') +# plt.savefig("test_sec_nr.pdf") +# plt.clf() + +# venue = "PrimarySchool" +# df_prim_nr,test_prim_nr = get_flows_test(venue, msoa_list,threshold,thresholdtype) + +# 
plt.hist(test_prim_nr) +# plt.title(f'Primary Schools probability top {threshold} venues') +# plt.ylabel('Nr MSOAs') +# plt.xlabel('Summed probability venues') +# plt.savefig("test_prim_nr.pdf") +# plt.clf() + +# threshold = 10 +# venue = "Retail" +# df_shop_nr,test_shop_nr = get_flows_test(venue, msoa_list,threshold,thresholdtype) + +# plt.hist(test_shop_nr) +# plt.title(f'Retail probability top {threshold} venues') +# plt.ylabel('Nr MSOAs') +# plt.xlabel('Summed probability venues') +# plt.savefig("test_shop_nr.pdf") +# plt.clf() + + +# threshold = 0.1 +# thresholdtype = "prob" +# venue = "SecondarySchool" #PrimarySchool, SecondarySchool, Retail +# df_sec_prob,test_sec_prob = get_flows_test(venue, msoa_list,threshold,thresholdtype) + +# plt.hist(test_sec_prob) +# plt.title(f'Secondary Schools nr venues prob > {threshold}') +# plt.ylabel('Nr MSOAs') +# plt.xlabel('Nr venues') +# plt.savefig("test_sec_prob.pdf") +# plt.clf() + +# venue = "PrimarySchool" +# df_prim_prob,test_prim_prob = get_flows_test(venue, msoa_list,threshold,thresholdtype) + +# plt.hist(test_prim_prob) +# plt.title(f'Primary Schools nr venues prob > {threshold}') +# plt.ylabel('Nr MSOAs') +# plt.xlabel('Nr venues') +# plt.savefig("test_prim_prob.pdf") +# plt.clf() + +# venue = "Retail" +# df_shop_prob,test_shop_prob = get_flows_test(venue, msoa_list,threshold,thresholdtype) + +# plt.hist(test_shop_prob) +# plt.title(f'Retail nr venues prob > {threshold}') +# plt.ylabel('Nr MSOAs') +# plt.xlabel('Nr venues') +# plt.savefig("test_shop_prob.pdf") +# plt.clf() + + diff --git a/build/lib/microsim/__init__.py b/build/lib/microsim/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/microsim/activity_location.py b/build/lib/microsim/activity_location.py new file mode 100644 index 00000000..b88ef100 --- /dev/null +++ b/build/lib/microsim/activity_location.py @@ -0,0 +1,98 @@ +from typing import List + +import pandas as pd + +from microsim.column_names import ColumnNames + + +class ActivityLocation(): + """Class to represent information about activity locations, e.g. retail destinations, workpaces, etc.""" + def __init__(self, name: str, locations: pd.DataFrame, flows: pd.DataFrame, + individuals: pd.DataFrame, duration_col: str): + """ + Initialise an ActivityLocation. + + IMPORTANT: the locations dataframe must be in the same order as columns in the flow matrix. For example, + the first (e.g.) shop in the locations dataframe must have its flows stored in the first column in the + matrix. + + :param name: A name to use to refer to this activity. Column names in the big DataFrame of individuals + will be named according to this + :param locations: A dataframe containing information about each loction. + :param flows: A dataframe containing the flows. + :param individuals: The dataframe containing the individual population. This is needed because a new + '*_DURATION' column will be added to that table to show how much time each individual spends doing + this activity. The new column is added inplace + :param duration_col: The column in the 'individuals' dataframe that gives the proportion of time + spend doing this activity. This needs to be renamed according to a standard format, e.g. for retail + the column needs to be called 'RETAIL_DURATION'. 
+ """ + self._name = name + # Check that the dataframe has all the standard columns needed + if ColumnNames.LOCATION_ID not in locations.columns or \ + ColumnNames.LOCATION_DANGER not in locations.columns or \ + ColumnNames.LOCATION_NAME not in locations.columns: + raise Exception(f"Activity '{name}' dataframe needs columns called 'ID' and 'Danger' and 'Location_Name'." + f"It only has: {locations.columns}") + # Check that the DataFrame's ID column is also an index, this is to ensure that the IDs always + # refer to the same thing. NO LONGER DOING THIS, INDEX AND ID CAN BE DIFFERENT # + #if locations.index.name != ColumnNames.LOCATION_ID or False in (locations.index == locations[ColumnNames.LOCATION_ID]): + # f"that is equal to the 'ID' columns.") + self._locations = locations + self._flows = flows + + # Check that the duration column exists and create a new column showing the duration of this activity + if duration_col not in individuals.columns: + raise Exception(f"The duration column '{duration_col}' is not one of the columns in the individuals" + f"data frame: {individuals.columns}") + self.duration_column = self._name + ColumnNames.ACTIVITY_DURATION + individuals[self.duration_column] = individuals[duration_col] + + def __repr__(self): + return f"<{self._name} ActivityLocation>" + + def get_dangers(self) -> List[float]: + """Get the danger associated with each location as a list. These will be in the same order as the + location IDs returned by `get_ids()`""" + return list(self._locations[ColumnNames.LOCATION_DANGER]) + + def get_name(self) -> str: + """Get the name of this activity. This is used to label columns in the file of individuals""" + return self._name + + def get_indices(self) -> List[int]: + """Return the index (row number) of each destination + Shouldn't need to know these. Use get_dangers or update_dangers instead + """ + return list(self._locations.index) + + def get_dataframe_copy(self) -> pd.DataFrame: + """ + Get a copy of the dataframe that underpins this ActivityLocation + :return: + """ + return self._locations.copy() + + def get_ids(self) -> List[int]: + """Retrn the IDs of each destination. + Shouldn't need to know these. Use get_dangers or update_dangers instead""" + return list(self._locations[ColumnNames.LOCATION_ID]) + + #def get_location(self, id: int) -> pd.DataFrame: + # """Get the location with the given id""" + # loc = self._locations.loc[self._locations[ColumnNames.LOCATION_ID] == id, :] + # if len(loc) != 1: + # raise Exception(f"Location with ID {id} does not return exactly one row: {loc}") + # return loc + + + def update_dangers(self, dangers: List[float]): + """ + Update the danger associated with each location + :param dangers: A list of dangers for each location. Must be in the same order as the locations as + returned by `get_ids`. 
+ """ + if len(dangers) != len(self._locations): + raise Exception(f"The number of danger scores ({len(dangers)}) is not the same as the number of" + f"activity locations ({len(self._locations)}).") + self._locations[ColumnNames.LOCATION_DANGER] = dangers \ No newline at end of file diff --git a/build/lib/microsim/column_names.py b/build/lib/microsim/column_names.py new file mode 100644 index 00000000..20e28277 --- /dev/null +++ b/build/lib/microsim/column_names.py @@ -0,0 +1,58 @@ +class ColumnNames: + """Used to record standard dataframe column names used throughout""" + + LOCATION_DANGER = "Danger" # Danger associated with a location + LOCATION_NAME = "Location_Name" # Name of a location + LOCATION_ID = "ID" # Unique ID for each location + + # Define the different types of activities/locations that the model can represent + class Activities: + RETAIL = "Retail" + PRIMARY = "PrimarySchool" + SECONDARY = "SecondarySchool" + HOME = "Home" + WORK = "Work" + ALL = [RETAIL, PRIMARY, SECONDARY, HOME, WORK] + + ACTIVITY_VENUES = "_Venues" # Venues an individual may visit. Appended to activity type, e.g. 'Retail_Venues' + ACTIVITY_FLOWS = "_Flows" # Flows to a venue for an individual. Appended to activity type, e.g. 'Retail_Flows' + ACTIVITY_RISK = "_Risk" # Risk associated with a particular activity for each individual. E.g. 'Retail_Risk' + ACTIVITY_DURATION = "_Duration" # Column to record proportion of the day that invividuals do the activity + ACTIVITY_DURATION_INITIAL = "_Duration_Initial" # Amount of time on the activity at the start (might change) + + # Standard columns for time spent travelling in different modes + TRAVEL_CAR = "Car" + TRAVEL_BUS = "Bus" + TRAVEL_TRAIN = "Train" + TRAVEL_WALK = "Walk" + + INDIVIDUAL_AGE = "DC1117EW_C_AGE" # Age column in the table of individuals + INDIVIDUAL_SEX = "DC1117EW_C_SEX" # Sex column in the table of individuals + INDIVIDUAL_ETH = "DC2101EW_C_ETHPUK11" # Ethnicity column in the table of individuals + + # Columns for information about the disease. These are needed for estimating the disease status + + # Disease status is one of the following: + class DiseaseStatuses: + SUSCEPTIBLE = 0 + EXPOSED = 1 + PRESYMPTOMATIC = 2 + SYMPTOMATIC = 3 + ASYMPTOMATIC = 4 + RECOVERED = 5 + DEAD = 6 + ALL = [SUSCEPTIBLE, EXPOSED, PRESYMPTOMATIC, SYMPTOMATIC, ASYMPTOMATIC, RECOVERED, DEAD] + assert len(ALL) == 7 + + DISEASE_STATUS = "disease_status" # Which one it is + DISEASE_STATUS_CHANGED = "status_changed" # Whether it has changed between the current iteration and the last + DISEASE_PRESYMP = "presymp_days" + DISEASE_SYMP_DAYS = "symp_days" + DISEASE_EXPOSED_DAYS = "exposed_days" + + #DAYS_WITH_STATUS = "Days_With_Status" # The number of days that have elapsed with this status + CURRENT_RISK = "current_risk" # This is the risk that people get when visiting locations. + + # No longer update disease counts per MSOA etc. 
Not needed + #MSOA_CASES = "MSOA_Cases" # The number of cases per MSOA + #HID_CASES = "HID_Cases" # The number of cases in the individual's house \ No newline at end of file diff --git a/build/lib/microsim/dashboard.py b/build/lib/microsim/dashboard.py new file mode 100644 index 00000000..b354085c --- /dev/null +++ b/build/lib/microsim/dashboard.py @@ -0,0 +1,1211 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jun 4 16:22:57 2020 + +@author: Natalie +""" + +import os +import click +import pickle +import pandas as pd +import numpy as np +import geopandas as gpd +import imageio +from shapely.geometry import Point +import json + +from bokeh.io import output_file +from bokeh.plotting import figure, show +from bokeh.models import (BasicTicker, CDSView, ColorBar, ColumnDataSource, + CustomJS, CustomJSFilter, FactorRange, + GeoJSONDataSource, HoverTool, Legend, + LinearColorMapper, PrintfTickFormatter, Slider) +from bokeh.layouts import row, column, gridplot, grid, widgetbox +from bokeh.models.widgets import Tabs, Panel +from bokeh.palettes import brewer +from bokeh.transform import transform, factor_cmap + +import click # command-line interface +from yaml import load, dump, SafeLoader # pyyaml library for reading the parameters.yml file + + + + + + +# Functions for preprocessing +# --------------------------- + + + + +def calc_nr_days(data_file): + # figure out nr days by reading in e.g. retail dangers pickle file of run 0 + pickle_in = open(data_file,"rb") + dangers = pickle.load(pickle_in) + pickle_in.close() + + filter_col = [col for col in dangers if col.startswith('Danger')] + # don't use the column simply called 'Danger' + filter_col = filter_col[1:len(filter_col)] + nr_days = len(filter_col) + return nr_days + + +def create_venue_dangers_dict(locations_dict,r_range,data_dir,start_day,end_day,start_run,nr_runs): + ''' + Reads in venue pickle files (venues from locations_dict) and populates dangers_dict_3d (raw data: venue, day, run), dangers_dict (mean across runs) and dangers_dict_std (standard deviation across runs) + Possible output includes: + dangers_dict # mean (value to be plotted) + dangers_dict_std # standard deviation (could plot as error bars) + dangers_dict_3d # full 3D data (for debugging) + ''' + + dangers_dict = {} + dangers_dict_std = {} + dangers_dict_3d = {} + + for key, value in locations_dict.items(): + #for r in range(nr_runs): + for r in r_range: + + data_file = os.path.join(data_dir, f"{r}",f"{locations_dict[key]}.pickle") + pickle_in = open(data_file,"rb") + dangers = pickle.load(pickle_in) + pickle_in.close() + + filter_col = [col for col in dangers if col.startswith('Danger')] + # don't use the column simply called 'Danger' + filter_col = filter_col[1:len(filter_col)] + #nr_days = len(filter_col) + + # # set row index to ID + # dangers.set_index('ID', inplace = True) + dangers_colnames = filter_col[start_day:end_day+1] + dangers_rownames = dangers.index + dangers_values = dangers[filter_col[start_day:end_day+1]] + + if r == start_run: + dangers_3d = np.zeros((dangers.shape[0],dangers_values.shape[1],nr_runs)) + dangers_3d[:,:,r-start_run] = dangers_values + dangers_dict_3d[key] = dangers_3d + dangers_dict[key] = pd.DataFrame(data=dangers_3d.mean(axis=2), index=dangers_rownames, columns=dangers_colnames) + dangers_dict_std[key] = pd.DataFrame(data=dangers_3d.std(axis=2), index=dangers_rownames, columns=dangers_colnames) + + return dangers_dict, dangers_dict_std, dangers_dict_3d + + +def create_difference_dict(dict_sc0,dict_sc1,lookup_dict): + dict_out = {} + for 
key, value in lookup_dict.items(): + dict_out[key] = dict_sc1[key].subtract(dict_sc0[key]) + return dict_out + + + +def create_msoa_dangers_dict(dangers_dict,keys,msoa_codes): + ''' + Converts dangers_dict to MSOA level data for the appropriate venue types. Produces average danger score (sum dangers in MSOA / total nr venues in MSOA) + Output: dangers_msoa_dict + ''' + + dangers_msoa_dict = {} + for k in range(0,len(keys)): + dangers = dangers_dict[keys[k]] + msoa_code = msoa_codes[k] + dangers['MSOA'] = msoa_code + # count nr for this condition per area + msoa_sum = dangers.groupby(['MSOA']).agg('sum') + msoa_count = dangers.groupby(['MSOA']).agg('count') + msoa_avg = msoa_sum.div(msoa_count, axis='index') + dangers_msoa_dict[keys[k]] = msoa_avg + return dangers_msoa_dict + + + +def create_counts_dict(conditions_dict,r_range,data_dir,start_day,end_day,start_run,nr_runs,age_cat): + ''' + Counts per condition (3D, mean and standard deviation) + Produces 5 types of counts: + msoacounts: nr per msoa and day + agecounts: nr per age category and day + totalcounts: nr per day (across all areas) + cumcounts: nr per MSOA (across given time period) + uniquecounts: nr with 'final' disease status across time period e.g. someone who is presymptomatic, symptomatic and recoverd is only counted once as recovered + Output: + msoas # list of msoas + totalcounts_dict, cumcounts_dict, agecounts_dict, msoacounts_dict, cumcounts_dict_3d, totalcounts_dict_std, cumcounts_dict_std, agecounts_dict_std, msoacounts_dict_std, totalcounts_dict_3d, agecounts_dict_3d, msoacounts_dict_3d, uniquecounts_dict_3d, uniquecounts_dict_std, uniquecounts_dict + ''' + + # start with empty dictionaries + msoas = [] + msoacounts_dict_3d = {} + totalcounts_dict_3d = {} + cumcounts_dict_3d = {} + agecounts_dict_3d = {} + uniquecounts_dict_3d = {} + msoacounts_dict = {} + agecounts_dict = {} + totalcounts_dict = {} + cumcounts_dict = {} + uniquecounts_dict = {} + msoacounts_dict_std = {} + agecounts_dict_std = {} + totalcounts_dict_std = {} + cumcounts_dict_std = {} + uniquecounts_dict_std = {} + + nr_days = end_day - start_day + 1 + dict_days = [] # empty list for column names 'Day0' etc + for d in range(start_day, end_day+1): + dict_days.append(f'Day{d}') + age_cat_str = [] + for a in range(age_cat.shape[0]): + age_cat_str.append(f"{age_cat[a,0]}-{age_cat[a,1]}") + + # first, create 3d dictionaries + for r in r_range: + # read in pickle file individuals (disease status) + data_file = os.path.join(data_dir, f"{r}", "Individuals.pickle") + pickle_in = open(data_file,"rb") + individuals_tmp = pickle.load(pickle_in) + pickle_in.close() + # if first ever run, keep copy and initialise 3D frame for aggregating + if r == start_run: + individuals = individuals_tmp.copy() + msoas.extend(sorted(individuals.area.unique())) # populate list of msoas (previously empty outside this function) + area_individuals = individuals['area'] # keep area per person to use later + + # next bit of code is to restrict to user specified day range + # first, find all columns starting with disease_status + filter_col = [col for col in individuals if col.startswith('disease_status')] + # don't use the column simply called 'disease_status' + filter_col = filter_col[1:len(filter_col)] + counts_colnames = filter_col[start_day:end_day+1] + + + # TO BE ADDED SO USER CAN DEFINE AGE BRACKETS - FOR NOW JUST USING PREDEFINED CATEGORIES + # individuals['age0'] = np.zeros((len(individuals),1)) + # for a in range(age_cat.shape[0]): + # individuals['age0'] = 
np.where((individuals['DC1117EW_C_AGE'] >= age_cat[a,0]) & (individuals['DC1117EW_C_AGE'] <= age_cat[a,1]), a+1, individuals['age0']) + # age_cat_col = individuals['age0'].values + # temporary workaround + age_cat_col = individuals['Age1'].values + + # add age brackets column to individuals_tmp + individuals_tmp.insert(8, 'age0', age_cat_col) + + + uniquecounts_df = pd.DataFrame() + + for key, value in conditions_dict.items(): + #print(key) + if r == start_run: + msoacounts_dict_3d[key] = np.zeros((len(msoas),nr_days,nr_runs)) + agecounts_dict_3d[key] = np.zeros((age_cat.shape[0],nr_days,nr_runs)) + totalcounts_dict_3d[key] = np.zeros((nr_days,nr_runs)) + cumcounts_dict_3d[key] = np.zeros((len(msoas),nr_runs)) + uniquecounts_dict_3d[key] = np.zeros(nr_runs) + + # cumulative counts + # select right columns + tmp = individuals_tmp[counts_colnames] + #tmp = individuals_tmp.iloc[:,start_col:end_col] + # find all rows with condition (dict value) + indices = tmp[tmp.eq(value).any(1)].index + # create new df of zeros and replace with 1 at indices + cumcounts_run = pd.DataFrame(np.zeros((tmp.shape[0], 1))) + cumcounts_run.loc[indices] = 1 + + uniquecounts_df[key] = cumcounts_run.values[:,0] + + #uniqcounts[:,value,r] = cumcounts_run.values[:,0] + # merge with MSOA df + cumcounts_run = cumcounts_run.merge(area_individuals, left_index=True, right_index=True) + cumcounts_msoa_run = cumcounts_run.groupby(['area']).sum() + cumcounts_msoa_run = cumcounts_msoa_run.values + + # loop aroud days + msoacounts_run = np.zeros((len(msoas),nr_days)) + agecounts_run = np.zeros((age_cat.shape[0],nr_days)) + for day in range(0, nr_days): + #print(day) + # count nr for this condition per area + #msoa_count_temp = individuals_tmp[individuals_tmp.iloc[:, -nr_days+day] == conditions_dict[key]].groupby(['area']).agg({individuals_tmp.columns[-nr_days+day]: ['count']}) + + msoa_count_temp = individuals_tmp[tmp.iloc[:,day] == conditions_dict[key]].groupby(['area']).agg({tmp.columns[day]: ['count']}) + + + if msoa_count_temp.shape[0] == len(msoas): + msoa_count_temp = msoa_count_temp.values + msoacounts_run[:,day] = msoa_count_temp[:, 0] + elif msoa_count_temp.empty == False: + #print('check MSOAs') + # in case some entries don't exist + # start with empty dataframe + tmp_df = pd.DataFrame(np.zeros(len(msoas)), columns = ['tmp'], index=msoas) + # drop multiindex to prevent warning msg + msoa_count_temp.columns = msoa_count_temp.columns.droplevel(0) + # merge with obtained counts - NaN will appear + tmp_df = pd.merge(tmp_df, msoa_count_temp, how='left', left_index=True,right_index=True) + # replace NaN by 0 + tmp_df = tmp_df.fillna(0) + msoacounts_run[:,day] = tmp_df.iloc[:,1].values + + + # count nr for this condition per age bracket + age_count_temp = individuals_tmp[tmp.iloc[:,day] == conditions_dict[key]].groupby(['age0']).agg({tmp.columns[day]: ['count']}) + + if age_count_temp.shape[0] == 6: + age_count_temp = age_count_temp.values + agecounts_run[:,day] = age_count_temp[:, 0] + elif age_count_temp.empty == False: + # in case some entries don't exist + # start with empty dataframe + tmp_df = pd.DataFrame(np.zeros(age_cat.shape[0]), columns = ['tmp'], index=list(range(1,age_cat.shape[0]+1))) + # drop multilevel index to prevent warning msg + age_count_temp.columns = age_count_temp.columns.droplevel(0) + # merge with obtained counts - NaN will appear + tmp_df = pd.merge(tmp_df, age_count_temp, how='left', left_index=True,right_index=True) + # replace NaN by 0 + tmp_df = tmp_df.fillna(0) + agecounts_run[:,day] = 
tmp_df.iloc[:,1].values + + #age_count_temp.loc['2'].count + + # get current values from dict + msoacounts = msoacounts_dict_3d[key] + agecounts = agecounts_dict_3d[key] + totalcounts = totalcounts_dict_3d[key] + cumcounts = cumcounts_dict_3d[key] + # add current run's values + msoacounts[:,:,r-start_run] = msoacounts_run + agecounts[:,:,r-start_run] = agecounts_run + totalcounts[:,r-start_run] = msoacounts_run.sum(axis=0) + cumcounts[:,r-start_run] = cumcounts_msoa_run[:, 0] + # write out to dict + msoacounts_dict_3d[key] = msoacounts + agecounts_dict_3d[key] = agecounts + totalcounts_dict_3d[key] = totalcounts + cumcounts_dict_3d[key] = cumcounts + + uniquecounts_df[key] = uniquecounts_df[key]*(value+1) + + # uniquecounts['presymptomatic'] = uniquecounts['presymptomatic']*2 + # uniquecounts['symptomatic'] = uniquecounts['symptomatic']*3 + # uniquecounts['recovered'] = uniquecounts['recovered']*4 + # uniquecounts['dead'] = uniquecounts['dead']*5 + uniquecounts_df['maxval'] = uniquecounts_df.max(axis = 1) + + for key, value in conditions_dict.items(): + + # get current values from dict + uniquecounts = uniquecounts_dict_3d[key] + # add current run's values + uniquecounts[r-start_run] = uniquecounts_df[uniquecounts_df.maxval == (value+1)].shape[0] + # write out to dict + uniquecounts_dict_3d[key] = uniquecounts + + + # next, create mean and std + for key, value in conditions_dict.items(): + # get current values from dict + msoacounts = msoacounts_dict_3d[key] + agecounts = agecounts_dict_3d[key] + totalcounts = totalcounts_dict_3d[key] + cumcounts = cumcounts_dict_3d[key] + uniquecounts = uniquecounts_dict_3d[key] + # aggregate + msoacounts_std = msoacounts.std(axis=2) + msoacounts = msoacounts.mean(axis=2) + agecounts_std = agecounts.std(axis=2) + agecounts = agecounts.mean(axis=2) + totalcounts_std = totalcounts.std(axis=1) + totalcounts = totalcounts.mean(axis=1) + cumcounts_std = cumcounts.std(axis=1) + cumcounts = cumcounts.mean(axis = 1) + uniquecounts_std = uniquecounts.std() + uniquecounts = uniquecounts.mean() + # write out to dict + msoacounts_dict[key] = pd.DataFrame(data=msoacounts, index=msoas, columns=dict_days) + msoacounts_dict_std[key] = pd.DataFrame(data=msoacounts_std, index=msoas, columns=dict_days) + agecounts_dict[key] = pd.DataFrame(data=agecounts, index=age_cat_str, columns=dict_days) + agecounts_dict_std[key] = pd.DataFrame(data=agecounts_std, index=age_cat_str, columns=dict_days) + totalcounts_dict[key] = pd.Series(data=totalcounts, index=dict_days) + totalcounts_dict_std[key] = pd.Series(data=totalcounts_std, index=dict_days) + cumcounts_dict[key] = pd.Series(data=cumcounts, index=msoas) + cumcounts_dict_std[key] = pd.Series(data=cumcounts_std, index=msoas) + uniquecounts_dict[key] = pd.Series(data=uniquecounts, index=["total"]) + uniquecounts_dict_std[key] = pd.Series(data=uniquecounts_std, index=["total"]) + + + return msoas, totalcounts_dict, cumcounts_dict, agecounts_dict, msoacounts_dict, cumcounts_dict_3d, totalcounts_dict_std, cumcounts_dict_std, agecounts_dict_std, msoacounts_dict_std, totalcounts_dict_3d, agecounts_dict_3d, msoacounts_dict_3d, uniquecounts_dict_3d, uniquecounts_dict_std, uniquecounts_dict + + + + +# def calc_cumtotal_counts(cumcounts_dict_3d): +# """ Calculate cumulative total across time and space, returned as mean and std nr susceptible, presymptomatic, symptomatic, recovered and dead""" +# cumtotal_counts = np.zeros((5,nr_runs)) +# for r in r_range: +# nr_dead = cumcounts_dict_3d["dead"][:,r-start_run].sum() +# nr_recovered = 
cumcounts_dict_3d["recovered"][:,r-start_run].sum() +# nr_symptomatic = cumcounts_dict_3d["symptomatic"][:,r-start_run].sum() - nr_dead - nr_recovered +# nr_presymptomatic = cumcounts_dict_3d["presymptomatic"][:,r-start_run].sum() - nr_dead - nr_recovered - nr_symptomatic +# nr_susceptible = cumcounts_dict_3d["susceptible"][:,r-start_run].sum() - nr_dead - nr_recovered - nr_symptomatic - nr_presymptomatic +# cumtotal_counts[0,r-start_run] = nr_susceptible +# cumtotal_counts[1,r-start_run] = nr_presymptomatic +# cumtotal_counts[2,r-start_run] = nr_symptomatic +# cumtotal_counts[3,r-start_run] = nr_recovered +# cumtotal_counts[4,r-start_run] = nr_dead +# cumtotal_counts_mean = cumtotal_counts.mean(axis=1) +# cumtotal_counts_std = cumtotal_counts.std(axis=1) +# return cumtotal_counts_mean, cumtotal_counts_std + + + + + + + + + + + + + + +# ******** +# PROGRAM ENTRY POINT +# Uses 'click' library so that it can be run from the command line +# ******** +@click.command() +@click.option('-p', '--parameters_file', default="./model_parameters/default_dashboard.yml", type=click.Path(exists=True), + help="Parameters file to use to configure the dashboard. Default: ./model_parameters/default_dashboard.yml") + + +def create_dashboard(parameters_file): + + + # FUNCTIONS FOR PLOTTING + # ---------------------- + + # plot 1a: heatmap condition + + def plot_heatmap_condition(condition2plot): + """ Create heatmap plot: x axis = time, y axis = MSOAs, colour = nr people with condition = condition2plot. condition2plot is key to conditions_dict.""" + + # Prep data + var2plot = msoacounts_dict[condition2plot] + var2plot = var2plot.rename_axis(None, axis=1).rename_axis('MSOA', axis=0) + var2plot.columns.name = 'Day' + # reshape to 1D array or rates with a month and year for each row. + df_var2plot = pd.DataFrame(var2plot.stack(), columns=['condition']).reset_index() + source = ColumnDataSource(df_var2plot) + # add better colour + mapper_1 = LinearColorMapper(palette=colours_ch_cond[condition2plot], low=0, high=var2plot.max().max()) + # create fig + s1 = figure(title="Heatmap", + x_range=list(var2plot.columns), y_range=list(var2plot.index), x_axis_location="above") + s1.rect(x="Day", y="MSOA", width=1, height=1, source=source, + line_color=None, fill_color=transform('condition', mapper_1)) + color_bar_1 = ColorBar(color_mapper=mapper_1, location=(0, 0), orientation = 'horizontal', ticker=BasicTicker(desired_num_ticks=len(colours_ch_cond[condition2plot]))) + s1.add_layout(color_bar_1, 'below') + s1.axis.axis_line_color = None + s1.axis.major_tick_line_color = None + s1.axis.major_label_text_font_size = "7px" + s1.axis.major_label_standoff = 0 + s1.xaxis.major_label_orientation = 1.0 + # Create hover tool + s1.add_tools(HoverTool( + tooltips=[ + ( f'Nr {condition2plot}', '@condition'), + ( 'Day', '@Day' ), + ( 'MSOA', '@MSOA'), + ], + )) + s1.toolbar.autohide = False + plotref_dict[f"hm{condition2plot}"] = s1 + + + # plot 1b: heatmap venue + + def plot_heatmap_danger(venue2plot): + """ Create heatmap plot: x axis = time, y axis = MSOAs, colour =danger score. """ + + # Prep data + var2plot = dangers_msoa_dict[venue2plot] + var2plot.columns.name = 'Day' + # reshape to 1D array or rates with a month and year for each row. 
+ df_var2plot = pd.DataFrame(var2plot.stack(), columns=['venue']).reset_index() + source = ColumnDataSource(df_var2plot) + # add better colour + mapper_1 = LinearColorMapper(palette=colours_ch_danger, low=0, high=var2plot.max().max()) + # Create fig + s1 = figure(title="Heatmap", + x_range=list(var2plot.columns), y_range=list(var2plot.index), x_axis_location="above") + s1.rect(x="Day", y="MSOA", width=1, height=1, source=source, + line_color=None, fill_color=transform('venue', mapper_1)) + color_bar_1 = ColorBar(color_mapper=mapper_1, location=(0, 0), orientation = 'horizontal', ticker=BasicTicker(desired_num_ticks=len(colours_ch_danger))) + s1.add_layout(color_bar_1, 'below') + s1.axis.axis_line_color = None + s1.axis.major_tick_line_color = None + s1.axis.major_label_text_font_size = "7px" + s1.axis.major_label_standoff = 0 + s1.xaxis.major_label_orientation = 1.0 + # Create hover tool + s1.add_tools(HoverTool( + tooltips=[ + ( 'danger score', '@venue'), + ( 'Day', '@Day' ), + ( 'MSOA', '@MSOA'), + ], + )) + s1.toolbar.autohide = False + plotref_dict[f"hm{venue2plot}"] = s1 + + + # plot 2: disease conditions across time + + def plot_cond_time(): + # build ColumnDataSource + data_s2 = dict(totalcounts_dict) + data_s2["days"] = days + for key, value in totalcounts_dict.items(): + data_s2[f"{key}_std_upper"] = totalcounts_dict[key] + totalcounts_dict_std[key] + data_s2[f"{key}_std_lower"] = totalcounts_dict[key] - totalcounts_dict_std[key] + source_2 = ColumnDataSource(data=data_s2) + # Create fig + s2 = figure(background_fill_color="#fafafa",title="Time", x_axis_label='Time', y_axis_label='Nr of people',toolbar_location='above') + legend_it = [] + for key, value in totalcounts_dict.items(): + c1 = s2.line(x = 'days', y = key, source = source_2, line_width=2, line_color=colour_dict[key],muted_color="grey", muted_alpha=0.2) + c2 = s2.square(x = 'days', y = key, source = source_2, fill_color=colour_dict[key], line_color=colour_dict[key], size=5, muted_color="grey", muted_alpha=0.2) + # c3 = s2.rect('days', f"{key}_std_upper", 0.2, 0.01, source = source_2, line_color="black",muted_color="grey", muted_alpha=0.2) + # c4 = s2.rect('days', f"{key}_std_lower", 0.2, 0.01, source = source_2, line_color="black",muted_color="grey", muted_alpha=0.2) + c5 = s2.segment('days', f"{key}_std_lower", 'days', f"{key}_std_upper", source = source_2, line_color="black",muted_color="grey", muted_alpha=0.2) + legend_it.append((f"nr {key}", [c1,c2,c5])) + legend = Legend(items=legend_it) + legend.click_policy="hide" + # Misc + tooltips = tooltips_cond_basic.copy() + tooltips.append(tuple(( 'Day', '@days' ))) + s2.add_tools(HoverTool( + tooltips=tooltips, + )) + s2.add_layout(legend, 'right') + s2.toolbar.autohide = False + plotref_dict["cond_time"] = s2 + + + # plot 3: Conditions across MSOAs + + def plot_cond_msoas(): + # build ColumnDataSource + data_s3 = {} + data_s3["msoa_nr"] = msoas_nr + data_s3["msoa_name"] = msoas + for key, value in cumcounts_dict.items(): + data_s3[key] = cumcounts_dict[key] + data_s3[f"{key}_std_upper"] = cumcounts_dict[key] + cumcounts_dict_std[key] + data_s3[f"{key}_std_lower"] = cumcounts_dict[key] - cumcounts_dict_std[key] + source_3 = ColumnDataSource(data=data_s3) + # Create fig + s3 = figure(background_fill_color="#fafafa",title="MSOA", x_axis_label='Nr people', y_axis_label='MSOA',toolbar_location='above') + legend_it = [] + for key, value in msoacounts_dict.items(): + c1 = s3.circle(x = key, y = 'msoa_nr', source = source_3, fill_color=colour_dict[key], 
line_color=colour_dict[key], size=5,muted_color="grey", muted_alpha=0.2) + c2 = s3.segment(f"{key}_std_lower", 'msoa_nr', f"{key}_std_upper", 'msoa_nr', source = source_3, line_color="black",muted_color="grey", muted_alpha=0.2) + legend_it.append((key, [c1,c2])) + legend = Legend(items=legend_it) + legend.click_policy="hide" + # Misc + s3.yaxis.ticker = data_s3["msoa_nr"] + MSOA_dict = dict(zip(data_s3["msoa_nr"], data_s3["msoa_name"])) + s3.yaxis.major_label_overrides = MSOA_dict + tooltips = tooltips_cond_basic.copy() + tooltips.append(tuple(( 'MSOA', '@msoa_name' ))) + s3.add_tools(HoverTool( + tooltips=tooltips, + )) + s3.add_layout(legend, 'right') + s3.toolbar.autohide = False + plotref_dict["cond_msoas"] = s3 + + + # plot 4a: choropleth + + def plot_choropleth_condition_slider(condition2plot): + # Prepare data + max_val = 0 + merged_data = pd.DataFrame() + merged_data["y"] = msoacounts_dict[condition2plot].iloc[:,0] + for d in range(0,nr_days): + merged_data[f"{d}"] = msoacounts_dict[condition2plot].iloc[:,d] + max_tmp = merged_data[f"{d}"].max() + if max_tmp > max_val: max_val = max_tmp + merged_data["Area"] = msoacounts_dict[condition2plot].index.to_list() + merged_data = pd.merge(map_df,merged_data,on='Area') + geosource = GeoJSONDataSource(geojson = merged_data.to_json()) + # Create color bar + mapper_4 = LinearColorMapper(palette = colours_ch_cond[condition2plot], low = 0, high = max_val) + color_bar_4 = ColorBar(color_mapper = mapper_4, + label_standoff = 8, + #"width = 500, height = 20, + border_line_color = None, + location = (0,0), + orientation = 'horizontal') + # Create figure object. + s4 = figure(title = f"{condition2plot} total") + s4.xgrid.grid_line_color = None + s4.ygrid.grid_line_color = None + # Add patch renderer to figure. 
+ msoasrender = s4.patches('xs','ys', source = geosource, + fill_color = {'field' : 'y', + 'transform' : mapper_4}, + line_color = 'gray', + line_width = 0.25, + fill_alpha = 1) + # Create hover tool + s4.add_tools(HoverTool(renderers = [msoasrender], + tooltips = [('MSOA','@Area'), + ('Nr people','@y'), + ])) + s4.add_layout(color_bar_4, 'below') + s4.axis.visible = False + s4.toolbar.autohide = True + # Slider + # create dummy data source to store start value slider + slider_val = {} + slider_val["s"] = [start_day] + source_slider = ColumnDataSource(data=slider_val) + callback = CustomJS(args=dict(source=geosource,sliderval=source_slider), code=""" + var data = source.data; + var startday = sliderval.data; + var s = startday['s']; + var f = cb_obj.value -s; + console.log(f); + var y = data['y']; + var toreplace = data[f]; + for (var i = 0; i < y.length; i++) { + y[i] = toreplace[i] + } + source.change.emit(); + """) + slider = Slider(start=start_day, end=end_day, value=start_day, step=1, title="Day") + slider.js_on_change('value', callback) + plotref_dict[f"chpl{condition2plot}"] = s4 + plotref_dict[f"chsl{condition2plot}"] = slider + + + # plot 4b: choropleth dangers + + def plot_choropleth_danger_slider(venue2plot): + # Prep data + max_val = 0 + merged_data = pd.DataFrame() + merged_data["y"] = dangers_msoa_dict[venue2plot].iloc[:,0] + for d in range(0,nr_days): + merged_data[f"{d}"] = dangers_msoa_dict[venue2plot].iloc[:,d] + max_tmp = merged_data[f"{d}"].max() + if max_tmp > max_val: max_val = max_tmp + merged_data["Area"] = dangers_msoa_dict[venue2plot].index.to_list() + merged_data = pd.merge(map_df,merged_data,on='Area') + geosource2 = GeoJSONDataSource(geojson = merged_data.to_json()) + # Create color bar + mapper_4 = LinearColorMapper(palette = colours_ch_danger, low = 0, high = max_val) + color_bar_4 = ColorBar(color_mapper = mapper_4, + label_standoff = 8, + border_line_color = None, + location = (0,0), + orientation = 'horizontal') + # Create figure object + s4 = figure(title = f"{venue2plot} total") + s4.xgrid.grid_line_color = None + s4.ygrid.grid_line_color = None + # Add patch renderer to figure. 
+ msoasrender = s4.patches('xs','ys', source = geosource2, + fill_color = {'field' : 'y', + 'transform' : mapper_4}, + line_color = 'gray', + line_width = 0.25, + fill_alpha = 1) + # Create hover tool + s4.add_tools(HoverTool(renderers = [msoasrender], + tooltips = [('MSOA','@Area'), + ('Danger score','@y'), + ])) + s4.add_layout(color_bar_4, 'below') + s4.axis.visible = False + s4.toolbar.autohide = True + # Slider + # create dummy data source to store start value slider + slider_val = {} + slider_val["s"] = [start_day] + source_slider = ColumnDataSource(data=slider_val) + callback = CustomJS(args=dict(source=geosource2,sliderval=source_slider), code=""" + var data = source.data; + var startday = sliderval.data; + var s = startday['s']; + var f = cb_obj.value -s; + console.log(f); + var y = data['y']; + var toreplace = data[f]; + for (var i = 0; i < y.length; i++) { + y[i] = toreplace[i] + } + source.change.emit(); + """) + slider = Slider(start=start_day, end=end_day, value=start_day, step=1, title="Day") + slider.js_on_change('value', callback) + plotref_dict[f"chpl{venue2plot}"] = s4 + plotref_dict[f"chsl{venue2plot}"] = slider + + + # plot 5: danger scores across time per venue type + + def plot_danger_time(): + # build ColumnDataSource + data_s5 = {} + data_s5["days"] = days + for key, value in dangers_dict.items(): + data_s5[key] = value.mean(axis = 0) + data_s5[f"{key}_std_upper"] = value.mean(axis = 0) + value.std(axis = 0) + data_s5[f"{key}_std_lower"] = value.mean(axis = 0) - value.std(axis = 0) + source_5 = ColumnDataSource(data=data_s5) + # Build figure + s5 = figure(background_fill_color="#fafafa",title="Time", x_axis_label='Time', y_axis_label='Average danger score', toolbar_location='above') + legend_it = [] + for key, value in dangers_dict.items(): + c1 = s5.line(x = 'days', y = key, source = source_5, line_width=2, line_color=colour_dict[key], muted_color="grey", muted_alpha=0.2) + c2 = s5.circle(x = 'days', y = key, source = source_5, fill_color=colour_dict[key], line_color=colour_dict[key], size=5) + c3 = s5.segment('days', f"{key}_std_lower", 'days', f"{key}_std_upper", source = source_5, line_color="black",muted_color="grey", muted_alpha=0.2) + legend_it.append((key, [c1,c2,c3])) + legend = Legend(items=legend_it) + legend.click_policy="hide" + # Misc + tooltips = tooltips_venue_basic.copy() + tooltips.append(tuple(( 'Day', '@days' ))) + s5.add_tools(HoverTool( + tooltips=tooltips, + )) + s5.add_layout(legend, 'right') + s5.toolbar.autohide = False + plotref_dict["danger_time"] = s5 + + + + + + + def plot_cond_time_age(): + # 1 plot per condition, nr of lines = nr age brackets + colour_dict_age = { + 0: "red", + 1: "orange", + 2: "yellow", + 3: "green", + 4: "teal", + 5: "blue", + 6: "purple", + 7: "pink", + 8: "gray", + 9: "black", + } + + for key, value in totalcounts_dict.items(): + data_s2= dict() + data_s2["days"] = days + tooltips = [] + for a in range(len(age_cat_str)): + age_cat_str[a] + data_s2[f"c{a}"] = agecounts_dict[key].iloc[a] + data_s2[f"{age_cat_str[a]}_std_upper"] = agecounts_dict[key].iloc[a] + agecounts_dict_std[key].iloc[a] + data_s2[f"{age_cat_str[a]}_std_lower"] = agecounts_dict[key].iloc[a] - agecounts_dict_std[key].iloc[a] + source_2 = ColumnDataSource(data=data_s2) + # Create fig + s2 = figure(background_fill_color="#fafafa",title=f"{key}", x_axis_label='Time', y_axis_label=f'Nr of people - {key}',toolbar_location='above') + legend_it = [] + for a in range(len(age_cat_str)): + c1 = s2.line(x = 'days', y = f"c{a}", source = source_2, 
line_width=2, line_color=colour_dict_age[a],muted_color="grey", muted_alpha=0.2) + c2 = s2.square(x = 'days', y = f"c{a}", source = source_2, fill_color=colour_dict_age[a], line_color=colour_dict_age[a], size=5, muted_color="grey", muted_alpha=0.2) + c5 = s2.segment('days', f"{age_cat_str[a]}_std_lower", 'days', f"{age_cat_str[a]}_std_upper", source = source_2, line_color="black",muted_color="grey", muted_alpha=0.2) + legend_it.append((f"nr {age_cat_str[a]}", [c1,c2,c5])) + tooltips.append(tuple(( f"{age_cat_str[a]}", f"@c{a}" ))) + + legend = Legend(items=legend_it) + legend.click_policy="hide" + # Misc + tooltips.append(tuple(( 'Day', '@days' ))) + s2.add_tools(HoverTool( + tooltips=tooltips, + )) + s2.add_layout(legend, 'right') + s2.toolbar.autohide = False + plotref_dict[f"cond_time_age_{key}"] = s2 + + + def plot_scenario_hist(scen_hist,category,label): + max_plot = 0 + for s in sc_nam: + if max_plot < max(scen_hist[s]): + max_plot = max(scen_hist[s]) + data = scen_hist + Categories = scen_hist[category] + Scenarios = sc_nam + x = [] + counts = [] + v = 0 + for c in Categories: + for s in Scenarios: + x.append((c,s)) + counts.append(scen_hist[s][v]) + v = v + 1 + counts = tuple(counts) + source = ColumnDataSource(data=dict(x=x, counts=counts)) + p = figure(x_range=FactorRange(*x), plot_height=350, title=f"{label} across scenarios",y_axis_label=f"{label}") + p.vbar(x='x', top='counts', width=0.9, source=source, line_color="white", + fill_color=factor_cmap('x', palette=scen_palette, factors=Scenarios, start=1, end=2)) + p.y_range.start = 0 + p.x_range.range_padding = 0.1 + p.xaxis.major_label_orientation = 1 + p.xgrid.grid_line_color = None + + #plotref_dict[f"cond_time_age_{key}"] = s2 + tooltips = [(f'{label}','@counts'),] + p.add_tools(HoverTool( + tooltips=tooltips, + )) + #show(p) + plotref_dict[f"scen_hist_{category}"] = p + + + + def plot_scenario_time(scen_time,category,label,dkey): + # build ColumnDataSource + data_s5 = scen_time[dkey].copy() + data_s5["days"] = days + # add standard dev? 
+ source_5 = ColumnDataSource(data=data_s5) + # Build figure + s5 = figure(background_fill_color="#fafafa",title=f"{dkey}", x_axis_label='Time', y_axis_label=f"{label}", toolbar_location='above') + legend_it = [] + snr = 0 + tooltips=[] + for s in sc_nam: + c1 = s5.line(x = 'days', y = s, source = source_5, line_width=2, line_color=scen_palette[snr], muted_color="grey", muted_alpha=0.2) + c2 = s5.circle(x = 'days', y = s, source = source_5, fill_color=scen_palette[snr], line_color=scen_palette[snr], size=5) + legend_it.append((s, [c1,c2])) + tooltips.append(tuple(( f"{s}", f"@{s}"))) + snr = snr + 1 + legend = Legend(items=legend_it) + legend.click_policy="hide" + # Misc + s5.add_tools(HoverTool( + tooltips=tooltips, + )) + s5.add_layout(legend, 'right') + s5.toolbar.autohide = False + plotref_dict[f"scen_time_{category}_{dkey}"] = s5 + + + + # MAIN SCRIPT + # ----------- + + # Set parameters (optional to overwrite defaults) + # ----------------------------------------------- + # Set to None to use defaults + + base_dir = os.getcwd() # get current directory (usually RAMP-UA) + + # from file + # parameters_file = os.path.join(base_dir, "model_parameters","default_dashboard.yml") + + # read from file + with open(parameters_file, 'r') as f: + parameters = load(f, Loader=SafeLoader) + dash_params = parameters["dashboard"] # Parameters for the dashboard + output_name_user = dash_params["output_name"] + data_dir_user = dash_params["data_dir"] + start_day_user = dash_params["start_day"] + end_day_user = dash_params["end_day"] + start_run_user = dash_params["start_run"] + end_run_user = dash_params["end_run"] + sc_dir = dash_params["scenario_dir"] + sc_nam = dash_params["scenario_name"] + + + + + # Set parameters (advanced) + # ------------------------- + + # dictionaries with condition and venue names + # conditions are coded as numbers in microsim output + conditions_dict = { + "susceptible": 0, + "exposed": 1, + "presymptomatic": 2, + "symptomatic": 3, + "asymptomatic": 4, + "recovered": 5, + "dead": 6, + } + # venues are coded as strings - redefined here so script works as standalone, could refer to ActivityLocations instead + locations_dict = { + "PrimarySchool": "PrimarySchool", + "SecondarySchool": "SecondarySchool", + "Retail": "Retail", + "Work": "Work", + "Home": "Home", + } + + # default list of tools for plots + tools = "crosshair,hover,pan,wheel_zoom,box_zoom,reset,box_select,lasso_select" + + # colour schemes for plots + # colours for line plots + colour_dict = { + "susceptible": "grey", + "exposed": "blue", + "presymptomatic": "orange", + "symptomatic": "red", + "asymptomatic": "magenta", + "recovered": "green", + "dead": "black", + "Retail": "blue", + "PrimarySchool": "orange", + "SecondarySchool": "red", + "Work": "black", + "Home": "green", + } + # colours for heatmaps and choropleths for conditions (colours_ch_cond) and venues/danger scores (colours_ch_danger) + colours_ch_cond = { + "susceptible": brewer['Blues'][8][::-1], + "exposed": brewer['YlOrRd'][8][::-1], + "presymptomatic": brewer['YlOrRd'][8][::-1], + "symptomatic": brewer['YlOrRd'][8][::-1], + "asymptomatic": brewer['YlOrRd'][8][::-1], + "recovered": brewer['Greens'][8][::-1], + "dead": brewer['YlOrRd'][8][::-1], + } + colours_ch_danger = brewer['YlOrRd'][8][::-1] + # other good palettes / way to define: + # palette = brewer['BuGn'][8][::-1] # -1 reverses the order + # palette = = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"] + + # Nr days, runs, scenarios + # 
------------------------ + # check user input or revert to default + + if sc_nam is None: # no scenarios defined, use output directory + sc_dir = ["output"] + sc_nam = ["Scenario"] + else: # add output to each directory + for d in range(0,len(sc_dir)): + sc_dir[d] = os.path.join("output", sc_dir[d]) + + nr_scenarios = len(sc_dir) + + # directory to read data from + data_dir = "data" if (data_dir_user is None) else data_dir_user + data_dir = os.path.join(base_dir, data_dir) # update data dir + + # base file name + file_name = "dashboard" if (output_name_user is None) else output_name_user + + # start and end day + start_day = 0 if (start_day_user is None) else start_day_user + end_day = calc_nr_days(os.path.join(data_dir, sc_dir[0],"0","Retail.pickle"))-1 if (end_day_user is None) else end_day_user + nr_days = end_day - start_day + 1 + + # start and end run + start_run = 0 if (start_run_user is None) else start_run_user + end_run = len(next(os.walk(os.path.join(data_dir, "output")))[1]) -1 if (end_run_user is None) else end_run_user + nr_runs = end_run - start_run + 1 + r_range = range(start_run, end_run+1) + + + dict_days = [] # empty list for column names 'Day0' etc + for d in range(start_day, end_day+1): + dict_days.append(f'Day{d}') + + + # Read in third party data + # ------------------------ + + # read in details about venues + data_file = os.path.join(data_dir, "devon-schools","exeter schools.csv") + schools = pd.read_csv(data_file) + data_file = os.path.join(data_dir, "devon-retail","devon smkt.csv") + retail = pd.read_csv(data_file) + + # load in shapefile with England MSOAs for choropleth + sh_file = os.path.join(data_dir, "MSOAS_shp","bcc21fa2-48d2-42ca-b7b7-0d978761069f2020412-1-12serld.j1f7i.shp") + map_df = gpd.read_file(sh_file) + # rename column to get ready for merging + map_df.rename(index=str, columns={'msoa11cd': 'Area'},inplace=True) + + # postcode to MSOA conversion (for retail data) + data_file = os.path.join(data_dir, "PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv") + postcode_lu = pd.read_csv(data_file, encoding = "ISO-8859-1", usecols = ["pcds", "msoa11cd"]) + + + + # Read in and process pickled output from microsim + # ------------------------------------------------ + + # read in and process pickle files location/venue dangers + # create one or more dangers_dict (only using means i.e. 
first output of function for now) + + # if there is only 1 scenario, plot as usual + if nr_scenarios == 1: + dangers_dict = create_venue_dangers_dict(locations_dict,r_range,os.path.join(data_dir,sc_dir[0]),start_day,end_day,start_run,nr_runs)[0] + + # if there are 2 scenarios, replace dangers_dict by the difference between scenario 0 and 1 + elif nr_scenarios == 2: + dangers_dict_sc0 = create_venue_dangers_dict(locations_dict,r_range,os.path.join(data_dir,sc_dir[0]),start_day,end_day,start_run,nr_runs)[0] + dangers_dict_sc1 = create_venue_dangers_dict(locations_dict,r_range,os.path.join(data_dir,sc_dir[1]),start_day,end_day,start_run,nr_runs)[0] + # calculate difference between 2 scenarios + dangers_dict = {} # this is used for plotting + dangers_dict = create_difference_dict(dangers_dict_sc0,dangers_dict_sc1,locations_dict) + + if nr_scenarios >= 2: + scen_hist_venues = {} + scen_hist_venues["Venues"] = [] + scen_time_venues = {} + + for s in range(0,nr_scenarios): + dangers_dict_tmp = create_venue_dangers_dict(locations_dict,r_range,os.path.join(data_dir,sc_dir[s]),start_day,end_day,start_run,nr_runs)[0] + scen_hist_venues[sc_nam[s]] = [] + + for key, value in dangers_dict.items(): + if s == 0: + scen_hist_venues["Venues"].append(key) + scen_time_venues[key] = dict({"Day":dict_days, sc_nam[0]:dangers_dict_tmp[key].mean(axis=0)}) + else: + scen_time_venues[key][sc_nam[s]] = dangers_dict_tmp[key].mean(axis=0) + scen_hist_venues[sc_nam[s]].append(dangers_dict_tmp[key].mean(axis=0).mean()) + + + # retrieve data + #tmp = scen_time_data["Retail"] + + + if nr_scenarios <= 2: + # Add additional info about schools and retail including spatial coordinates + # merge + primaryschools = pd.merge(schools, dangers_dict["PrimarySchool"], left_index=True, right_index=True) + secondaryschools = pd.merge(schools, dangers_dict["SecondarySchool"], left_index=True, right_index=True) + retail = pd.merge(retail, dangers_dict["Retail"], left_index=True, right_index=True) + + # creat LUT + lookup = dict(zip(postcode_lu.pcds, postcode_lu.msoa11cd)) # zip together the lists and make a dict from it + # use LUT and add column to retail variable + msoa_code = [lookup.get(retail.postcode[i]) for i in range(0, len(retail.postcode), 1)] + retail.insert(2, 'MSOA_code', msoa_code) + + # normalised danger scores per msoa for schools and retail (for choropleth) + dangers_msoa_dict = create_msoa_dangers_dict(dangers_dict,['Retail','PrimarySchool','SecondarySchool'],[retail.MSOA_code,schools.MSOA_code,schools.MSOA_code]) + + + # age brackets + age_cat = np.array([[0, 19], [20, 29], [30,44], [45,59], [60,74], [75,200]]) + # label for plotting age categories + age_cat_str = [] + for a in range(age_cat.shape[0]): + age_cat_str.append(f"{age_cat[a,0]}-{age_cat[a,1]}") + + + + + + + + # counts per condition + if nr_scenarios == 1: + results_tmp = create_counts_dict(conditions_dict,r_range,os.path.join(data_dir, sc_dir[0]),start_day,end_day,start_run,nr_runs,age_cat) # get all + #only pick results variables needed + msoas, totalcounts_dict, cumcounts_dict, agecounts_dict, msoacounts_dict, cumcounts_dict_3d, totalcounts_dict_std, cumcounts_dict_std, agecounts_dict_std = results_tmp[0:9] + uniquecounts_dict_std, uniquecounts_dict = results_tmp[14:16] + + elif nr_scenarios == 2: + # if there is one other scenario, replace the counts dictionaries by the difference between scenario 0 and 1 + results_tmp = create_counts_dict(conditions_dict,r_range,os.path.join(data_dir,sc_dir[0]),start_day,end_day,start_run,nr_runs,age_cat) # get all + 
msoas, totalcounts_dict_sc0, cumcounts_dict_sc0, agecounts_dict_sc0, msoacounts_dict_sc0, cumcounts_dict_3d_sc0 = results_tmp[0:6] + results_tmp = create_counts_dict(conditions_dict,r_range,os.path.join(data_dir,sc_dir[1]),start_day,end_day,start_run,nr_runs,age_cat) # get all + totalcounts_dict_sc1, cumcounts_dict_sc1, agecounts_dict_sc1, msoacounts_dict_sc1, cumcounts_dict_3d_sc1 = results_tmp[1:6] + # calculate differenes + msoacounts_dict = create_difference_dict(msoacounts_dict_sc0,msoacounts_dict_sc1,conditions_dict) + agecounts_dict = create_difference_dict(agecounts_dict_sc0,agecounts_dict_sc1,conditions_dict) + totalcounts_dict = create_difference_dict(totalcounts_dict_sc0,totalcounts_dict_sc1,conditions_dict) + cumcounts_dict = create_difference_dict(cumcounts_dict_sc0,cumcounts_dict_sc1,conditions_dict) + # make sure std are zero, subtract from itself + agecounts_dict_std = create_difference_dict(agecounts_dict_sc0,agecounts_dict_sc0,conditions_dict) + totalcounts_dict_std = create_difference_dict(totalcounts_dict_sc0,totalcounts_dict_sc0,conditions_dict) + cumcounts_dict_std = create_difference_dict(cumcounts_dict_sc0,cumcounts_dict_sc0,conditions_dict) + + + + if nr_scenarios >= 2: + scen_hist_conditions = {} + scen_hist_conditions["Condition"] = [] + scen_time_conditions = {} + + for s in range(0,nr_scenarios): + result_tmp = create_counts_dict(conditions_dict,r_range,os.path.join(data_dir, sc_dir[s]),start_day,end_day,start_run,nr_runs,age_cat) + totalcounts_dict_tmp = result_tmp[1] + uniquecounts_dict_tmp = result_tmp[-1] + + scen_hist_conditions[sc_nam[s]] = [] + + for key, value in conditions_dict.items(): + if s == 0: + scen_hist_conditions["Condition"].append(key) + scen_time_conditions[key] = dict({"Day":dict_days, sc_nam[0]:totalcounts_dict_tmp[key]}) + else: + scen_time_conditions[key][sc_nam[s]] = totalcounts_dict_tmp[key] + scen_hist_conditions[sc_nam[s]].append(uniquecounts_dict_tmp[key].values[0]) + + + # Plotting + # -------- + + # MSOA nrs (needs nrs not strings to plot) + msoas_nr = [i for i in range(0,len(msoas))] + + # days (needs list to plot) + days = [i for i in range(start_day,end_day+1)] + + + + if nr_scenarios <= 2: + + # determine where/how the visualization will be rendered + html_output = os.path.join(data_dir, f'{file_name}.html') + output_file(html_output, title='RAMP-UA microsim output') # Render to static HTML + #output_notebook() # To tender inline in a Jupyter Notebook + + # optional: threshold map to only use MSOAs currently in the study or selection + map_df = map_df[map_df['Area'].isin(msoas)] + + # basic tool tip for condition plots + tooltips_cond_basic=[] + for key, value in totalcounts_dict.items(): + tooltips_cond_basic.append(tuple(( f"Nr {key}", f"@{key}"))) + + tooltips_venue_basic=[] + for key, value in dangers_dict.items(): + tooltips_venue_basic.append(tuple(( f"Danger {key}", f"@{key}"))) + + # empty dictionary to track condition and venue specific plots + plotref_dict = {} + + # create heatmaps condition + for key,value in conditions_dict.items(): + plot_heatmap_condition(key) + + # create heatmaps venue dangers + for key,value in dangers_msoa_dict.items(): + plot_heatmap_danger(key) + + # disease conditions across time + plot_cond_time() + + # disease conditions across msoas + plot_cond_msoas() + + # choropleth conditions + for key,value in conditions_dict.items(): + plot_choropleth_condition_slider(key) + + # choropleth dangers + for key,value in dangers_msoa_dict.items(): + plot_choropleth_danger_slider(key) + + # danger 
scores across time per venue type + plot_danger_time() + + + # conditions across time per age category + plot_cond_time_age() + + + # Layout and output + + tab1 = Panel(child=row(plotref_dict["cond_time"], plotref_dict["cond_msoas"]), title='Summary conditions') + + tab2 = Panel(child=row(plotref_dict["hmsusceptible"],column(plotref_dict["chslsusceptible"],plotref_dict["chplsusceptible"])), title='Susceptible') + + tab3 = Panel(child=row(plotref_dict["hmexposed"],column(plotref_dict["chslexposed"],plotref_dict["chplexposed"])), title='Exposed') + + tab4 = Panel(child=row(plotref_dict["hmpresymptomatic"],column(plotref_dict["chslpresymptomatic"],plotref_dict["chplpresymptomatic"])), title='Presymptomatic') + + tab5 = Panel(child=row(plotref_dict["hmsymptomatic"],column(plotref_dict["chslsymptomatic"],plotref_dict["chplsymptomatic"])), title='Symptomatic') + + tab6 = Panel(child=row(plotref_dict["hmasymptomatic"],column(plotref_dict["chslasymptomatic"],plotref_dict["chplasymptomatic"])), title='Asymptomatic') + + tab7 = Panel(child=row(plotref_dict["hmrecovered"],column(plotref_dict["chslrecovered"],plotref_dict["chplrecovered"])), title='Recovered') + + tab8 = Panel(child=row(plotref_dict["hmdead"],column(plotref_dict["chsldead"],plotref_dict["chpldead"])), title='Dead') + + tab9 = Panel(child=row(plotref_dict["danger_time"]), title='Summary dangers') + + tab10 = Panel(child=row(plotref_dict["hmRetail"],column(plotref_dict["chslRetail"],plotref_dict["chplRetail"])), title='Danger retail') + + tab11 = Panel(child=row(plotref_dict["hmPrimarySchool"],column(plotref_dict["chslPrimarySchool"],plotref_dict["chplPrimarySchool"])), title='Danger primary school') + + tab12 = Panel(child=row(plotref_dict["hmSecondarySchool"],column(plotref_dict["chslSecondarySchool"],plotref_dict["chplSecondarySchool"])), title='Danger secondary school') + + tab13 = Panel(child=row(plotref_dict["cond_time_age_susceptible"],plotref_dict["cond_time_age_presymptomatic"],plotref_dict["cond_time_age_symptomatic"],plotref_dict["cond_time_age_recovered"],plotref_dict["cond_time_age_dead"]), title='Breakdown by age') + + + # Put the Panels in a Tabs object + tabs = Tabs(tabs=[tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8, tab9, tab10, tab11, tab12, tab13]) + + show(tabs) + + if nr_scenarios >= 2: + # determine where/how the visualization will be rendered + html_output = os.path.join(data_dir, f'{file_name}_scenarios.html') + output_file(html_output, title='RAMP-UA microsim output scenario comparison') # Render to static HTML + #output_notebook() # To tender inline in a Jupyter Notebook + + scen_palette = ["darkgrey","royalblue","olive","orange","darkviolet","firebrick","tomato","deeppink","lightseagreen","limegreen"] + + plot_scenario_hist(scen_hist_venues,"Venues","Danger scores") + + plot_scenario_hist(scen_hist_conditions,"Condition","Nr people") + + + for key,value in conditions_dict.items(): + plot_scenario_time(scen_time_conditions,"Conditions","Nr people",key) + + for key,value in dangers_dict.items(): + plot_scenario_time(scen_time_venues,"Venues","Danger scores",key) + + # Layout and output + + tab1 = Panel(child=row(plotref_dict["scen_hist_Condition"], plotref_dict["scen_hist_Venues"]), title='Histograms') + + tab2 = Panel(child=row(plotref_dict["scen_time_Conditions_susceptible"], plotref_dict["scen_time_Conditions_presymptomatic"], plotref_dict["scen_time_Conditions_symptomatic"], plotref_dict["scen_time_Conditions_recovered"], plotref_dict["scen_time_Conditions_dead"]), title='Conditions') + + tab3 = 
Panel(child=row(plotref_dict["scen_time_Venues_Retail"], plotref_dict["scen_time_Venues_PrimarySchool"], plotref_dict["scen_time_Venues_SecondarySchool"], plotref_dict["scen_time_Venues_Work"], plotref_dict["scen_time_Venues_Home"]), title='Venues') + + # Put the Panels in a Tabs object + tabs = Tabs(tabs=[tab1, tab2, tab3]) + + show(tabs) + + + +if __name__ == "__main__": + create_dashboard() + print("End of program") + + + + + diff --git a/build/lib/microsim/microsim_initialisation.py b/build/lib/microsim/microsim_initialisation.py new file mode 100644 index 00000000..2ebb7813 --- /dev/null +++ b/build/lib/microsim/microsim_initialisation.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Used to create data that can be used to parameterise the disease estimates. These data +need to be created so that the coefficients can be calculated. + +Created on Wed 39 May 14:30 + +@author: nick +""" +import sys +sys.path.append("microsim") +import click # command-line interface +import os +import pandas as pd +import random +import copy +import multiprocessing + +from microsim_model import Microsim +from column_names import ColumnNames + + +class MicrosimInit(Microsim): + """ + Create some preliminary data that are needed to estimate coefficients for the disease estimates. + + Process is as follows: + 1. Read a list of the most high risk MSOAs per day for N (=6?) days + 2. Read the total number of incidences per day. + 3. Then iterate for N days: + - At the start of each day, randomly distribute cases to randomly chosen individuals to + the high-risk MSOAs + - Run step the model to calculate a danger score + - Save the danger score for each individual each day + + Also repeat the above process M times. + """ + + def __init__(self, msoa_danger: pd.DataFrame, cases: pd.DataFrame, results_dir: str, + *args, **kwargs): + """ + Initialise the MicrosimInit. Some specific parameters are required for initialisation, + everything else is passed to the parent constructor + + :param msoa_danger: A list of MSOAs with an associated danger rating 'high', 'medium', 'low' + :param cases: Number of cases per iteration. + :param results_dir: The root results directory (actual results from this model will be written + into a unique subdirectory + + """ + super(MicrosimInit, self).__init__(*args, **kwargs) + self.msoa_danger = msoa_danger + self.results_dir = results_dir + self.cases = cases + # Work out which areas and individuals are high risk (might be assigned to be a case) + self.high_risk_msoas = self.msoa_danger.loc[self.msoa_danger.risk == "High", "area"].values + # Now get the indices of the high risk individuals (all those who live in high risk areaa) + self.high_risk_individuals = self.individuals.index[ + self.individuals["Area"].isin(self.high_risk_msoas)] + assert len(self.individuals.loc[self.high_risk_individuals, :]["Area"].unique()) == len(self.high_risk_msoas) + + @staticmethod + def run(m: Microsim, results_subdirectory: str): + """ + Run the initialisation for a model. 
+ + :param m: A microsim object run + :param results_subdirectory: Where to write the results for this model + :return: + """ + # Create a directory for these results + try: + os.makedirs(results_subdirectory) + except FileExistsError as e: + print("Directory ", results_subdirectory, " already exists") + raise e + + # Maintain a new dataframe that has the risk per individual for each iteration + individual_risks = pd.DataFrame(m.individuals.loc[:, ["Area", ColumnNames.CURRENT_RISK] ]) + # Do the same for each activity + activity_dangers = {} + for name, activity_location in m.activity_locations.items(): + activity_dangers[name] = activity_location._locations.copy() + + if len(m.cases) > 100: + raise Exception("Internal error. If there are more than 100 days of cases then the format" + "line below which is used to create a new column for each day as a two-digit " + "number, will need to be adapted to make 3-digit numbers.") + + # Loop for every iteration (get this from cases) + for i, row in m.cases.iterrows(): + print(i, row['date'], row['new_cases']) + # Reset everyone's disease status + m.individuals.loc[:,ColumnNames.DISEASE_STATUS] = 0 + + # Manually change people's activity durations after lockdown + if i > 39: # After day 39 - March 23RD in new cases + total_duration = 0.0 + for colum_name in ['Retail', 'PrimarySchool', 'SecondarySchool', 'Work']: + new_duration = m.individuals.loc[:, colum_name+ ColumnNames.ACTIVITY_DURATION] * 0.33 + # Round the new duration to prevent tiny numbers + new_duration = round(new_duration, 10) + total_duration += new_duration + m.individuals.loc[:, colum_name + ColumnNames.ACTIVITY_DURATION] = new_duration + + # Now set home + m.individuals.loc[:, 'Home'+ ColumnNames.ACTIVITY_DURATION] = (1 - total_duration) + + # If you want to loop over all activities this is how you do it: + #for name, activity_location in m.activity_locations.items(): + # Reduce the duration of all activites by 0.9: + #m.individuals.loc[ : , name+ColumnNames.ACTIVITY_DURATION ] = + #m.individuals.loc[ : , name+ColumnNames.ACTIVITY_DURATION ] * 0.9 + + # Randomly assign cases to individuals + num_cases = row['new_cases'] + random.seed() # Sometimes different Processes can be given the same generator and seed + infected_individuals = random.sample(list(m.high_risk_individuals), num_cases) + assert len(infected_individuals) == num_cases + m.individuals.loc[infected_individuals, ColumnNames.DISEASE_STATUS] = 1 + assert len(m.individuals.loc[m.individuals[ColumnNames.DISEASE_STATUS] == 1]) == num_cases + # Aggregate and count the cases per area and check they are the correct numbers of cases + assert m.individuals.loc[m.individuals.Disease_Status > 0, :]["Area"].value_counts().sum() == num_cases + + # Step the model + m.step() + + # Write out some of the info about individuals as a csv (verbose format string is to make the iteration two-digit) + #self.individuals.to_csv(os.path.join(results_subdirectory,"individuals-{0:0=2d}.csv".format(i))) + + # Remember the information about the activity locations + for name, activity_location in m.activity_locations.items(): + activity_dangers[name][ColumnNames.LOCATION_DANGER+"{0:0=2d}".format(i)] = \ + activity_location._locations[ColumnNames.LOCATION_DANGER] + + # Remember the current risk per individual per day + individual_risks[ColumnNames.CURRENT_RISK+"{0:0=2d}".format(i)] = m.individuals.loc[:, ColumnNames.CURRENT_RISK] + + # Write out the risks per individual each day + individual_risks.to_csv(os.path.join(results_subdirectory, 
"individual_risks.csv")) + # And the locations dangers per day + for name, activity_df in activity_dangers.items(): + activity_df.to_csv(os.path.join(results_subdirectory, f"{name}.csv")) + + return individual_risks + + @staticmethod + def make_a_copy(m: Microsim): + """When copying a microsim object, reset the seed""" + m.random.seed() + return copy.deepcopy(m) + + +# PROGRAM ENTRY POINT +# Uses 'click' library so that it can be run from the command line +@click.command() +@click.option('--repetitions', default=5, help='Number of times to repeat the initialisation process') +@click.option('--data_dir', default="devon_data", help='Root directory to load main model data from') +@click.option('--init_dir', default="init_data", help="Directory that stores initialisation data, and where outputs are written") +@click.option('--multiprocess', default=False, help="Whether to run multiprocess or not") +@click.option('--debug', default=False, help="Whether to run some more expensive checks (default True)") +def run_script(repetitions, data_dir, init_dir, multiprocess, debug): + # To fix file path issues, use absolute/full path at all times + base_dir = os.getcwd() # get current directory + data_dir = os.path.join(base_dir, data_dir) + init_dir = os.path.join(base_dir, init_dir) + results_dir = os.path.join(init_dir, "results/") + + print(f"Running initialisation process {repetitions} times.\n\t" + f"Using model data in {data_dir}, initialisation data in {init_dir}\n\t" + f"Storing results in {results_dir}.") + + # Read the initialisation data + cases = pd.read_csv(os.path.join(init_dir, "devon_cases_fn.csv")) + # Cut off cases after 10 + #cases = cases.loc[0:10, :] + msoa_danger = pd.read_csv(os.path.join(init_dir, "msoa_danger_fn.csv")) + + # Sense check: all MSOAs in Devon should have a danger score + devon_msoas = pd.read_csv(os.path.join(data_dir, "devon_msoas.csv"), header=None, + names=["x", "y", "Num", "Code", "Desc"]) + assert len(devon_msoas) == len(msoa_danger) + assert set(devon_msoas.Code) == set(msoa_danger.area) + + # Initialise a model (MirosimInit init is a child of Microsim) + m = MicrosimInit(msoa_danger=msoa_danger, cases=cases, results_dir=results_dir, + # These are for the parent Microsim object + study_msoas=list(devon_msoas.Code), data_dir=data_dir, output=False, debug=debug, + # TODO Since implementing this we have the disease status stuff working. Check that deactivating it works. + disable_disease_status=True # Turn off disease status calculation as we want to run it without this + ) + + # Find a new directory for this initialisation (may have old ones) + i = 0 + while os.path.exists(os.path.join(results_dir, str(i))): + i += 1 + # Create a directory for these results + results_subdir = os.path.join(results_dir, str(i) ) + try: + os.mkdir(results_subdir) + except FileExistsError as e: + print("Directory ", results_subdir, " already exists") + raise e + + # Write out the individuals as they were before the model(s) started iterating + m.individuals.to_csv(os.path.join(results_subdir, "individuals.csv")) + + # For each repetition, remember risks of each individual and dangers of each location + individual_risks = [] + if repetitions <= 1 or not multiprocess: # Run in a single process + for j in range(0, repetitions): + # Now find subdirectories for each individual run + subdir = os.path.join(results_subdir, str(j)) + # Run it, writing out data, and returing the risks per day. 
+ # Copy the model initialisation instance each time (although maybe this not strictly necessary + individual_risks.append( + MicrosimInit.run(MicrosimInit.make_a_copy(m), subdir)) + else: # Run as multiprocess + subdirs = [ os.path.join(results_subdir, str(j)) for j in range(repetitions) ] + models = (MicrosimInit.make_a_copy(m) for _ in range(repetitions)) # Generator so dont need to do all copies at once + with multiprocessing.Pool(processes=int(os.cpu_count()) ) as pool: + try: + #individual_risks = pool.map(MicrosimInit.run, zip(models, subdirs)) + individual_risks = pool.starmap(MicrosimInit.run, zip(models, subdirs)) + finally: + pool.close() + + print("End of initialisation") + +if __name__ == "__main__": + run_script() + print("End of initialisation") + diff --git a/build/lib/microsim/microsim_model.py b/build/lib/microsim/microsim_model.py new file mode 100644 index 00000000..48e4adc4 --- /dev/null +++ b/build/lib/microsim/microsim_model.py @@ -0,0 +1,1780 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Core RAMP-UA model. + +Created on Wed Apr 29 19:59:25 2020 + +@author: nick +""" +import os +# directory to read data from +tmp_base_dir = os.getcwd() # get current directory (usually RAMP-UA) +tmp_microsim_dir = os.path.join(tmp_base_dir, "microsim") # go to data dir +os.chdir(tmp_microsim_dir) + +import sys +#import os +#owd = os.getcwd() +#os.chdir(os.path.join(owd,"microsim")) + +sys.path.append("microsim") # This is only needed when testing. I'm so confused about the imports +from microsim.activity_location import ActivityLocation +from microsim.r_interface import RInterface +from microsim.column_names import ColumnNames +from microsim.utilities import Optimise +import multiprocessing +import copy + +import pandas as pd + +pd.set_option('display.expand_frame_repr', False) # Don't wrap lines when displaying DataFrames +# pd.set_option('display.width', 0) # Automatically find the best width +import numpy as np +import glob +import os +import random +import time +import re # For analysing file names +import warnings +from collections.abc import Iterable # drop `.abc` with Python 2.7 or lower +from typing import List, Dict +from tqdm import tqdm # For a progress bar +import click # command-line interface +import pickle # to save data +import swifter # For speeding up apply functions (e.g. df.swifter.apply) +import rpy2.robjects as ro # For calling R scripts +from yaml import load, dump, SafeLoader # pyyaml library for reading the parameters.yml file +from shutil import copyfile + +USE_QUANT_DATA = False # Temorary flag to use UCL SIMs or Devon ones. Needs to become a command-line parameter +#import QUANTRampAPI2 as qa + +#import pandas.rpy.common as com # throws error and doesn't seem to be used? + +os.chdir(tmp_base_dir) + +#os.chdir(owd) + +class Microsim: + """ + A class used to represent the microsimulation model. + + TODO: Document class. Please use reStructuredText format: + https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html + ... 
+ + Attributes + ---------- + XXXX + + Methods + ------- + XXXX + """ + + def __init__(self, + data_dir: str = "./data/", + scen_dir: str = "default", + r_script_dir: str = "./R/py_int/", + hazard_individual_multipliers: Dict[str, float] = {}, + hazard_location_multipliers: Dict[str, float] = {}, + risk_multiplier: float = 1.0, + lockdown_file: str = "google_mobility_lockdown_daily.csv", + random_seed: float = None, read_data: bool = True, + testing: bool = False, + output: bool = True, + output_every_iteration = False, + debug=False, + disable_disease_status=False, + disease_params: Dict = {} + ): + """ + Microsim constructor. This reads all of the necessary data to run the microsimulation. + ---------- + :param data_dir: A data directory from which to read the source data + :param scen_dir: A data directory to write the output to (i.e. a name for this model run) + :param r_script_dir: A directory with the required R scripts in (these are used to estimate disease status) + :param hazard_individual_multipliers: A dictionary containing multipliers that can make particular disease + statuses more hazardous to the locations that the people visit. See 'hazard_individual_multipliers' section + of the parameters file (e.g. model_parameters/default.yml). Default is {} which means the multiplier will + be 1.0 + :hazard_location_multipliers: A dictionary containing multipliers that can make certain locations + more hazardous (e.g. easier to get the disease if you visit). See 'hazard_location_multipliers' section + of the parameters file (e.g. model_parameters/default.yml). Default is {} which means the multiplier will + be 1.0 + :param random_seed: A optional random seed to use when creating the class instance. If None then + the current time is used. + :param read_data: Optionally don't read in the data when instantiating this Microsim (useful + in debugging). + :param testing: Optionally turn off some exceptions and replace them with warnings (only good when testing!) + :param output: Whether to create files to store the results (default True) + :param output_every_iteration: Whether to create files to store the results at every iteration rather than + just at the end (default False) + :param lockdown_file: The file to use for estimating mobility under lockdown, or don't do any lockdown + if this is an empty string. + :param debug: Whether to do some more intense error checks (e.g. for data inconsistencies) + :param disable_disease_status: Optionally turn off the R interface. This will mean we cannot calculate new + disease status. Only good for testing. 
+ :param disease_params: Optional parameters that are passed to the R code that estimates disease status + (a dictionary, assumed to be empty) + """ + + # Administrative variables that need to be defined + Microsim.DATA_DIR = data_dir # TODO (minor) pass the data_dir to class functions directly so no need to have it defined at class level + self.DATA_DIR = data_dir + self.SCEN_DIR = scen_dir + self.iteration = 0 + self.hazard_individual_multipliers = hazard_individual_multipliers + Microsim.__check_hazard_location_multipliers(hazard_location_multipliers) + self.hazard_location_multipliers = hazard_location_multipliers + self.risk_multiplier = risk_multiplier + self.lockdown_file = lockdown_file + self.do_lockdown = False if (lockdown_file == "") else True + self.random = random.Random(random_seed) + self.output = output + self.output_every_iteration = output_every_iteration + self.r_script_dir = r_script_dir + Microsim.debug = debug + self.disable_disease_status = disable_disease_status + self.disease_params = disease_params + Microsim.testing = testing + if self.testing: + warnings.warn("Running in testing mode. Some exceptions will be disabled.") + + if not read_data: # Optionally can not do this, usually for debugging + return + + # create full path for scenario dir, also check if scenario dir exists and if so, add nr + self.SCEN_DIR = Microsim._find_new_directory(os.path.join(self.DATA_DIR, "output"), self.SCEN_DIR) + + # We need an interface to R code to calculate disease status, but don't initialise it until the run() + # method is called so that the R process is initiatied in the same process as the Microsim object + self.r_int = None + + # If we do output information, then it will go to this directory. This is determined in run(), rather than + # here, because otherwise all copies of this object will have the same output directory. + self.output_dir = None + + # Now the main chunk of initialisation is to read the input data. + + # Begin by reading the individuals. This includes core information about the population as well as the + # durations that people spend doing activities. + # This also creates flows and venues columns for the journeys of individuals to households, and makes a new + # households dataset to replace the one we read in above. + home_name = ColumnNames.Activities.HOME # How to describe flows to people's houses + self.individuals, self.households = Microsim.read_individual_time_use_and_health_data(home_name) + + # Extract a list of all MSOAs in the study area. Will need this for the new SIMs + self.all_msoas = Microsim.extract_msoas_from_indiviuals(self.individuals) + + + # + # ********** How to assign activities for the population ********** + # + # For each 'activity' (e.g shopping), we need to store the following things: + # + # 1. A data frame of the places where the activities take place (e.g. a list of shops). Refered to as + # the 'locations' dataframe. Importantly this will have a 'Danger' column which records whether infected + # people have visited the location. + # + # 2. Columns in the individuals data frame that says which locations each individual is likely to do that + # activity (a list of 'venues'), how likely they are to do to the activity (a list of 'flows'), and the + # duration spent doing the activity. + # + # For most activities, there are a number of different possible locations that the individual + # could visit. The 'venues' column stores a list of indexes to the locations dataframe. E.g. one individual + # might have venues=[2,54,19]. 
Those numbers refer to the *row numbers* of locations in the locations + # dataframe. So venue '2' is the third venue in the list of all the locaitons associated with that activity. + # Flows are also a list, e.g. for that individual the flows might be flows=[0.8,0.1,0.1] which means they + # are most likely to go to venue with index 2, and less likely to go to the other two. + # + # For some activities, e.g. 'being at home', each individual has only a single location and one flow, so their + # venues and flows columns will only be single-element lists. + # + # For multi-venue activities, the process is as follows (easiest to see the retail or shopping example): + # 1. Create a dataframe of individual locations and use a spatial interation model to estimate flows to those + # locations, creating a flow matrix. E.g. the `read_retail_flows_data()` function + # 2. Run through the flow matrix, assigning all individuals in each MSOA the appropriate flows. E.g. the + # `add_individual_flows()` function. + # 3. Create an `ActivityLocation` object to store information about these locations in a standard way. + # When they are created, these `ActivityLocation` objects will also add another column + # to the individuals dataframe that records the amount of time they spend doing the activity + # (e.g. 'RETAIL_DURATION'). These raw numbers were attached earlier in `attach_time_use_and_health_data`. + # (Again see the retail example below). + # These ActivityLocation objects are stored in a dictionary (see `activity_locations` created above). This makes + # it possible to run through all activities and calculate risks and dangers using the same code. + # + + # For each type of activity (store, retail, etc), create ActivityLocation objects to keep all the + # required information together. + self.activity_locations: Dict[str, ActivityLocation] = {} + + # Create 'activity locations' for the activity of being at home. (This is done for other activities, + # like retail etc, when those data are read in later. + self.activity_locations[home_name] = ActivityLocation(name=home_name, locations=self.households, + flows=None, individuals=self.individuals, + duration_col="phome") + + # Generate travel time columns and assign travel modes to some kind of risky activity (not doing this yet) + # self.individuals = Microsim.generate_travel_time_colums(self.individuals) + # One thing we do need to do (this would be done in the function) is replace NaNs in the time use data with 0 + # for col in ["pwork", "_pschool", "pshop", "pleisure", "ptransport", "pother"]: + for col in ["punknown", "phome", "pworkhome", "pwork", "_pschool", "pshop", "pservices", "pleisure", + "pescort", "ptransport", "pnothome", "phometot", "pmwalk", "pmcycle", "pmprivate", + "pmpublic", "pmunknown"]: + self.individuals[col].fillna(0, inplace=True) + + + # Read Retail flows data + retail_name = ColumnNames.Activities.RETAIL # How to refer to this in data frame columns etc. 
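+ # (Editor's illustrative aside, not used by the model: with the venues/flows representation described
+ # above, an individual whose Retail venues are [2, 54, 19] with flows [0.8, 0.1, 0.1] points at rows
+ # 2, 54 and 19 of the stores dataframe and is most likely to visit the first of those. A single visit
+ # could be sampled along the lines of:
+ # visited_venue = self.random.choices([2, 54, 19], weights=[0.8, 0.1, 0.1], k=1)[0]
+ # which returns venue index 2 roughly 80% of the time.)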
+ stores, stores_flows = Microsim.read_retail_flows_data(self.all_msoas) # (list of shops and a flow matrix) + Microsim.check_sim_flows(stores, stores_flows) + # Assign Retail flows data to the individuals + self.individuals = Microsim.add_individual_flows(retail_name, self.individuals, stores_flows) + self.activity_locations[retail_name] = \ + ActivityLocation(retail_name, stores, stores_flows, self.individuals, "pshop") + + # Read Schools (primary and secondary) + primary_name = ColumnNames.Activities.PRIMARY + secondary_name = ColumnNames.Activities.SECONDARY + primary_schools, secondary_schools, primary_flows, secondary_flows = \ + Microsim.read_school_flows_data(self.all_msoas) # (list of schools and a flow matrix) + Microsim.check_sim_flows(primary_schools, primary_flows) + Microsim.check_sim_flows(secondary_schools, secondary_flows) + # Assign Schools + # TODO: need to separate primary and secondary school duration. At the moment everyone is given the same + # duration, 'pschool', which means that children will be assigned a PrimarySchool duration *and* a + # seconary school duration, regardless of their age. I think the only way round this is to + # make two new columns - 'pschool_primary' and 'pschool_seconary', and set these to either 'pschool' + # or 0 depending on the age of the child. + self.individuals = Microsim.add_individual_flows(primary_name, self.individuals, primary_flows) + self.activity_locations[primary_name] = \ + ActivityLocation(primary_name, primary_schools.copy(), primary_flows, self.individuals, "pschool-primary") + self.individuals = Microsim.add_individual_flows(secondary_name, self.individuals, secondary_flows) + self.activity_locations[secondary_name] = \ + ActivityLocation(secondary_name, secondary_schools.copy(), secondary_flows, self.individuals, "pschool-secondary") + del primary_schools,secondary_schools # No longer needed as we gave copies to the ActivityLocation + + # Assign work. Each individual will go to a virtual office depending on their occupation (all accountants go + # to the virtual accountant office etc). This means we don't have to calculate a flows matrix (similar to homes) + # Occupation is taken from column soc2010 in individuals df + self.individuals['soc2010'] = self.individuals['soc2010'].astype(str) # These are integers but we need string + possible_jobs = sorted(self.individuals.soc2010.unique()) # list of possible jobs in alphabetical order + workplaces = pd.DataFrame({'ID': range(0, 0 + len(possible_jobs))}) # df with all possible 'virtual offices' + Microsim._add_location_columns(workplaces, location_names=possible_jobs) + work_name = ColumnNames.Activities.WORK + self.individuals = Microsim.add_work_flows(work_name, self.individuals, workplaces) + self.activity_locations[work_name] = ActivityLocation(name=work_name, locations=workplaces, flows=None, + individuals=self.individuals, duration_col="pwork") + + ## Some flows will be very complicated numbers. Reduce the numbers of decimal places across the board. 
+ ## This makes it easier to write out the files and to make sure that the proportions add up properly
+ ## Use multiprocessing because swifter doesn't work properly for some reason (won't parallelise)
+ #with multiprocessing.Pool(processes=int(os.cpu_count()/2)) as pool:
+ # for name in tqdm(self.activity_locations.keys(), desc="Rounding all flows"):
+ # rounded_flows = pool.map( Microsim._round_flows, list(self.individuals[f"{name}{ColumnNames.ACTIVITY_FLOWS}"]))
+ # self.individuals[f"{name}{ColumnNames.ACTIVITY_FLOWS}"] = rounded_flows
+ # # Use swifter, but for some reason it won't parallelise the problem. Not sure why.
+ # #self.individuals[f"{name}{ColumnNames.ACTIVITY_FLOWS}"] = \
+ # # self.individuals.loc[:,f"{name}{ColumnNames.ACTIVITY_FLOWS}"].\
+ # # swifter.allow_dask_on_strings(enable=True).progress_bar(True, desc=name).\
+ # # apply(lambda flows: [round(flow, 5) for flow in flows])
+
+ # Round the durations
+ for name in tqdm(self.activity_locations.keys(), desc="Rounding all durations"):
+ self.individuals[f"{name}{ColumnNames.ACTIVITY_DURATION}"] = \
+ self.individuals[f"{name}{ColumnNames.ACTIVITY_DURATION}"].apply(lambda x: round(x, 5))
+
+ # Some people's activity durations will not add up to 1.0 because we don't model all their activities.
+ # Extend the amount of time at home to make up for this
+ self.individuals = Microsim.pad_durations(self.individuals, self.activity_locations)
+
+ # Now that we have everyone's initial activities, remember the proportions of time that they spend doing things
+ # so that if these change (e.g. under lockdown) they can return to 'normality' later
+ for activity_name in self.activity_locations.keys():
+ self.individuals[f"{activity_name}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] = \
+ self.individuals[f"{activity_name}{ColumnNames.ACTIVITY_DURATION}"]
+
+ # Add some necessary columns for the disease
+ self.individuals = Microsim.add_disease_columns(self.individuals)
+
+ # Read a file that tells us how much more time people should spend at home than normal (this is much greater
+ # after lockdown)
+ if self.do_lockdown:
+ self.time_activity_multiplier: pd.DataFrame = Microsim.read_time_activity_multiplier(lockdown_file)
+ else:
+ self.time_activity_multiplier = None
+
+ print(" ... finished initialisation.")
+
+ return # finish __init__
+
+ @staticmethod
+ def __check_hazard_location_multipliers(hazard_location_multipliers):
+ """Check that the hazard location multipliers correspond to the actual locations being used
+ in the model (we don't want a hazard for a location that doesn't exist, and need all locations
+ to have hazards). If there are no hazard location multipliers (an empty dict) then we're not
+ using them so just return
+
+ :return None if all is OK. Throw an exception if not
+ :raise Exception if the multipliers don't align."""
+ if not hazard_location_multipliers:
+ return
+ hazard_activities = set(hazard_location_multipliers.keys())
+ all_activities = set(ColumnNames.Activities.ALL)
+ if hazard_activities != all_activities:
+ raise Exception(f"The hazard location multipliers: '{hazard_activities}' don't match the "
+ f"activities in the model: {all_activities}")
+
+ @staticmethod
+ def _find_new_directory(dir, scendir):
+ """
+ Find a new directory in which to store results, and create it, starting from 'dir'.
+ :param dir: Start looking from this directory + :return: The new directory (full path) + """ + results_subdir = os.path.join(dir,scendir) + # if it exists already, try adding numbers + i = 0 + while os.path.isdir(results_subdir): + i += 1 + newdir = scendir + "_" + str(i) + results_subdir = os.path.join(dir,newdir) + # Create a directory for these results + #results_subdir = os.path.join(dir, str(i)) + try: + os.mkdir(results_subdir) + except FileExistsError as e: + print("Directory ", results_subdir, " already exists") + raise e + return results_subdir + + + + @staticmethod + def _round_flows(flows): + return [round(flow, 5) for flow in flows] + + @classmethod + def _check_no_homeless(cls, individuals, households, warn=True): + """ + Check that each individual has a household. NOTE: this only works for the raw mirosimulation data. + Once the health data has been attached this wont work becuase the unique identifiers change. + If this function is still needed then it will need to take the specific IDs as arguments, but this is + a little complicated because some combination of [area, HID, (PID)] is needed for unique identification. + + :param individuals: + :param households: + :param warn: Whether to warn (default, True) or raise an exception (False) + :return: True if there are no homeless, False otherwise (unless `warn==False` in which case an + exception is raised). + :raise: An exception if `warn==False` and there are individuals without a household + """ + print("Checking no homeless (all individuals assigned to a household) ...", ) + # This will fail if used on anything other than the raw msm data because once I read in the + # health data the PID and HID columns are renamed to prevent them being accidentally used. + assert "PID" in individuals.columns and "HID" in households.columns + # Households in the msm are uniquely identified by [area,HID] combination. + # Individuals are identified by [House_OA,HID,PID] + hids = households.set_index(["area", "HID"]) # Make a new dataset with a unique index for households + # Find individuals who do not have a related entry in the households dataset + homeless = [(area, hid, pid) for area, hid, pid in individuals.loc[:, ["House_OA", "HID", "PID"]].values if + (area, hid) not in hids.index] + # (version using apply isn't quicker) + # h2 = individuals.reset_index().loc[:, ["House_OA", "HID", "PID"]].swifter.apply( + # lambda x: x[2] if (x[0], x[1]) in hids.index else None, axis=1) + # (Vectorised version doesn't quite work sadly) + # h2 = np.where(individuals.loc[:, ["House_OA", "HID", "PID"]].isin(hids.index), True, False) + if len(homeless) > 0: + msg = f"There are {len(homeless)} individuals without an associated household (HID)." + if warn: + warnings.warn(msg) + return False + else: + raise Exception(msg) + print("... finished checking homeless") + return True + + @classmethod + def extract_msoas_from_indiviuals(cls, individuals: pd.DataFrame) -> List[str]: + """ + Analyse a DataFrame of individuals and extract the unique MSOA codes, returning them as a list in ascending + order + :param individuals: + :return: + """ + areas = list(individuals.area.unique()) + areas.sort() + return areas + + @classmethod + def read_individual_time_use_and_health_data(cls, home_name: str) -> pd.DataFrame: + """ + Read a population of individuals. Includes time-use & health info. 
+
+ :param home_name: A string to describe flows to people's homes (probably 'Home')
+ :return: A tuple with new dataframes of individuals and households
+ """
+ print("Reading time use and health data ... ", )
+ # filename = os.path.join(cls.DATA_DIR, "devon-tu_health", "Devon_simulated_TU_health.txt")
+ # filename = os.path.join(cls.DATA_DIR, "devon-tu_health", "Devon_keyworker.txt")
+ # filename = os.path.join(cls.DATA_DIR, "devon-tu_health", "Devon_Complete.txt")
+ filename = os.path.join(cls.DATA_DIR, "devon-tu_health", "Devon_simulated_TU_keyworker_health.csv")
+
+ tuh = pd.read_csv(filename)
+ tuh = Optimise.optimize(tuh) # Reduce memory of tuh where possible.
+
+ # Drop people that weren't matched to a household originally
+ nohh = len(tuh.loc[tuh.hid == -1])
+ if nohh > 0:
+ warnings.warn(f"{nohh} / {len(tuh)} individuals in the TUH data had not originally been matched "
+ f"to a household. They're being removed")
+ tuh = tuh.loc[tuh.hid != -1]
+
+ # Indicate that HIDs and PIDs shouldn't be used as indices as they don't uniquely
+ # identify individuals / households in this health data
+ tuh = tuh.rename(columns={'hid': '_hid', 'pid': '_pid'})
+
+ # Make a new, unique id for each individual (PIDs have been replicated so they no longer uniquely identify individuals)
+ assert len(tuh.index.unique()) == len(tuh) # Index should have been set to row number when tuh was read in
+ tuh.insert(0, "ID", tuh.index, allow_duplicates=False) # Insert into first position
+
+ #
+ # ********** Create households dataframe *************
+ #
+
+ # Go through each individual. House members can be identified because they have the same [Area, HID]
+ # combination.
+ # Maintain a dictionary of (Area, HID) -> House_ID that records a new ID for each house
+ # Each time a new [Area, HID] combination is found, create a new entry in the households dictionary for that
+ # household, generate a House_ID, and record that in the dictionary.
+ # When existing (Area, HID) combinations are found, look up the ID in the dictionary and record it for that
+ # individual
+ # Also, maintain a list of house_ids in the same order as individuals in the tuh data which can be used later
+ # when we link from the individuals in the TUH data to their house id
+
+ # This is the main dictionary. It maps (Area, HID) to house id numbers, along with some more information:
+ house_ids_dict = {} # (Area, HID) -> [HouseIDNumber, NumPeople, area, hid]
+
+ house_ids_list = [] # ID of each house for each individual
+ house_id_counter = 0 # Counter to generate new HouseIDNumbers
+ unique_individuals = [] # Also store all [Area, HID, PID] combinations to check they are unique later
+
+ # Maybe quicker to loop over 3 lists simultaneously than through a DataFrame
+ _areas = list(tuh["area"])
+ _hids = list(tuh["_hid"])
+ _pids = list(tuh["_pid"])
+
+ for i, (area, hid, pid) in enumerate(zip(_areas, _hids, _pids)):
+ # print(i, area, hid, pid)
+ unique_individuals.append((area, hid, pid))
+ house_key = (area, hid) # Uniquely identifies a household
+ house_id_number = -1
+ try: # If this lookup works then we've seen this house before. Get its ID number and increase the number of people in it
+ house_info = house_ids_dict[house_key]
+ # Check the area and hid are the same as the one previously stored in the dictionary
+ assert area == house_info[2] and hid == house_info[3]
+ # Also check that the house key (Area, HID) matches the area and HID
+ assert house_key[0] == house_info[2] and house_key[1] == house_info[3]
+ # We need the ID number to tell the individual which house is theirs
+ house_id_number = house_info[0]
+ # Increase the number of people in the house and create a new list of info for this house
+ people_per_house = house_info[1] + 1
+ house_ids_dict[house_key] = [house_id_number, people_per_house, area, hid]
+ except KeyError: # If the lookup failed then this is the first time we've seen this house. Make a new ID.
+ house_id_number = house_id_counter
+ house_ids_dict[house_key] = [house_id_number, 1, area,
+ hid] # (1 because there is only 1 person in the house so far)
+ house_id_counter += 1
+ assert house_id_number > -1
+ house_ids_list.append(house_id_number) # Remember the house for this individual
+
+ assert len(unique_individuals) == len(tuh)
+ assert len(house_ids_list) == len(tuh)
+ assert len(house_ids_dict) == house_id_counter
+
+ # While we're here, may as well also check that [Area, HID, PID] is a unique identifier of individuals
+ if len(tuh) != len(set(unique_individuals)):
+ # TODO FIND OUT FROM KARYN WHY THERE ARE ~20,000 NON-UNIQUE PEOPLE
+ warnings.warn(f"There are {len(tuh)-len(set(unique_individuals))} / {len(tuh)} non-unique individuals.")
+
+ # Done! Now can create the households dataframe
+ households_df = pd.DataFrame(house_ids_dict.values(), columns=['House_ID', 'Num_People', 'area', '_hid'])
+ households_df = Optimise.optimize(households_df)
+
+ # And tell the individuals which house they live in
+ tuh["House_ID"] = house_ids_list # Assign each individual to their household
+
+ # Check all house IDs are unique and that the households cover the same areas as the TUH data
+ assert len(frozenset(households_df.House_ID.unique())) == len(households_df)
+ assert len(households_df.area.unique()) == len(tuh.area.unique())
+ # Check that the area that the individual lives in is the same as the area their house is in
+ temp_merge = tuh.merge(households_df, how="left", on=["House_ID"], validate="many_to_one")
+ assert len(temp_merge) == len(tuh)
+ assert (temp_merge['area_x'] == temp_merge['area_y']).all() # (all says 'all are true')
+
+ # Check that Num_People in the house dataframe is the same as the number of people in the individuals dataframe
+ # with this house id
+ if Microsim.debug:
+ for house_id, num_people in tqdm(zip(households_df.House_ID, households_df.Num_People),
+ desc="Checking household sizes match"): # I know you shouldn't loop, but I can't work out the apply way (and this only happens once)
+ num_people2 = len(tuh.loc[tuh.House_ID == house_id]) # Number of individuals who link to this house
+ assert num_people == num_people2, f"House {house_id} doesn't match: {num_people} / {num_people2}"
+
+ # Add some required columns
+ Microsim._add_location_columns(households_df, location_names=list(households_df.House_ID),
+ location_ids=households_df.House_ID)
+ # The new ID column should be the same as the House_ID
+ assert (households_df.House_ID == households_df[ColumnNames.LOCATION_ID]).all()
+
+ # Later we need time spent in primary and secondary school. But currently we just have 'pschool'.
Make + # two new columns separating out primary and secondary based on age + tuh["pschool"] = tuh["pschool"].fillna(0) + tuh["pschool-primary"] = 0.0 + tuh["pschool-secondary"] = 0.0 + children_idx = tuh.index[tuh["age"] < 11] + teen_idx = tuh.index[(tuh["age"] >= 11) & (tuh["age"] < 19)] + + assert len(children_idx) > 0 + assert len(teen_idx) > 0 + + tuh.loc[children_idx, "pschool-primary"] = tuh.loc[children_idx, "pschool"] + tuh.loc[teen_idx, "pschool-secondary"] = tuh.loc[teen_idx, "pschool"] + + # Check that people have been allocated correctly + adults_in_school = tuh.loc[~(tuh["pschool-primary"] + tuh["pschool-secondary"] == tuh["pschool"]), + ["age", "pschool", "pschool-primary", "pschool-secondary"]] + if len(adults_in_school) > 0: + warnings.warn(f"{len(adults_in_school)} people > 18y/o go to school, but they are not being assigned to a " + f"primary or secondary school (so their schooling is ignored at the moment).") + + tuh = tuh.rename(columns={"pschool": "_pschool"}) # Indicate that the pschool column shouldn't be used now + + # For some reason, we get some *very* large households. Can demonstrate this with: + # households_df.Num_People.hist(bins=10000) + # This needs to be resolved, but in the meantime just remove all households that have more than 10 people + large_house_idx = frozenset(households_df.index[households_df.Num_People > 10]) # Indexes of large houses + # For each person, get a house_id, or -1 if the house is very large + large_people_idx = tuh["House_ID"].apply(lambda x: -1 if x in large_house_idx else x) + if len(large_house_idx) > 0: + warnings.warn(f"There are {len(large_house_idx)} households with more than 10 people in them. This covers " + f"{len(large_people_idx[large_people_idx == -1])} people. These households are being removed.") + tuh["TEMP_HOUSE_ID"] = large_people_idx # Use this colum to remove people (all people with HOUSE_ID == -1) + # Check the numbers add up (normal house len + large house len = original len) + assert (len(tuh.loc[tuh.TEMP_HOUSE_ID != -1]) + len(large_people_idx[large_people_idx == -1])) == len(tuh) + assert (len(households_df.loc[~households_df.House_ID.isin(large_house_idx)]) + len(large_house_idx)) == len( + households_df) + # Remove people, but leave the households (no one will live there so they wont affect anything) + tuh = tuh[tuh.TEMP_HOUSE_ID != -1] + # TODO Work out why removing households kills the model later. - it's probably because houses are removed but the indexes and IDs don't change, so indexes will end up larger than size of the households list. Probably would need to recalculate the index and House_ID so that they are ascending again (pain, can't be bothered). + # households_df = households_df.loc[~households_df.House_ID.isin(large_house_idx)] + # households_df = households_df.loc[households_df.Num_People <= 10] + # Check that the large house ids no longer exist in the individuals df (use House_ID rather than index to be sure, but they're the same anyway) + id_set = frozenset(households_df.loc[households_df.Num_People > 10, "House_ID"].values) + assert True not in list(tuh["House_ID"].apply(lambda x: x in id_set)) + del tuh["TEMP_HOUSE_ID"] + + # Add flows for each individual (this is easy, it's just converting their House_ID and flow (1.0) into a + # one-value lists). 
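+ # (Editor's illustrative note, not part of the model: the two assignments below turn each person's
+ # House_ID and a flow of 1.0 into single-element lists, so an individual whose House_ID is, say, 42
+ # ends up with Home venues = [42] and Home flows = [1.0], i.e. one home venue that they always 'visit'.)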
+ venues_col = f"{home_name}{ColumnNames.ACTIVITY_VENUES}" # Names for the new columns + flows_col = f"{home_name}{ColumnNames.ACTIVITY_FLOWS}" + tuh[venues_col] = tuh["House_ID"].apply(lambda x: [x]) + tuh[flows_col] = [[1.0]] * len(tuh) + + # Later we also record the individual risks for each activity per individual. It's nice if the columns for + # each activity are grouped together, so create that column now. + tuh[f"{home_name}{ColumnNames.ACTIVITY_RISK}"] = [-1] * len(tuh) + + print(f"... finished reading TU&H data. There are {len(tuh)} individuals in {len(households_df)} houses " + f"over {len(tuh.area.unique())} MSOAs") + + return tuh, households_df + + @classmethod + def generate_travel_time_colums(cls, individuals: pd.DataFrame) -> pd.DataFrame: + """ + TODO Read the raw travel time columns and create standard ones to show how long individuals + spend travelling on different modes. Ultimately these will be turned into activities + :param individuals: + :return: + """ + + # Some sanity checks for the time use data + # Variables pnothome, phome add up to 100% of the day and + # pwork +pschool +pshop+ pleisure +pescort+ ptransport +pother = phome + + # TODO go through some of these with Karyn, they don't all pass + # Time at home and not home should sum to 1.0 + if False in list((individuals.phome + individuals.pnothome) == 1.0): + raise Exception("Time at home (phome) + time not at home (pnothome) does not always equal 1.0") + # These columns should equal time not at home + # if False in list(tuh.loc[:, ["pwork", "pschool", "pshop", "pleisure", "ptransport", "pother"]]. \ + # sum(axis=1, skipna=True) == tuh.pnothome): + # raise Exception("Times doing activities don't add up correctly") + + # Temporarily (?) remove NAs from activity columns (I couldn't work out how to do this in 1 line like: + for col in ["pwork", "pschool", "pshop", "pleisure", "ptransport", "pother"]: + individuals[col].fillna(0, inplace=True) + + # TODO assign activities properly. Need to map from columns in the dataframe to standard names + # Assign time use for Travel (just do this arbitrarily for now, the correct columns aren't in the data). + # travel_cols = [ x + ColumnNames.ACTIVITY_DURATION for x in + # [ ColumnNames.TRAVEL_CAR, ColumnNames.TRAVEL_BUS, ColumnNames.TRAVEL_TRAIN, ColumnNames.TRAVEL_WALK ] ] + # for col in travel_cols: + # tuh[col] = 0.0 + # OLD WAY OF HARD-CODING TIME USE CATEGORIES FOR EACH INDIVIDUAL + # For now just hard code broad categories. Ultimately will have different values for different activities. 
+ # activities = ["Home", "Retail", "PrimarySchool", "SecondarySchool", "Work", "Leisure"] + # col_names = [] + # for act in activities: + # col_name = act + ColumnNames.ACTIVITY_DURATION + # col_names.append(col_name) + # if act=="Home": + # # Assume XX hours per day at home (this is whatever not spent doing other activities) + # individuals[col_name] = 14/24 + # elif act == "Retail": + # individuals[col_name] = 1.0/24 + # elif act == "PrimarySchool": + # # Assume 8 hours per day for all under 12 + # individuals[col_name] = 0.0 # Default 0 + # individuals.loc[individuals[ColumnNames.INDIVIDUAL_AGE] < 12, col_name] = 8.0/24 + # elif act == "SecondarySchool": + # # Assume 8 hours per day for 12 <= x < 19 + # individuals[col_name] = 0.0 # Default 0 + # individuals.loc[individuals[ColumnNames.INDIVIDUAL_AGE] < 19, col_name] = 8.0 / 24 + # individuals.loc[individuals[ColumnNames.INDIVIDUAL_AGE] < 12, col_name] = 0.0 + # elif act == "Work": + # # Opposite of school + ## individuals[col_name] = 0.0 # Default 0 + # individuals.loc[individuals[ColumnNames.INDIVIDUAL_AGE] >= 19, col_name] = 8.0/24 + # elif act == "Leisure": + # individuals[col_name] = 1.0/24 + # else: + # raise Exception(f"Unrecognised activity: {act}") + + # Check that proportions add up to 1.0 + # For some reason this fails, but as far as I can see the proportions correctly sum to 1 !! + # assert False not in (individuals.loc[:, col_names].sum(axis=1).round(decimals=4) == 1.0) + + ## Add travel data columns (no values yet) + # travel_cols = [ x + ColumnNames.ACTIVITY_DURATION for x in ["Car", "Bus", "Walk", "Train"] ] + # for col in travel_cols: + # individuals[col] = 0.0 + return individuals + + @classmethod + def read_school_flows_data(cls, study_msoas: List[str]) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame): + """ + Read the flows between each MSOA and the most likely schools attended by pupils in this area. + All schools are initially read together, but flows are separated into primary and secondary + + :param study_msoas: A list of MSOAs in the study area (flows outside of this will be ignored) + :return: A tuple of three dataframes. All schools, then the flows to primary and secondary + (Schools, PrimaryFlows, SeconaryFlows). Although all the schools are one dataframe, no primary flows will flow + to secondary schools and vice versa). + """ + + + # devon data + if not USE_QUANT_DATA: + # TODO Need to read full school flows, not just those of Devon + print("Reading school flow data for Devon...", ) + dir = os.path.join(cls.DATA_DIR, "devon-schools") + + # Read the schools (all of them) + schools = pd.read_csv(os.path.join(dir, "exeter schools.csv")) + # Add some standard columns that all locations need + schools_ids = list(schools.index + 1) # Mark counts from 1, not zero, so indices need to start from 1 not 0 + schools_names = schools.EstablishmentName # Standard name for the location + Microsim._add_location_columns(schools, location_names=schools_names, location_ids=schools_ids) + + # Read the flows + primary_rows = [] # Build up all the rows in the matrix gradually then add all at once + secondary_rows = [] + with open(os.path.join(dir, "DJS002.TXT")) as f: + # Mark's file comes in batches of 3 lines, each giving different data. 
However, some lines overrun and are + # read as several lines rather than 1 (hence use of dests_tmp and flows_tmp) + count = 1 + oa = None + oa_name = "" + num_dests = None + dests = None + flows = None + dests_tmp = None + flows_tmp = None + + for lineno, raw_line in enumerate(f): + # print(f"{lineno}: '{raw_line}'") + line_list = raw_line.strip().split() + if count == 1: # primary/secondary school, OA and number of schools + sch_type = int(line_list[0]) + assert sch_type == 1 or sch_type == 2 # Primary schools are 1, secondary 2 + oa = int(line_list[1]) + oa_name = study_msoas[oa - 1] # The OA names are stored in a separate file temporarily + num_dests = int(line_list[2]) + elif count == 2: # school ids + dests_tmp = [int(x) for x in line_list[0:]] # Make the destinations numbers + # check if dests exists from previous iteration and add dests_tmp + if dests == None: + dests = dests_tmp + else: + dests.extend(dests_tmp) + if len(dests) < num_dests: # need to read next line + count = 1 # counteracts count being increased by 1 later + else: + assert len(dests) == num_dests + elif count == 3: # Flows per 1,000 pupils + flows_tmp = [float(x) for x in line_list[0:]] # Make the destinations numbers + # check if dests exists from previous iteration and add dests_tmp + if flows == None: + flows = flows_tmp + else: + flows.extend(flows_tmp) + if len(flows) < num_dests: # need to read next line + count = 2 # counteracts count being increased by 1 later + else: + assert len(flows) == num_dests + + # Have read all information for this area. Store the info in the flows matrix + + # We should have one line in the matrix for each OA, and OA codes are incremental + # assert len(flow_matrix) == oa - 1 + row = [0.0 for _ in range(len(schools))] # Initially assume all flows are 0 + for i in range(num_dests): # Now add the flows + dest = dests[i] + flow = flows[i] + row[dest - 1] = flow # (-1 because destinations are numbered from 1, not 0) + assert len([x for x in row if x > 0]) == num_dests # There should only be N >0 flows + row = [oa, oa_name] + row # Insert the OA number and code (don't know this yet now) + + # Luckily Mark's file does all primary schools first, then all secondary schools, so we + # know that all schools in this area are one or the other + if sch_type == 1: + primary_rows.append(row) + else: + secondary_rows.append(row) + # rows.append(row) + + # Add the row to the matrix. As the OA numbers are incremental they should equal the number + # of rows + # flow_matrix.loc[oa-1] = row + # assert len(flow_matrix) == oa + count = 0 + # reset dests and flows + dests = None + flows = None + + count += 1 + + # Have finished reading the file, now create the matrices. MSOAs as rows, school locations as columns + columns = ["Area_ID", "Area_Code"] # A number (ID) and full code for each MSOA + columns += [f"Loc_{i}" for i in schools.index] # Columns for each school + + primary_flow_matrix = pd.DataFrame(data=primary_rows, columns=columns) + secondary_flow_matrix = pd.DataFrame(data=secondary_rows, columns=columns) + # schools_flows = schools_flows.iloc[0:len(self.study_msoas), :] + print(f"... 
finished reading school flows.") + + # same df for primary and secondary schools + primary_schools = schools.copy() + secondary_schools = schools.copy() + + # QUANT data + else: + print("Reading school flow data...", ) + dir = os.path.join(cls.DATA_DIR, "QUANT_RAMP","model-runs") + + # Read the primary schools + primary_schools = pd.read_csv(os.path.join(dir, "primaryZones.csv")) + # Add some standard columns that all locations need + primary_school_ids = list(primary_schools.index) + primary_school_names = primary_schools.URN # unique ID for venue + Microsim._add_location_columns(primary_schools, location_names=primary_school_names, location_ids=primary_school_ids) + + # Read the secondary schools + secondary_schools = pd.read_csv(os.path.join(dir, "secondaryZones.csv")) + # Add some standard columns that all locations need + secondary_school_ids = list(secondary_schools.index) + secondary_school_names = secondary_schools.URN # unique ID for venue + Microsim._add_location_columns(secondary_schools, location_names=secondary_school_names, location_ids=secondary_school_ids) + + # Read the primary school flows + threshold = 5 # top 5 + thresholdtype = "nr" # threshold based on nr venues + primary_flow_matrix = qa.get_flows(ColumnNames.Activities.PRIMARY, study_msoas,threshold,thresholdtype) + + # Read the secondary school flows + # same thresholds as before + secondary_flow_matrix = qa.get_flows(ColumnNames.Activities.SECONDARY, study_msoas,threshold,thresholdtype) + + + + return primary_schools, secondary_schools, primary_flow_matrix, secondary_flow_matrix + + @classmethod + def add_work_flows(cls, flow_type: str, individuals: pd.DataFrame, workplaces: pd.DataFrame) \ + -> (pd.DataFrame): + """ + Create a dataframe of (virtual) work locations that individuals + travel to. Unlike retail etc, each individual will have only one work location with 100% of flows there. + :param flow_type: The name for these flows (probably something like 'Work') + :param individuals: The dataframe of synthetic individuals + :param workplaces: The dataframe of workplaces (i.e. occupations) + :return: The new 'individuals' dataframe (with new columns) + """ + # Later on, add a check to see if occupations in individuals df are same as those in workplaces df?? + # Tell the individuals about which virtual workplace they go to + venues_col = f"{flow_type}{ColumnNames.ACTIVITY_VENUES}" + flows_col = f"{flow_type}{ColumnNames.ACTIVITY_FLOWS}" + + # Lists showing where individuals go, and what proption (here only 1 flow as only 1 workplace) + # Need to do the flows in venues in 2 stages: first just add the venue, then turn that venu into a single-element + # list (pandas complains about 'TypeError: unhashable type: 'list'' if you try to make the single-item lists + # directly in the apply + with multiprocessing.Pool(processes=int(os.cpu_count()/2)) as pool: + print("Assigning work venues ... ",) + venues = pool.starmap(Microsim._assign_work_flow, + zip(list(individuals["soc2010"]), (workplaces for _ in range(len(individuals))) ) ) + #venues = list(individuals["soc2010"].swifter.progress_bar(True, desc="Assigning work venues").apply( + # lambda job: workplaces.index[workplaces[ColumnNames.LOCATION_NAME] == job].values[0])) + venues = [[x] for x in venues] + individuals[venues_col] = venues + individuals[flows_col] = [[1.0] for _ in range(len(individuals))] # Flows are easy, [1.0] to the single venue + # Later we also record the individual risks for each activity per individual. 
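The work-venue assignment above parallelises a per-row index lookup (the _assign_work_flow helper just below) with multiprocessing.Pool.starmap. If every occupation maps to exactly one row of the workplaces dataframe, a single-process alternative is to build the lookup once and use Series.map; a rough sketch (not part of this diff), with "Location_Name" standing in for ColumnNames.LOCATION_NAME:

import pandas as pd

def assign_work_venues_vectorised(individuals: pd.DataFrame, workplaces: pd.DataFrame,
                                  name_col: str = "Location_Name") -> pd.Series:
    # Build an occupation -> workplace-index lookup once, then map every individual's SOC code.
    # Assumes one workplace row per occupation (duplicates would make the lookup ambiguous).
    soc_to_index = pd.Series(workplaces.index.values, index=workplaces[name_col])
    return individuals["soc2010"].map(soc_to_index)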
It's nice if the columns for + # each activity are grouped together, so create that column now. + individuals[f"{flow_type}{ColumnNames.ACTIVITY_RISK}"] = [-1] * len(individuals) + return individuals + + @staticmethod + def _assign_work_flow(job, workplaces): + return workplaces.index[workplaces[ColumnNames.LOCATION_NAME] == job].values[0] + + @classmethod + def read_retail_flows_data(cls, study_msoas: List[str]) -> (pd.DataFrame, pd.DataFrame): + """ + Read the flows between each MSOA and the most commonly visited shops + + :param study_msoas: A list of MSOAs in the study area (flows outside of this will be ignored) + :return: A tuple of two dataframes. One containing all of the flows and another + containing information about the stores themselves. + """ + + # Devon data + if not USE_QUANT_DATA: + # TODO Need to read full retail flows, not just those of Devon (temporarily created by Mark). + # Will also need to subset the flows into areas of interst, but at the moment assume that we area already + # working with Devon subset of flows + print("Reading retail flow data for Devon...", ) + dir = os.path.join(cls.DATA_DIR, "devon-retail") + + # Read the stores + stores = pd.read_csv(os.path.join(dir, "devon smkt.csv")) + # Add some standard columns that all locations need + stores_ids = list(stores.index + 1) # Mark counts from 1, not zero, so indices need to start from 1 not 0 + store_names = stores.store_name # Standard name for the location + Microsim._add_location_columns(stores, location_names=store_names, location_ids=stores_ids) + + # Read the flows + rows = [] # Build up all the rows in the matrix gradually then add all at once + total_flows = 0 # For info & checking + with open(os.path.join(dir, "DJR002.TXT")) as f: + count = 1 # Mark's file comes in batches of 3 lines, each giving different data + + # See the README for info about these variables. This is only tempoarary so I can't be bothered + # to explain properly + oa = None + oa_name = "" + num_dests = None + dests = None + flows = None + + for lineno, raw_line in enumerate(f): + # print(f"{lineno}: '{raw_line}'") + line_list = raw_line.strip().split() + if count == 1: # OA and number of destinations + oa = int(line_list[0]) + if oa > len(study_msoas): + msg = f"Attempting to read more output areas ({oa}) than are present in the study area {study_msoas}." + if cls.testing: + warnings.warn(msg) + else: + raise Exception(msg) + oa_name = study_msoas[oa - 1] # The OA names are stored in a separate file temporarily + num_dests = int(line_list[1]) + elif count == 2: # Top N (=10) destinations in the OA + # First number is the OA (don't need this), rest are the destinations + assert int(line_list[0]) == oa + dests = [int(x) for x in line_list[1:]] # Make the destinations numbers + assert len(dests) == num_dests + elif count == 3: # Distance to each store (not currently used) + pass + elif count == 4: # Flows per 1,000 trips + # First number is the OA (don't need this), rest are the destinations + assert int(line_list[0]) == oa + flows = [float(x) for x in line_list[1:]] # Make the destinations numbers + assert len(flows) == num_dests + total_flows += sum(flows) + + # Have read all information for this area. 
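For concreteness, here is an invented DJR002.TXT-style record for one origin area, showing how the four lines read above turn into a flow row (exactly what the code does next; the values are made up):

record = [
    "1 3",             # count == 1: OA number and number of destinations
    "1 4 9 12",        # count == 2: OA number followed by the destination store numbers
    "1 2.1 5.3 0.8",   # count == 3: distances to each store (read but not used)
    "1 600 300 100",   # count == 4: flows per 1,000 trips
]
num_stores = 15
dests = [int(x) for x in record[1].split()[1:]]
flows = [float(x) for x in record[3].split()[1:]]
row = [0.0] * num_stores
for dest, flow in zip(dests, flows):
    row[dest - 1] = flow          # destinations are numbered from 1, so store 4 fills column index 3
# row now holds 600.0, 300.0 and 100.0 in columns 3, 8 and 11, and zeros everywhere else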
Store the info in the flows matrix + + # We should have one line in the matrix for each OA, and OA codes are incremental + # assert len(flow_matrix) == oa - 1 + row = [0.0 for _ in range(len(stores))] # Initially assume all flows are 0 + for i in range(num_dests): # Now add the flows + dest = dests[i] + flow = flows[i] + row[dest - 1] = flow # (-1 because destinations are numbered from 1, not 0) + assert len([x for x in row if x > 0]) == num_dests # There should only be positive flows (no 0s) + row = [oa, oa_name] + row # Insert the OA number and code (don't know this yet now) + + rows.append(row) + + # Add the row to the matrix. As the OA numbers are incremental they should equal the number + # of rows + # flow_matrix.loc[oa-1] = row + # assert len(flow_matrix) == oa + count = 0 + + count += 1 + + # Have finished reading the file, now create the matrix. MSOAs as rows, retail locations as columns + columns = ["Area_ID", "Area_Code"] # A number (ID) and full code for each MSOA + columns += [f"Loc_{i}" for i in stores.index] # Columns for each store + flow_matrix = pd.DataFrame(data=rows, columns=columns) + + # Check that we haven't lost any flows (need two sums, once to get the flows for each row, then + # to add up all rows + total_flows2 = flow_matrix.iloc[:, 2:].apply(lambda row: sum(row)).sum() + assert total_flows == total_flows2 + + print(f"... read {total_flows} flows from {len(flow_matrix)} areas.") + + # QUANT data + else: + print("Reading retail flow data...", ) + dir = os.path.join(cls.DATA_DIR, "QUANT_RAMP","model-runs") + # Read the stores + stores = pd.read_csv(os.path.join(dir, "retailpointsZones.csv")) + # Add some standard columns that all locations need + stores_ids = list(stores.index) + store_names = stores.id # unique ID for venue + Microsim._add_location_columns(stores, location_names=store_names, location_ids=stores_ids) + + # Read the flows + threshold = 10 # top 10 + thresholdtype = "nr" # threshold based on nr venues + flow_matrix = qa.get_flows("Retail", study_msoas,threshold,thresholdtype) + + return stores, flow_matrix + + @classmethod + def check_sim_flows(cls, locations: pd.DataFrame, flows: pd.DataFrame): + """ + Check that the flow matrix looks OK, raising an error if not + :param locations: A DataFrame with information about each location (destination) + :param flows: The flow matrix itself, showing flows from origin MSOAs to destinations + :return: + """ + # TODO All MSOA codes are unique + # TODO Locations have 'Danger' and 'ID' columns + # TODO Number of destination columns ('Loc_*') matches number of locaitons + # TODO Number of origins (rows) in the flow matrix matches number of OAs in the locations + return + + @classmethod + def _add_location_columns(cls, locations: pd.DataFrame, location_names: List[str], location_ids: List[int] = None): + """ + Add some standard columns to DataFrame (in place) that contains information about locations. + :param locations: The dataframe of locations that the columns will be added to + :param location_names: Names of the locations (e.g shop names) + :param location_ids: Can optionally include a list of IDs. An 'ID' column is always created, but if no specific + IDs are provided then the ID will be the same as the index (i.e. the row number). If ids are provided then + the ID column will be set to the given IDs, but the index will still be the row number. + :return: None; the columns are added to the input dataframe inplace. 
+ """ + # Make sure the index will always be the row number + locations.reset_index(inplace=True, drop=True) + if location_ids is None: + # No specific index provided, just use the index + locations[ColumnNames.LOCATION_ID] = locations.index + else: + # User has provided a specific list of indices to use + if len(location_ids) != len(locations): + raise Exception(f"When adding the standard columns to a locations dataframe, a list of specific", + f"IDs has ben passed, but this list (length {len(location_ids)}) is not the same" + f"length as the locations dataframe (length {len(locations)}. The list of ids passed" + f"is: {location_ids}.") + locations[ColumnNames.LOCATION_ID] = location_ids + if len(location_names) != len(locations): + raise Exception(f"The list of location names is not the same as the number of locations in the dataframe", + f"({len(location_names)} != {len(locations)}.") + locations[ColumnNames.LOCATION_NAME] = location_names # Standard name for the location + locations[ColumnNames.LOCATION_DANGER] = 0 # All locations have a disease danger of 0 initially + # locations.set_index(ColumnNames.LOCATION_ID, inplace=True, drop=False) + return None # Columns added in place so nothing to return + + @classmethod + def add_individual_flows(cls, flow_type: str, individuals: pd.DataFrame, flow_matrix: pd.DataFrame) -> pd.DataFrame: + """ + Take a flow matrix from MSOAs to (e.g. retail) locations and assign flows to individuals. + + It a assigns the id of the destination of the flow according to its column in the matrix. So the first column + that has flows for a destination is given index 0, the second is index 1, etc. This is probably not the same as + the ID of the venue that they point to (e.g. the first store probably has ID 1, but will be given the index 0) + so it is important that when the activity_locations are created, they are created in the same order as the + columns that appear in the matix. The first column in the matrix must also be the first row in the locations + data. + :param flow_type: What type of flows are these. This will be appended to the column names. E.g. "Retail". + :param individuals: The DataFrame contining information about all individuals + :param flow_matrix: The flow matrix, created by (e.g.) read_retail_flows_data() + :return: The DataFrame of individuals with new locations and probabilities added + """ + + # Check that there aren't any individuals who wont be given any flows + if len(individuals.loc[-individuals.area.isin(flow_matrix.Area_Code)]) > 0: + raise Exception(f"Some individuals will not be assigned any flows to: '{flow_type}' because their" + f"MSOA is not in the flow matrix: " + f"{individuals.loc[-individuals.area.isin(flow_matrix.Area_Code)]}.") + + # Check that there aren't any duplicate flows + if len(flow_matrix) != len(flow_matrix.Area_Code.unique()): + raise Exception("There are duplicate area codes in the flow matrix: ", flow_matrix.Area_Code) + + # Names for the new columns + venues_col = f"{flow_type}{ColumnNames.ACTIVITY_VENUES}" + flows_col = f"{flow_type}{ColumnNames.ACTIVITY_FLOWS}" + + # Create empty lists to hold the vanues and flows for each individuals + individuals[venues_col] = [[] for _ in range(len(individuals))] + individuals[flows_col] = [[] for _ in range(len(individuals))] + + # Later we also record the individual risks for each activity per individual. It's nice if the columns for + # each activity are grouped together, so create that column now. 
+ individuals[f"{flow_type}{ColumnNames.ACTIVITY_RISK}"] = [-1] * len(individuals) + + # Use a hierarchical index on the Area to speed up finding all individuals in an area + # (not sure this makes much difference). + individuals.set_index(["area", "ID"], inplace=True, drop=False) + + for area in tqdm(flow_matrix.values, + desc=f"Assigning individual flows for {flow_type}"): # Easier to operate over a 2D matrix rather than a dataframe + oa_num: int = area[0] + oa_code: str = area[1] + # Get rid of the area codes, so are now just left with flows to locations + area = list(area[2:]) + # Destinations with positive flows and the flows themselves + dests = [] + flows = [] + for i, flow in enumerate(area): + if flow > 0.0: + dests.append(i) + flows.append(flow) + + # Normalise the flows + flows = Microsim._normalise(flows) + + # Now assign individuals in those areas to those flows + # This ridiculous 'apply' line is the only way I could get pandas to update the particular + # rows required. Something like 'individuals.loc[ ...] = dests' (see below) didn't work becuase + # instead of inserting the 'dests' list itself, pandas tried to unpack the list and insert + # the individual values instead. + # individuals.loc[individuals.area == oa_code, f"{flow_type}_Venues"] = dests + # individuals.loc[individuals.area == oa_code, f"{flow_type}_Probabilities"] = flow + # + # A quicker way to do this is probably to create N subsets of individuals (one table for + # each area) and then concatenate them at the end. + individuals.loc[oa_code, venues_col] = \ + individuals.loc[oa_code, venues_col].apply(lambda _: dests).values + individuals.loc[oa_code, flows_col] = \ + individuals.loc[oa_code, flows_col].apply(lambda _: flows).values + # individuals.loc[individuals.area=="E02004189", f"{flow_type}_Venues"] = \ + # individuals.loc[individuals.area=="E02004189", f"{flow_type}_Venues"].apply(lambda _: dests) + # individuals.loc[individuals.area=="E02004189", f"{flow_type}_Probabilities"] = \ + # individuals.loc[individuals.area=="E02004189", f"{flow_type}_Probabilities"].apply(lambda _: flows) + + # Reset the index so that it's not the PID + individuals.reset_index(inplace=True, drop=True) + + # Check everyone has some flows (all list lengths are >0) + assert False not in (individuals.loc[:, venues_col].apply(lambda cell: len(cell)) > 0).values + assert False not in (individuals.loc[:, flows_col].apply(lambda cell: len(cell)) > 0).values + + return individuals + + @classmethod + def pad_durations(cls, individuals, activity_locations) -> pd.DataFrame: + """ + Some indvidiuals' activity durations don't add up to 1. In these cases pad them out with extra time at home. 
+ :param individuals: + :param activity_locations: + :return: The new individuals dataframe + """ + total_duration = [0.0] * len(individuals) # Add up all the different activity durations + for activity in activity_locations.keys(): + total_duration = total_duration + individuals.loc[:, f"{activity}{ColumnNames.ACTIVITY_DURATION}"] + total_duration = total_duration.apply(lambda x: round(x, 5)) + assert (total_duration <= 1.0).all() # None should be more than 1.0 (after rounding) + + missing_duration = 1.0 - total_duration # Amount of activity time that needs to be added on to home + #missing_duration = missing_duration.apply(lambda x: round(x,5)) + individuals[f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = \ + (individuals[f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] + missing_duration).apply(lambda x: round(x, 5)) + + Microsim.check_durations_sum_to_1(individuals, activity_locations.keys()) + + return individuals + + @classmethod + def check_durations_sum_to_1(cls, individuals, activities): + total_duration = [0.0] * len(individuals) # Add up all the different activity durations + for activity in activities: + total_duration = total_duration + individuals.loc[:, f"{activity}{ColumnNames.ACTIVITY_DURATION}"] + if not (total_duration.apply(lambda x: round(x, 5)) == 1.0).all(): + print("Some activity durations don't sum to 1", flush=True) + print(total_duration[total_duration!=1.0], flush=True) + raise Exception("Some activity durations don't sum to 1") + + + @classmethod + def read_time_activity_multiplier(cls, lockdown_file) -> pd.DataFrame: + """ + Some times people should spend more time at home than normal. E.g. after lockdown. This function + reads a file that tells us how much more time should be spent at home on each day. + :param lockdown_file: Where to read the mobility data from (assume it's within the DATA_DIR). + :return: A dataframe with 'day' and 'timeout_multiplier' columns + """ + assert lockdown_file != "", \ + "read_time_activity_multiplier should not have been called if lockdown_file is empty" + path = os.path.join(cls.DATA_DIR, lockdown_file) + print(f"Reading time activity multiplier data from {path}...", ) + time_activity = pd.read_csv(path) + # Cap at 1.0 (it's a curve so some times peaks above 1.0)= + time_activity["timeout_multiplier"] = time_activity.loc[:, "timeout_multiplier"].\ + apply(lambda x: 1.0 if x > 1.0 else x) + + return time_activity + + @classmethod + def _normalise(cls, l: List[float], decimals=3) -> List[float]: + """ + Normalise a list so that it sums to almost 1.0. Rounding might cause it not to add exactly to 1 + + :param decimals: Optionally round to the number of decimal places. Default 3. If 'None' the do no rounding. + """ + if not isinstance(l, Iterable): + raise Exception("Can only work with iterables") + if len(l) == 1: # Special case for 1-item iterables + return [1.0] + + l = np.array(l) # Easier to work with numpy vectorised operators + total = l.sum() + l = l / total + if decimals is None: + return list(l) + return [round(x, decimals) for x in l] + + def _init_output(self,repnr): + """ + Might need to write out some data if saving output for analysis later. If so, creates a new directory for the + results as a subdirectory of the data directory. + Also store some information for use in the visualisations and analysis. + """ + if self.output: + print("Saving initial models for analysis ... 
", ) + # Find a directory to use, within the 'output' directory +# if repnr == 0: +# self.SCEN_DIR = Microsim._find_new_directory(os.path.join(self.DATA_DIR, "output"), self.SCEN_DIR) + self.output_dir = os.path.join(self.SCEN_DIR, str(repnr)) + os.mkdir(self.output_dir) + + # save initial model + pickle_out = open(os.path.join(self.output_dir, "m0.pickle"), "wb") + pickle.dump(self, pickle_out) + pickle_out.close() + + # collect disease status in new df (for analysis/visualisation) + self.individuals_to_pickle = self.individuals.copy() + self.individuals_to_pickle[ColumnNames.DISEASE_STATUS + "000"] = self.individuals_to_pickle[ + ColumnNames.DISEASE_STATUS] + + # collect location dangers at time 0 in new df(for analysis/visualisation) + self.activities_to_pickle = {} + for name in self.activity_locations: + # Get the details of the location activity + activity = self.activity_locations[name] # Pointer to the ActivityLocation object + loc_name = activity.get_name() # retail, school etc + loc_ids = activity.get_ids() # List of the IDs of the locations + loc_dangers = activity.get_dangers() # List of the current dangers + self.activities_to_pickle[loc_name] = activity.get_dataframe_copy() + self.activities_to_pickle[loc_name]['Danger0'] = loc_dangers + assert False not in list(loc_ids == self.activities_to_pickle[loc_name]["ID"].values) + + @classmethod + def add_disease_columns(cls, individuals: pd.DataFrame) -> pd.DataFrame: + """Adds columns required to estimate disease prevalence""" + individuals[ColumnNames.DISEASE_STATUS] = 0 + individuals[ColumnNames.DISEASE_STATUS_CHANGED] = False + # individuals[ColumnNames.DAYS_WITH_STATUS] = 0 # Also keep the number of days that have elapsed with this status + individuals[ColumnNames.CURRENT_RISK] = 0 # This is the risk that people get when visiting locations. + + # No longer update disease counts per MSOA etc. Not needed + #individuals[ColumnNames.MSOA_CASES] = 0 # Useful to count cases per MSOA + #individuals[ColumnNames.HID_CASES] = 0 # Ditto for the household + + individuals[ColumnNames.DISEASE_PRESYMP] = -1 + individuals[ColumnNames.DISEASE_SYMP_DAYS] = -1 + individuals[ColumnNames.DISEASE_EXPOSED_DAYS] = -1 + return individuals + + def update_behaviour_during_lockdown(self): + """ + Unilaterally alter the proportions of time spent on different activities before and after 'lockddown' + Otherwise this doesn't do anything update_behaviour_during_lockdown. + + Note: ignores people who are currently showing symptoms (`ColumnNames.DiseaseStatus.SYMPTOMATIC`) + """ + # Are we doing any lockdown at all? in this iteration? + if self.do_lockdown: + # Only change the behaviour of people who aren't showing symptoms. If you are showing symptoms then you + # will be mostly at home anyway, so don't want your behaviour overridden by lockdown. 
+ uninfected = self.individuals.index[ + self.individuals[ColumnNames.DISEASE_STATUS] != ColumnNames.DiseaseStatuses.SYMPTOMATIC] + if len(uninfected) < len(self.individuals): + print(f"\t{len(self.individuals) - len(uninfected)} people are symptomatic so not affected by lockdown") + + # Reduce all activities, replacing the lost time with time spent at home + non_home_activities = set(self.activity_locations.keys()) + non_home_activities.remove(ColumnNames.Activities.HOME) + # Need to remember the total duration of time lost for non-home activities + total_duration = pd.Series(data=[0.0] * len(self.individuals.loc[uninfected]), name="TotalDuration") + + # Reduce the initial activity proportion of time by a particular amount per day + timeout_multiplier = self.time_activity_multiplier.loc[ + self.time_activity_multiplier.day == self.iteration, "timeout_multiplier"].values[0] + print(f"\tApplying regular (google mobility) lockdown multiplier {timeout_multiplier}") + for activity in non_home_activities: + # Need to be careful with new_duration because we don't want to keep the index used in + # self.individuals as this will be missing out people who aren't infected so will have gaps + new_duration = pd.Series(list(self.individuals.loc[uninfected, activity + ColumnNames.ACTIVITY_DURATION_INITIAL] * timeout_multiplier), name="NewDuration") + total_duration += new_duration + self.individuals.loc[uninfected, activity + ColumnNames.ACTIVITY_DURATION] = list(new_duration) + + + assert (total_duration <= 1.0).all() and (new_duration <= 1.0).all() + # Now set home duration to fill in the time lost from doing other activities. + self.individuals.loc[uninfected, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = list(1 - total_duration) + + # Check they still sum correctly (if not then they probably need rounding) + # (If you want to print the durations) + # self.individuals.loc[:, [ x+ColumnNames.ACTIVITY_DURATION for x in self.activity_locations.keys() ]+ + # [ x+ColumnNames.ACTIVITY_DURATION_INITIAL for x in self.activity_locations.keys() ] ] + Microsim.check_durations_sum_to_1(self.individuals, self.activity_locations.keys()) + else: + print("\tNot applying a lockdown multiplier") + + def update_venue_danger_and_risks(self, decimals=8): + """ + Update the danger score for each location, based on where the individuals who have the infection visit. + Then look through the individuals again, assigning some of that danger back to them as 'current risk'. + + :param risk_multiplier: Risk is calcuated as duration * flow * risk_multiplier. + :param decimals: Number of decimals to round the indivdiual risks and dangers to (defult 10). If 'None' + then do no rounding + """ + print("\tUpdating danger associated with visiting each venue") + + # Make a new list to keep the new risk for each individual (better than repeatedly accessing the dataframe) + # Make this 0 initialy as the risk is not cumulative; it gets reset each day + current_risk = [0] * len(self.individuals) + + # for name in tqdm(self.activity_locations, desc=f"Updating dangers and risks for activity locations"): + for activty_name in self.activity_locations: + + # + # ***** 1 - update dangers of each venue (infected people visitting places) + # + + print(f"\t\t{activty_name} activity") + # Get the details of the location activity + activity_location = self.activity_locations[activty_name] # Pointer to the ActivityLocation object + # Create a list to store the dangers associated with each location for this activity. 
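The loop that follows reads three parallel pieces of per-individual data for each activity: a list of venue indices, a matching list of flows and a scalar duration. A made-up illustration of what those columns hold for two individuals (the literal column names are placeholders for the real ColumnNames suffixes):

import pandas as pd

individuals = pd.DataFrame({
    "Retail_Venues":   [[0, 3], [2]],        # venue indices visited
    "Retail_Flows":    [[0.8, 0.2], [1.0]],  # proportion of Retail trips going to each venue
    "Retail_Duration": [0.05, 0.10],         # proportion of the day spent on Retail
    "disease_status":  [3, 0],               # e.g. symptomatic and susceptible
})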
+ # Assume 0 initially, it should be reset each day + loc_dangers = [0] * len(activity_location.get_dangers()) + # loc_dangers = activity_location.get_dangers() # List of the current dangers associated with each place + + # Now look up those venues in the table of individuals + venues_col = f"{activty_name}{ColumnNames.ACTIVITY_VENUES}" # The names of the venues and + flows_col = f"{activty_name}{ColumnNames.ACTIVITY_FLOWS}" # flows in the individuals DataFrame + durations_col = f"{activty_name}{ColumnNames.ACTIVITY_DURATION}" # flows in the individuals DataFrame + + # 2D lists, for each individual: the venues they visit, the flows to the venue (i.e. how much they visit it) + # and the durations (how long they spend doing it) + statuses = self.individuals[ColumnNames.DISEASE_STATUS] + venues = self.individuals.loc[:, venues_col] + flows = self.individuals.loc[:, flows_col] + durations = self.individuals.loc[:, durations_col] + assert len(venues) == len(flows) and len(venues) == len(statuses) + for i, (v, f, s, duration) in enumerate(zip(venues, flows, statuses, durations)): # For each individual + # Only people with the disease who are infectious will add danger to a place + if s == ColumnNames.DiseaseStatuses.PRESYMPTOMATIC \ + or s == ColumnNames.DiseaseStatuses.SYMPTOMATIC \ + or s == ColumnNames.DiseaseStatuses.ASYMPTOMATIC: + # The hazard multiplier depends on the type of disease status that this person has + # There may be different multipliers passed in a dictionary as calibration parameters, if not + # then assume the multiplier is 1.0 + individual_hazard_multiplier: float = None + if not self.hazard_individual_multipliers: # The dictionary is empty + individual_hazard_multiplier = 1.0 + else: # A dict was passed, so find out what the values of the multiplier are by disease status + if s == ColumnNames.DiseaseStatuses.PRESYMPTOMATIC: + individual_hazard_multiplier = self.hazard_individual_multipliers['presymptomatic'] + elif s == ColumnNames.DiseaseStatuses.SYMPTOMATIC: + individual_hazard_multiplier = self.hazard_individual_multipliers['symptomatic'] + elif s == ColumnNames.DiseaseStatuses.ASYMPTOMATIC: + individual_hazard_multiplier = self.hazard_individual_multipliers['asymptomatic'] + assert individual_hazard_multiplier is not None + # There may also be a hazard multiplier for locations (i.e. some locations become more hazardous + # than others + location_hazard_multiplier = None + if not self.hazard_location_multipliers: # The dictionary is empty + location_hazard_multiplier = 1.0 + else: + location_hazard_multiplier = self.hazard_location_multipliers[activty_name] + assert location_hazard_multiplier is not None + + # v and f are lists of flows and venues for the individual. 
Go through each one + for venue_idx, flow in zip(v, f): + # print(i, venue_idx, flow, duration) + # Increase the danger by the flow multiplied by some disease risk + danger_increase = (flow * duration * individual_hazard_multiplier * location_hazard_multiplier) + if activty_name == ColumnNames.Activities.WORK: + warnings.warn("Temporarily reduce danger for work while we have virtual work locations") + work_danger = float(danger_increase / 20) + loc_dangers[venue_idx] += work_danger + else: + loc_dangers[venue_idx] += danger_increase + + # + # ***** 2 - risks for individuals who visit dangerous venues + # + + # It's useful to report the specific risks associated with *this* activity for each individual + activity_specific_risk = [0] * len(self.individuals) + + for i, (v, f, s, duration) in enumerate(zip(venues, flows, statuses, durations)): # For each individual + # v and f are lists of flows and venues for the individual. Go through each one + for venue_idx, flow in zip(v, f): + # Danger associated with the location (we just created these updated dangers in the previous loop) + danger = loc_dangers[venue_idx] + risk_increase = flow * danger * duration * self.risk_multiplier + current_risk[i] += risk_increase + activity_specific_risk[i] += risk_increase + + # Remember the (rounded) risk for this activity + if decimals is not None: + activity_specific_risk = [round(x, decimals) for x in activity_specific_risk] + self.individuals[f"{activty_name}{ColumnNames.ACTIVITY_RISK}"] = activity_specific_risk + + # Now we have the dangers associated with each location, apply these back to the main dataframe + if decimals is not None: # Round the dangers? + loc_dangers = [round(x, decimals) for x in loc_dangers] + activity_location.update_dangers(loc_dangers) + + # Round the current risk + if decimals is not None: + current_risk = [round(x, decimals) for x in current_risk] + + # Sanity check + assert len(current_risk) == len(self.individuals) + assert min(current_risk) >= 0 # Should not be risk less than 0 + # Santity check - do the risks of each activity add up to the total? + # (I can't get this to work, there are some really minor differences, but on the whole it looks fine) + # (I can't get this to work, there are some really minor differences, but on the whole it looks fine) + # if Microsim.debug: # replace with .debug + # total_risk = [0.0] * len(self.individuals) + # for activty_name in self.activity_locations: + # total_risk = [i + j for (i, j) in zip(total_risk, list(self.individuals[f"{activty_name}{ColumnNames.ACTIVITY_RISK}"]))] + # # Round both + # total_risk = [round(x, 5) for x in total_risk] + # current_risk_temp = [round(x, 5) for x in current_risk] + # assert current_risk_temp == total_risk + + self.individuals[ColumnNames.CURRENT_RISK] = current_risk + + return + + # No longer update disease counts per MSOA etc. 
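A toy illustration (invented numbers) of the two-pass update above: infectious visitors first deposit danger at the venues they visit (flow × duration × individual multiplier × location multiplier), and every visitor then accrues risk from the venues' accumulated danger (flow × danger × duration × risk multiplier):

risk_multiplier = 1.0
venue_danger = {"shop_A": 0.0, "shop_B": 0.0}

# Pass 1: one symptomatic person spends 0.1 of the day shopping, split 70/30 between two shops.
flows = {"shop_A": 0.7, "shop_B": 0.3}
duration, individual_mult, location_mult = 0.1, 1.0, 1.0
for venue, flow in flows.items():
    venue_danger[venue] += flow * duration * individual_mult * location_mult   # ~0.07 and ~0.03

# Pass 2: a person with the same shopping pattern then picks up risk from those dangers.
current_risk = sum(flow * venue_danger[venue] * duration * risk_multiplier
                   for venue, flow in flows.items())                            # ~0.0058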
Not needed + #def update_disease_counts(self): + # """Update some disease counters -- counts of diseases in MSOAs & households -- which are useful + # in estimating the probability of contracting the disease""" + # # Update the diseases per MSOA and household + # # TODO replace Nan's with 0 (not a problem with MSOAs because they're a cateogry so the value_counts() + # # returns all, including those with 0 counts, but with HID those with 0 count don't get returned + # # Get rows with cases + # cases = self.individuals.loc[(self.individuals[ColumnNames.DISEASE_STATUS] == 1) | + # (self.individuals[ColumnNames.DISEASE_STATUS] == 2), :] + # # Count cases per area (convert to a dataframe) + # case_counts = cases["area"].value_counts() + # case_counts = pd.DataFrame(data={"area": case_counts.index, "Count": case_counts}).reset_index(drop=True) + # # Link this back to the orignal data + # self.individuals[ColumnNames.MSOA_CASES] = self.individuals.merge(case_counts, on="area", how="left")["Count"] + # self.individuals[ColumnNames.MSOA_CASES].fillna(0, inplace=True) + # + # # Update HID cases + # case_counts = cases["House_ID"].value_counts() + # case_counts = pd.DataFrame(data={"House_ID": case_counts.index, "Count": case_counts}).reset_index(drop=True) + # self.individuals[ColumnNames.HID_CASES] = self.individuals.merge(case_counts, on="House_ID", how="left")[ + # "Count"] + # self.individuals[ColumnNames.HID_CASES].fillna(0, inplace=True) + + def calculate_new_disease_status(self) -> None: + """ + Call an R function to calculate the new disease status for all individuals. + Update the indivdiuals dataframe in place + :return: . Update the dataframe inplace + """ + # Remember the old status so that we can calculate whether it has changed + old_status: pd.Series = self.individuals[ColumnNames.DISEASE_STATUS].copy() + + # Calculate the new status (will return a new dataframe) + self.individuals = self.r_int.calculate_disease_status(self.individuals, self.iteration, self.disease_params) + + # Remember whose status has changed + new_status: pd.Series = self.individuals[ColumnNames.DISEASE_STATUS].copy() + self.individuals[ColumnNames.DISEASE_STATUS_CHANGED] = list(new_status != old_status) + + # For info, find out how the statuses have changed. + # Make a dict with all possible changes, then loop through and count them. + change = dict() + for old in ColumnNames.DiseaseStatuses.ALL: + for new in ColumnNames.DiseaseStatuses.ALL: + change[(old, new)] = 0 + for (old, new) in zip(old_status, new_status): + if new != old: + change[(old, new)] += 1 + + assert sum(change.values()) == len(new_status[new_status != old_status]) + + print(f"\t{len(new_status[new_status != old_status])} individuals have a different status. Status changes:") + for old in ColumnNames.DiseaseStatuses.ALL: + print(f"\t\t{old} -> ", end="") + for new in ColumnNames.DiseaseStatuses.ALL: + print(f" {new}:{change[(old,new)]} \t", end="") + print() + + def change_behaviour_with_disease(self) -> None: + """ + When people have the disease the proportions that they spend doing activities changes. This function applies + those changes inline to the individuals dataframe + :return: None. Update the dataframe inplace + """ + #print("Changing behaviour of infected individuals ... 
",) + # Find out which people have changed status + change_idx = self.individuals.index[self.individuals[ColumnNames.DISEASE_STATUS_CHANGED] == True] + + # Now set their new behaviour + self.individuals.loc[change_idx] = \ + self.individuals.loc[change_idx].apply( + func=Microsim._set_new_behaviour, args=( list(self.activity_locations.keys()), ), axis=1) + # self.individuals.loc[change_idx].swifter.progress_bar(True, desc="Changing behaviour of infected"). \ + + print(f"\tCurrent statuses:" + f"\n\t\tSusceptible ({ColumnNames.DiseaseStatuses.SUSCEPTIBLE}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.SUSCEPTIBLE])}" + f"\n\t\tExposed ({ColumnNames.DiseaseStatuses.EXPOSED}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.EXPOSED])}" + f"\n\t\tPresymptomatic ({ColumnNames.DiseaseStatuses.PRESYMPTOMATIC}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.PRESYMPTOMATIC])}" + f"\n\t\tSymptomatic ({ColumnNames.DiseaseStatuses.SYMPTOMATIC}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.SYMPTOMATIC])}" + f"\n\t\tAsymptomatic ({ColumnNames.DiseaseStatuses.ASYMPTOMATIC}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.ASYMPTOMATIC])}" + f"\n\t\tRecovered ({ColumnNames.DiseaseStatuses.RECOVERED}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.RECOVERED])}" + f"\n\t\tRemoved/dead ({ColumnNames.DiseaseStatuses.DEAD}): {len(self.individuals.loc[self.individuals[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.DEAD])}") + + #self.individuals.loc[change_idx].apply(func=self._set_new_behaviour, axis=1) + #print("... finished") + + @staticmethod + def _set_new_behaviour(row: pd.Series, activities: List[str]): + """ + Define how someone with the disease should behave. This is called for every individual whose disease status + has changed between the current iteration and the previous one + :param row: A row of the individuals dataframe + :return: An updated row with new ACTIVITY_DURATION columns to reflect the changes in proportion of time that + the individual spends doing the different activities. + """ + # Maybe need to put non-symptomatic people back to normal behaviour (or do nothing if they e.g. 
transfer from + # Susceptible to Pre-symptomatic, which means they continue doing normal behaviour) + # Minor bug: this will erode any changes caused by lockdown behaviour for the rest of this iteration, but this + # only affects people whose status has just changed so only a minor problem + if row[ColumnNames.DISEASE_STATUS] in [ColumnNames.DiseaseStatuses.SUSCEPTIBLE, + ColumnNames.DiseaseStatuses.EXPOSED, + ColumnNames.DiseaseStatuses.PRESYMPTOMATIC, + ColumnNames.DiseaseStatuses.ASYMPTOMATIC, + ColumnNames.DiseaseStatuses.RECOVERED, + ColumnNames.DiseaseStatuses.DEAD]: + for activity in activities: + row[f"{activity}{ColumnNames.ACTIVITY_DURATION}"] = \ + row[f"{activity}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + + # Put newly symptomatic people at home + elif row[ColumnNames.DISEASE_STATUS] == ColumnNames.DiseaseStatuses.SYMPTOMATIC: + # Reduce all activities, replacing the lost time with time spent at home + non_home_activities = set(activities) + non_home_activities.remove(ColumnNames.Activities.HOME) + total_duration = 0.0 # Need to remember the total duration of time lost for non-home activities + for activity in non_home_activities: + #new_duration = row[f"{activity}{ColumnNames.ACTIVITY_DURATION}"] * 0.10 + new_duration = row[f"{activity}{ColumnNames.ACTIVITY_DURATION}"] * 0.50 + total_duration += new_duration + row[f"{activity}{ColumnNames.ACTIVITY_DURATION}"] = new_duration + # Now set home duration to fill in the time lost from doing other activities. + row[f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = (1 - total_duration) + else: + raise Exception(f"Unrecognised disease state for individual {row['ID']}: {row[ColumnNames.DISEASE_STATUS] }") + return row + + @staticmethod + def _make_a_copy(m): + """ + When copying a microsim object, reset the seed + + :param m: A Microsim object + :return: A deep copy of the microsim object + """ + m.random.seed() + return copy.deepcopy(m) + + def step(self) -> None: + """ + Step (iterate) the model for 1 iteration + + :return: + """ + self.iteration += 1 + + print(f"\nIteration: {self.iteration}\n") + + # Unilaterally adjust the proportions of time that people spend doing different activities after lockdown + self.update_behaviour_during_lockdown() + + # Update the danger associated with each venue (i.e. the as people with the disease visit them they + # become more dangerous) then update the risk to each individual of going to those venues. + self.update_venue_danger_and_risks() + + # Update disease counters. E.g. count diseases in MSOAs & households + # No longer update disease counts per MSOA etc. Not needed + #self.update_disease_counts() + + # Calculate new disease status and update the people's behaviour + if not self.disable_disease_status: + self.calculate_new_disease_status() + self.change_behaviour_with_disease() + + def run(self, iterations: int, repnr: int) -> None: + """ + Run the model (call the step() function) for the given number of iterations. Repnr is used to create new unique directory for this repeat + :param iterations: + """ + # Create directories for the results + self._init_output(repnr) + + # Initialise the R interface. 
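A note on _make_a_copy above: reseeding before the deep copy means every repetition starts from a different random state rather than replaying the template model's stream. A toy sketch of the same pattern (assuming the model's random attribute behaves like a random.Random instance):

import copy
import random

class Toy:
    def __init__(self):
        self.random = random.Random(12345)  # the "template" starts with a fixed seed

template = Toy()
copies = []
for _ in range(3):
    template.random.seed()                  # reseed from OS entropy before each copy...
    copies.append(copy.deepcopy(template))  # ...so each copy carries a different state

print([c.random.random() for c in copies])  # almost certainly three different values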
Do this here, rather than in init, because when in multiprocessing mode + # at this point the Microsim object will be in its own process + if not self.disable_disease_status: + self.r_int = RInterface(self.r_script_dir) + + # Step the model + for i in range(iterations): + iter_start_time = time.time() # time how long each iteration takes (for info) + self.step() + + # Add to items to pickle for visualisations + if self.output: + print("\tPreparing output ... " + + "(but not writing anything until the end)" if not self.output_every_iteration else "",) + # Add the new column names for this iteration's disease statuses + # (Force column names to have leading zeros) + self.individuals_to_pickle[f"{ColumnNames.DISEASE_STATUS}{(i + 1):03d}"] = self.individuals[ + ColumnNames.DISEASE_STATUS] + # Write the output at the end, or at every iteration + if i == (iterations-1) or self.output_every_iteration: + print("\t\tWriting individuals file... ") + fname = os.path.join(self.output_dir, "Individuals") + with open(fname + ".pickle", "wb") as pickle_out: + pickle.dump(self.individuals_to_pickle, pickle_out) + # Also make a (compressed) csv file for others + self.individuals_to_pickle.to_csv(fname + ".csv.gz", compression='gzip') + + for name in self.activity_locations: + # Get the details of the location activity + activity = self.activity_locations[name] # Pointer to the ActivityLocation object + loc_name = activity.get_name() # retail, school etc + # loc_ids = activity.get_ids() # List of the IDs of the locations + loc_dangers = activity.get_dangers() # List of the current dangers + # Add a new danger column to the previous dataframe + self.activities_to_pickle[loc_name][f"{ColumnNames.LOCATION_DANGER}{(i + 1):03d}"] = loc_dangers + # Save this activity location + if i == (iterations - 1) or self.output_every_iteration: + print(f"\t\tWriting activity file for {name}... ") + fname = os.path.join(self.output_dir, loc_name) + with open(fname + ".pickle", "wb") as pickle_out: + pickle.dump(self.activities_to_pickle[loc_name], pickle_out) + # Also make a (compressed) csv file for others + self.activities_to_pickle[loc_name].to_csv(fname + ".csv.gz", compression='gzip') + # self.activities_to_pickle[loc_name].to_csv(fname+".csv") # They not so big so don't compress + + print(f"\tIteration {i} took {round(float(time.time() - iter_start_time), 2)}s") + + print(f"Model finished running (iterations: {i+1})") + + # TEMP WRITE OUTPUT AT END + #fname = os.path.join(self.output_dir, "Individuals") + #with open(fname + ".pickle", "wb") as pickle_out: + # pickle.dump(self.individuals_to_pickle, pickle_out) + #self.individuals_to_pickle.to_csv(fname + ".csv.gz", compression='gzip') + #for name in self.activity_locations: + # loc_name = self.activity_locations[name].get_name() + # fname = os.path.join(self.output_dir, loc_name) + # with open(fname + ".pickle", "wb") as pickle_out: + # pickle.dump(self.activities_to_pickle[loc_name], pickle_out) + # # Also make a (compressed) csv file for others + # self.activities_to_pickle[loc_name].to_csv(fname + ".csv.gz", compression='gzip') + + +# ******** +# PROGRAM ENTRY POINT +# Uses 'click' library so that it can be run from the command line +# ******** +@click.command() +@click.option('-p', '--parameters_file', default="./model_parameters/default.yml", type=click.Path(exists=True), + help="Parameters file to use to configure the model. 
Default: ./model_parameters/default.yml") +@click.option('-npf', '--no-parameters-file', is_flag=True, + help="Don't read a parameters file, use command line arguments instead") +@click.option('-i', '--iterations', default=10, help='Number of model iterations. 0 means just run the initialisation') +@click.option('-s', '--scenario', default="default", help="Name this scenario; output results will be put into a " + "directory with this name.") +@click.option('--data-dir', default="devon_data", help='Root directory to load data from') +@click.option('--output/--no-output', default=True, + help='Whether to generate output data (default yes).') +@click.option('--output-every-iteration/--no-output-every-iteration', default=False, + help='Whether to generate output data at every iteration rather than just at the end (default no).') +@click.option('--debug/--no-debug', default=False, help="Whether to run some more expensive checks (default no debug)") +@click.option('-r', '--repetitions', default=1, help="How many times to run the model (default 1)") +@click.option('-l', '--lockdown-file', default="google_mobility_lockdown_daily.csv", + help="Optionally read lockdown mobility data from a file (default use google mobility). To have no " + "lockdown pass an empty string, i.e. --lockdown-file='' ") + +def run_script(parameters_file, no_parameters_file, iterations, data_dir, output, output_every_iteration, debug, + repetitions, lockdown_file, scenario): + + # First see if we're reading a parameters file or using command-line arguments. + if no_parameters_file: + print("Not reading a parameters file") + else: + print(f"Reading parameters file: {parameters_file}. Any other command-line arguments are being ignored") + with open(parameters_file, 'r') as f: + parameters = load(f, Loader=SafeLoader) + sim_params = parameters["microsim"] # Parameters for the dynamic microsim (python) + calibration_params = parameters["microsim_calibration"] + disease_params = parameters["disease"] # Parameters for the disease model (r) + # TODO Implement a more elegant way to set the parameters and pass them to the model. 
E.g.: + # self.params, self.params_changed = Model._init_kwargs(params, kwargs) + # [setattr(self, key, value) for key, value in self.params.items()] + # Utility parameters + scenario = sim_params["scenario"] + iterations = sim_params["iterations"] + data_dir = sim_params["data-dir"] + output = sim_params["output"] + output_every_iteration = sim_params["output-every-iteration"] + debug = sim_params["debug"] + repetitions = sim_params["repetitions"] + lockdown_file = sim_params["lockdown-file"] + + # Check the parameters are sensible + if iterations < 0: + raise ValueError("Iterations must be > 0") + if repetitions < 1: + raise ValueError("Repetitions must be greater than 0") + if (not output) and output_every_iteration: + raise ValueError("Can't choose to not output any data (output=False) but also write the data at every " + "iteration (output_every_iteration=True)") + + print(f"Running model with the following parameters:\n" + f"\tParameters file: {parameters_file}\n" + f"\tScenario directory: {scenario}\n" + f"\tNumber of iterations: {iterations}\n" + f"\tData dir: {data_dir}\n" + f"\tOutputting results?: {output}\n" + f"\tOutputting results at every iteration?: {output_every_iteration}\n" + f"\tDebug mode?: {debug}\n" + f"\tNumber of repetitions: {repetitions}\n" + f"\tLockdown file: {lockdown_file}\n" + f"\tCalibration parameters: {'N/A (not reading parameters file)' if no_parameters_file else str(calibration_params)}\n") + + if iterations == 0: + print("Iterations = 0. Not stepping model, just assigning the initial risks.") + + # To fix file path issues, use absolute/full path at all times + # Pick either: get working directory (if user starts this script in place, or set working directory + # Option A: copy current working directory: + base_dir = os.getcwd() # get current directory + data_dir = os.path.join(base_dir, data_dir) + r_script_dir = os.path.join(base_dir, "R", "py_int") + + # Temporarily only want to use Devon MSOAs + devon_msoas = pd.read_csv(os.path.join(data_dir, "devon_msoas.csv"), header=None, + names=["x", "y", "Num", "Code", "Desc"]) + + # Use same arguments whether running 1 repetition or many + msim_args = {"data_dir": data_dir, "scen_dir": scenario, "r_script_dir": r_script_dir, + "output": output, "output_every_iteration": output_every_iteration, "debug": debug, + "lockdown_file": lockdown_file, + } + + if not no_parameters_file: # When using a parameters file, include the calibration parameters + msim_args.update(**calibration_params) # python calibration parameters are unpacked now + # Also read the R calibration parameters (this is a separate section in the .yml file) + if disease_params is not None: + # (If the 'disease_params' section is included but has no calibration variables then we want to ignore it - + # it will be turned into an empty dictionary by the Microsim constructor) + msim_args["disease_params"] = disease_params # R parameters kept as a dictionary and unpacked later + + # Temporily use dummy data for testing + # data_dir = os.path.join(base_dir, "dummy_data") + # m = Microsim(data_dir=data_dir, testing=True, output=output) + + # Run it! 
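For reference, once the YAML file is loaded above it is just a nested dict; a rough illustration of its shape (values mirror the command-line defaults and are illustrative, not the project's actual default.yml):

parameters = {
    "microsim": {
        "scenario": "default",
        "iterations": 10,
        "data-dir": "devon_data",
        "output": True,
        "output-every-iteration": False,
        "debug": False,
        "repetitions": 1,
        "lockdown-file": "google_mobility_lockdown_daily.csv",
    },
    "microsim_calibration": {},  # python-side calibration parameters, unpacked into Microsim(**kwargs)
    "disease": {},               # R-side parameters, passed through to the disease model as a dict
}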
+ if repetitions == 1: + # Create a microsim object + m = Microsim(**msim_args) + copyfile(parameters_file,os.path.join(m.SCEN_DIR,"parameters.yml")) + m.run(iterations,0) + + else: # Run it multiple times in lots of cores + try: + with multiprocessing.Pool(processes=int(os.cpu_count())) as pool: + # Copy the model instance so we don't have to re-read the data each time + # (Use a generator so we don't need to store all the models in memory at once). + m = Microsim(**msim_args) + copyfile(parameters_file,os.path.join(m.SCEN_DIR,"parameters.yml")) + models = (Microsim._make_a_copy(m) for _ in range(repetitions)) + pickle_out = open(os.path.join("Models_m.pickle"), "wb") + pickle.dump(m, pickle_out) + pickle_out.close() + # models = ( Microsim(msim_args) for _ in range(repetitions)) + # Also need a list giving the number of iterations for each model (same for each model) + iters = (iterations for _ in range(repetitions)) + repnr = (r for r in range(repetitions)) + # Run the models by passing each model and the number of iterations + pool.starmap(_run_multicore, zip(models, iters,repnr)) + finally: # Make sure they get closed (shouldn't be necessary) + pool.close() + + print("End of program") + + +def _run_multicore(m, iter,rep): + return m.run(iter,rep) + + +if __name__ == "__main__": + run_script() + print("End of program") diff --git a/build/lib/microsim/r_interface.py b/build/lib/microsim/r_interface.py new file mode 100644 index 00000000..1196c018 --- /dev/null +++ b/build/lib/microsim/r_interface.py @@ -0,0 +1,94 @@ +import pandas as pd +import rpy2.rinterface +import rpy2.robjects.packages as rpackages # For installing packages +import rpy2.robjects as ro +from rpy2.robjects import pandas2ri +pandas2ri.activate() +from microsim.column_names import ColumnNames + +class RInterface(): + """ + An RInterface object can be used to create an R session, initialise everything that is needed for the disease + status estimation function, and then interact with session to calculate the disease status. + """ + + def __init__(self, script_dir): + """ + + :param script_dir: The directory where the scripts can be found + """ + print(f"Initialising R interface. Loading R scripts in {script_dir}.") + self.script_dir = script_dir # Useful to remember for debugging, but not actually needed + R = ro.r + R.setwd(script_dir) + try: + # Read the script (doesn't run any functions) + R.source("covid_run.R") + # Call a function to initialize the needed R packages and data + R.initialize_r() + except rpy2.rinterface.embedded.RRuntimeError as e: + # R libraries probably need installing. THe lines below *should* do this, but it's probably better + # to install them manually. + #print("\tInstalling required libraries") + #RInterface._install_libs(R) + print(f"\tError trying to start R: {str(e)}. Libraries probably need installing. Look in the file" + f"'R/py_int/covid_run.R' to see which libraries are needed.") + raise e + + # Remember the session + self.R = R + + def calculate_disease_status(self, individuals: pd.DataFrame, iteration: int, disease_params: dict ): + """ + Call the R 'run_status' function to calculate the new disease status. It will return a new dataframe with + a few columns, including the new status. + :param individuals: The individuals dataframe from which new statuses need to be calculated + :return: a new dataframe that includes new disease statuses + """ + print("\tCalculating new disease status...", end='') + # It's expesive to convert large dataframes, only give the required columns to R. 
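The conversion being trimmed down here is handled by pandas2ri: with it activated, pandas DataFrames passed to an R function arrive as data.frames, and data.frames returned by R come back as pandas DataFrames. A toy round-trip (not part of this diff):

import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

pandas2ri.activate()
R = ro.r
R('add_one <- function(df) { df$current_risk <- df$current_risk + 1; df }')  # define a toy R function
df = pd.DataFrame({"id": [1, 2], "current_risk": [0.1, 0.2]})
out = R['add_one'](df)   # df converted to an R data.frame, the result converted back
print(out)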
+ cols = ["area", "House_ID", "ID", "age", "Sex", ColumnNames.CURRENT_RISK, "pnothome", + ColumnNames.DISEASE_STATUS, ColumnNames.DISEASE_PRESYMP, ColumnNames.DISEASE_SYMP_DAYS, + ColumnNames.DISEASE_EXPOSED_DAYS] + individuals_reduced = individuals.loc[:, cols] + individuals_reduced["area"] = individuals_reduced.area.astype(str) + individuals_reduced["id"] = individuals_reduced.ID + del individuals_reduced["ID"] + individuals_reduced["house_id"] = individuals_reduced.House_ID + del individuals_reduced["House_ID"] + + # Call the R function. The returned object will be converted to a pandas dataframe implicitly + r_df = self.R.run_status(individuals_reduced, iteration, **disease_params) + + assert len(r_df) == len(individuals) + assert False not in list(r_df.ID.values == individuals.ID.values) # Check that person IDs are the same + + # Update the individuals dataframe with the new values + for col in [ColumnNames.DISEASE_STATUS, ColumnNames.DISEASE_PRESYMP, ColumnNames.DISEASE_SYMP_DAYS, ColumnNames.DISEASE_EXPOSED_DAYS]: + individuals[col] = list(r_df[col]) + assert False not in (individuals.loc[individuals[ColumnNames.DISEASE_STATUS] > 0, ColumnNames.DISEASE_STATUS].values == + r_df.loc[r_df[ColumnNames.DISEASE_STATUS] > 0, ColumnNames.DISEASE_STATUS].values) + + print(" .... finished.") + return individuals + + + @staticmethod + def _install_libs(R: rpy2.robjects.r): + """Install the libraries required to run the script using the given R session. + See https://rpy2.github.io/doc/latest/html/introduction.html.""" + # TODO (not important) see if this method works properly. Not important because libraries can be pre-loaded + # and included in the anaconda environment anyway + # import R's utility package + utils = rpackages.importr('utils') + # select a mirror for R packages + utils.chooseCRANmirror(ind=1) # select the first mirror in the list + # R package names + packnames = ("dplyr", "tidyr", "janitor", "readr", "mixdist") + # R vector of strings + from rpy2.robjects.vectors import StrVector + # Selectively install what needs to be install. + # We are fancy, just because we can. + names_to_install = [x for x in packnames if not rpackages.isinstalled(x)] + if len(names_to_install) > 0: + utils.install_packages(StrVector(names_to_install)) diff --git a/build/lib/microsim/utilities.py b/build/lib/microsim/utilities.py new file mode 100644 index 00000000..caa98701 --- /dev/null +++ b/build/lib/microsim/utilities.py @@ -0,0 +1,43 @@ +# Contains some useful utility functionality +import pandas as pd +from typing import List + +class Optimise: + """ + Functions to optimise the memory use of pandas dataframes. 
+ From https://medium.com/bigdatarepublic/advanced-pandas-optimize-speed-and-memory-a654b53be6c2 + """ + + @staticmethod + def optimize(df: pd.DataFrame, datetime_features: List[str] = []): + return Optimise._optimize_floats(Optimise._optimize_ints(Optimise._optimize_objects(df, datetime_features))) + + + @staticmethod + def _optimize_floats(df: pd.DataFrame) -> pd.DataFrame: + floats = df.select_dtypes(include=['float64']).columns.tolist() + df[floats] = df[floats].apply(pd.to_numeric, downcast='float') + return df + + + @staticmethod + def _optimize_ints(df: pd.DataFrame) -> pd.DataFrame: + ints = df.select_dtypes(include=['int64']).columns.tolist() + df[ints] = df[ints].apply(pd.to_numeric, downcast='integer') + return df + + + @staticmethod + def _optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame: + for col in df.select_dtypes(include=['object']): + if col not in datetime_features: + num_unique_values = len(df[col].unique()) + num_total_values = len(df[col]) + if float(num_unique_values) / num_total_values < 0.5: + df[col] = df[col].astype('category') + else: + df[col] = pd.to_datetime(df[col]) + return df + + + diff --git a/build/lib/tests/__init__.py b/build/lib/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/tests/test_dashboard.py b/build/lib/tests/test_dashboard.py new file mode 100644 index 00000000..c9ac9f5e --- /dev/null +++ b/build/lib/tests/test_dashboard.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue May 19 12:01:02 2020 + +@author: Natalie +""" + + +import pytest +import os +import pickle +import pandas as pd +import numpy as np +import microsim.dashboard as dash +from yaml import load, SafeLoader + + +# Check preprocessing with dummy data + +# # code to create pickle files from csv +# retail = pd.read_csv("Retail1.csv") +# pickle_out = open("Retail.pickle", "wb") +# pickle.dump(retail, pickle_out) +# pickle_out.close() +# individuals = pd.read_csv("Individuals1.csv") +# pickle_out = open("Individuals.pickle", "wb") +# pickle.dump(individuals, pickle_out) +# pickle_out.close() + + +# create parameters used in later tests +@pytest.fixture +def example_input_params(): + params = {} + params["base_dir"] = os.getcwd() # get current directory (usually RAMP-UA) + params["data_dir"] = "dummy_data" + params["start_day"] = 0 + params["end_day"] = 10 + params["start_run"] = 0 + params["end_run"] = 1 + params["sc_dir"] = "output" + params["sc_nam"] = "Test scenario" + + params["conditions_dict"] = { + "susceptible": 0, + "exposed": 1, + "presymptomatic": 2, + "symptomatic": 3, + "asymptomatic": 4, + "recovered": 5, + "dead": 6, + } + + params["locations_dict"] = { + "Retail": "Retail", + } + params["data_dir"] = os.path.join(params["base_dir"], params["data_dir"]) # update data dir + params["nr_days"] = params["end_day"] - params["start_day"] + 1 + params["nr_runs"] = params["end_run"] - params["start_run"] + 1 + params["r_range"] = range(params["start_run"], params["end_run"]+1) + return params + + +# just to check pytest +# def test_always_passes(): +# assert True + +# def test_always_fails(): +# assert False + + +# check input parameters +def test_check_input(example_input_params): + assert example_input_params["nr_days"] == 11 + +def test_calc_nr_days(example_input_params): + nr_days = dash.calc_nr_days(os.path.join(example_input_params["data_dir"], example_input_params["sc_dir"],"0","Retail.pickle")) + assert nr_days == example_input_params["nr_days"] + +@pytest.fixture +def 
example_dangers_dict(example_input_params): + dangers_dict, dangers_dict_std, dangers_dict_3d = dash.create_venue_dangers_dict(example_input_params["locations_dict"],example_input_params["r_range"],os.path.join(example_input_params["data_dir"],example_input_params["sc_dir"]),example_input_params["start_day"],example_input_params["end_day"],example_input_params["start_run"],example_input_params["nr_runs"]) + return dangers_dict + +def test_create_venue_dangers_dict(example_input_params): + dangers_dict, dangers_dict_std, dangers_dict_3d = dash.create_venue_dangers_dict(example_input_params["locations_dict"],example_input_params["r_range"],os.path.join(example_input_params["data_dir"],example_input_params["sc_dir"]),example_input_params["start_day"],example_input_params["end_day"],example_input_params["start_run"],example_input_params["nr_runs"]) + # check there are no missing data + assert dangers_dict_3d['Retail'].shape == (20, 11, 2) + assert dangers_dict_std['Retail'].shape == (20, 11) + assert dangers_dict['Retail'].shape == (20, 11) + + # check mean and standard deviation for 1 element + assert dangers_dict['Retail'].iloc[0,10] == (dangers_dict_3d['Retail'][0,10,0] + dangers_dict_3d['Retail'][0,10,1]) / 2 + assert dangers_dict_std['Retail'].iloc[0,10] == np.array([dangers_dict_3d['Retail'][0,10,0], dangers_dict_3d['Retail'][0,10,1]]).std() + + + +def test_create_difference_dict(example_input_params, example_dangers_dict): + # subtract from itself to give 0's + result = dash.create_difference_dict(example_dangers_dict,example_dangers_dict,example_input_params["locations_dict"]) + assert result["Retail"].sum(axis=1).sum(axis=0) == 0 + # create some dummy data + dict1 = {"Retail": pd.DataFrame ({'1': [1, 2],'2': [1, 2]}, columns = ['1','2'])} + dict2 = {"Retail": pd.DataFrame ({'1': [2, 3],'2': [2, 4]}, columns = ['1','2'])} + result2 = dash.create_difference_dict(dict1,dict2,example_input_params["locations_dict"]) + assert result2["Retail"].sum(axis=1).sum(axis=0) == 5 + assert result2["Retail"].iloc[0,0] == 1 + + +def test_create_msoa_dangers_dict(example_input_params, example_dangers_dict): + + msoa_codes = pd.Series(["E02004164","E02004164","E02004164","E02004164","E02004164","E02004164","E02004165","E02004165","E02004165","E02004165","E02004165","E02004165","E02004165","E02004169","E02004169","E02004169","E02004169", "E02004169","E02004169","E02004169"]) + + dangers_msoa_dict = dash.create_msoa_dangers_dict(example_dangers_dict,["Retail"],[msoa_codes]) + assert dangers_msoa_dict['Retail'].shape == (3, 11) + assert dangers_msoa_dict['Retail'].iloc[2,10] == example_dangers_dict['Retail'].iloc[13:20,10].mean() + + + +def test_create_counts_dict(example_input_params): + + age_cat = np.array([[0, 19], [20, 29], [30,44], [45,59], [60,74], [75,200]]) + msoas, totalcounts_dict, cumcounts_dict, agecounts_dict, msoacounts_dict, cumcounts_dict_3d, totalcounts_dict_std, cumcounts_dict_std, agecounts_dict_std, msoacounts_dict_std, totalcounts_dict_3d, agecounts_dict_3d, msoacounts_dict_3d, uniquecounts_dict_3d, uniquecounts_dict_std, uniquecounts_dict = dash.create_counts_dict(example_input_params["conditions_dict"],example_input_params["r_range"],os.path.join(example_input_params["data_dir"], example_input_params["sc_dir"]),example_input_params["start_day"],example_input_params["end_day"],example_input_params["start_run"],example_input_params["nr_runs"],age_cat) + + assert msoas == ["E02004164","E02004165","E02004169"] + + assert len(uniquecounts_dict_3d) == 7 + + assert 
uniquecounts_dict_3d['susceptible'].shape == (2,) + assert uniquecounts_dict_std['susceptible'].shape == (1,) + assert uniquecounts_dict['susceptible'].shape == (1,) + assert uniquecounts_dict_3d["asymptomatic"][1] == 0 + assert uniquecounts_dict['symptomatic'][0] == (uniquecounts_dict_3d['symptomatic'][0] + uniquecounts_dict_3d['symptomatic'][1]) / 2 + assert uniquecounts_dict_std['dead'][0] == np.array([uniquecounts_dict_3d['dead'][0], uniquecounts_dict_3d['dead'][1]]).std() + + assert cumcounts_dict_3d['exposed'].shape == (3,2) + assert cumcounts_dict_std['exposed'].shape == (3,) + assert cumcounts_dict['exposed'].shape == (3,) + assert cumcounts_dict_3d["asymptomatic"][2,0] == 1 + assert cumcounts_dict_3d["susceptible"][2,1] == 4 + assert cumcounts_dict['symptomatic'][1] == (cumcounts_dict_3d['symptomatic'][1,0] + cumcounts_dict_3d['symptomatic'][1,1]) / 2 + assert cumcounts_dict_std['dead'][0] == np.array([cumcounts_dict_3d['dead'][0,0], cumcounts_dict_3d['dead'][0,1]]).std() + + + assert totalcounts_dict_3d['dead'].shape == (11,2) + assert totalcounts_dict_std['dead'].shape == (11,) + assert totalcounts_dict['dead'].shape == (11,) + assert totalcounts_dict_3d["asymptomatic"][5,1] == 1 + assert totalcounts_dict['symptomatic'][10] == (totalcounts_dict_3d['symptomatic'][10,0] + totalcounts_dict_3d['symptomatic'][10,1]) / 2 + assert totalcounts_dict_std['dead'][2] == np.array([totalcounts_dict_3d['dead'][2,0], totalcounts_dict_3d['dead'][2,1]]).std() + + assert agecounts_dict_3d['asymptomatic'].shape == (6,11,2) + assert agecounts_dict_std['asymptomatic'].shape == (6,11) + assert agecounts_dict['asymptomatic'].shape == (6,11) + assert agecounts_dict_3d["presymptomatic"][3,5,1] == 1 + assert agecounts_dict['asymptomatic'].iloc[1,8] == (agecounts_dict_3d['asymptomatic'][1,8,0] + agecounts_dict_3d['asymptomatic'][1,8,1]) / 2 + assert agecounts_dict_std['exposed'].iloc[5,5] == np.array([agecounts_dict_3d['exposed'][5,5,0], agecounts_dict_3d['exposed'][5,5,1]]).std() + + assert msoacounts_dict_3d['asymptomatic'].shape == (3,11,2) + assert msoacounts_dict_std['asymptomatic'].shape == (3,11) + assert msoacounts_dict['asymptomatic'].shape == (3,11) + assert msoacounts_dict_3d["exposed"][1,6,1] == 3 + assert msoacounts_dict['presymptomatic'].iloc[1,5] == (msoacounts_dict_3d['presymptomatic'][1,5,0] + msoacounts_dict_3d['presymptomatic'][1,5,1]) / 2 + assert msoacounts_dict_std['dead'].iloc[0,10] == np.array([msoacounts_dict_3d['dead'][0,10,0], msoacounts_dict_3d['dead'][0,10,1]]).std() + + + + diff --git a/build/lib/tests/test_microsim_model.py b/build/lib/tests/test_microsim_model.py new file mode 100644 index 00000000..08ba2b03 --- /dev/null +++ b/build/lib/tests/test_microsim_model.py @@ -0,0 +1,687 @@ +import os +import pytest +import multiprocessing +import pandas as pd +import numpy as np +import copy +from microsim.microsim_model import Microsim +from microsim.column_names import ColumnNames +from microsim.activity_location import ActivityLocation + +# ******************************************************** +# These tests run through a whole dummy model process +# ******************************************************** + +test_dir = os.path.dirname(os.path.abspath(__file__)) + +# arguments used when calling the Microsim constructor. 
Usually these are the same +microsim_args = {"data_dir": os.path.join(test_dir, "dummy_data"), + "r_script_dir": os.path.normpath(os.path.join(test_dir, "..", "R/py_int")), + "testing": True, "debug": True, + "disable_disease_status": True, 'lockdown_file': ""} + + +# This 'fixture' means that other functions (e.g. step) can use the object created here. +# Note: Don't try to run this test, it will be called when running the others that need it, like `test_step()`. +@pytest.fixture() +def test_microsim(): + """Test the microsim constructor by reading dummy data. The microsim object created here can then be passed + to other functions for them to do their tests + """ + with pytest.raises(FileNotFoundError): + # This should fail because the directory doesn't exist + args = microsim_args.copy() + args['data_dir'] = "./bad_directory" + m = Microsim(**args) + + m = Microsim(**microsim_args) + + # Check that the dummy data have been read in correctly. E.g. check the number of individuals is + # accurate, that they link to households correctly, that they have the right *flows* to the right + # *destinations* and the right *durations* etc. + + assert len(m.individuals) == 17 + + # Households + # (The households df should be the same as the one in the corresponding activity location) + assert m.activity_locations[f"{ColumnNames.Activities.HOME}"]._locations.equals(m.households) + # All flows should be to one location (single element [1.0]) + for flow in m.individuals[f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_FLOWS}"]: + assert flow == [1.0] + + # House IDs are the same as the row index + assert False not in list(m.households.index == m.households.ID) + + # First two people live together in first household + assert list(m.individuals.loc[0:1, :][f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_VENUES}"].values) == [[0], [0]] + # This one lives on their own in the fourth house + assert list(m.individuals.loc[9:9, :][f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_VENUES}"].values) == [[3]] + # These three live together in the last house + assert list(m.individuals.loc[13:15, :][f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_VENUES}"].values) == [[6], [6], [6]] + + # Workplaces + # All flows should be to one location (single element [1.0]) + for flow in m.individuals[f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_FLOWS}"]: + assert flow == [1.0] + # First person is the only one who does that job + assert len(list(m.individuals.loc[0:0, ][f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"])) + job_index = list(m.individuals.loc[0:0, ][f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"])[0][0] + for work_id in m.individuals.loc[1:len(m.individuals), f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"]: + assert work_id[0] != job_index + # Three people do the same job as second person + job_index = list(m.individuals.loc[1:1, ][f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"])[0] + assert list(m.individuals.loc[4:4, f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"])[0] == job_index + assert list(m.individuals.loc[13:13, f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"])[0] == job_index + # Not this person: + assert list(m.individuals.loc[15:15, f"{ColumnNames.Activities.WORK}{ColumnNames.ACTIVITY_VENUES}"])[0] != job_index + + # Test Shops + shop_locs = m.activity_locations[ColumnNames.Activities.RETAIL]._locations + assert len(shop_locs) == 248 + # First person has these flows and venues + venue_ids = 
list(m.individuals.loc[0:0, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_VENUES}"])[0] + # flows = list(m.individuals.loc[0:0, f"Retail{ColumnNames.ACTIVITY_FLOWS}"])[0] + # These are the venues in the filename: + raw_venues = sorted([24, 23, 22, 21, 19, 12, 13, 25, 20, 17]) + # Mark counts from 1, so these should be 1 greater than the ids + assert [x - 1 for x in raw_venues] == venue_ids + # Check the indexes point correctly + assert shop_locs.loc[0:0, ColumnNames.LOCATION_NAME].values[0] == "Co-op Lyme Regis" + assert shop_locs.loc[18:18, ColumnNames.LOCATION_NAME].values[0] == "Aldi Honiton" + + # Test Schools (similar to house/work above) (need to do for primary and secondary) + primary_locs = m.activity_locations[f"{ColumnNames.Activities.PRIMARY}"]._locations + secondary_locs = m.activity_locations[f"{ColumnNames.Activities.SECONDARY}"]._locations + # All schools are read in from one file, both primary and secondary + assert len(primary_locs) == 350 + assert len(secondary_locs) == 350 + assert primary_locs.equals(secondary_locs) + # Check primary and secondary indexes point to primary and secondary schools respectively + for indexes in m.individuals.loc[:, f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"]: + for index in indexes: + assert primary_locs.loc[index, "PhaseOfEducation_name"] == "Primary" + for indexes in m.individuals.loc[:, f"{ColumnNames.Activities.SECONDARY}{ColumnNames.ACTIVITY_VENUES}"]: + for index in indexes: + assert secondary_locs.loc[index, "PhaseOfEducation_name"] == "Secondary" + + # First person has these flows and venues to primary school + # (we know this because, by coincidence, the first person lives in the area that has the + # first area name if they were ordered alphabetically) + list(m.individuals.loc[0:0, "area"])[0] == "E00101308" + venue_ids = list(m.individuals.loc[0:0, f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"])[0] + raw_venues = sorted([12, 110, 118, 151, 163, 180, 220, 249, 280]) + # Mark counts from 1, so these should be 1 greater than the ids + assert [x - 1 for x in raw_venues] == venue_ids + # Check the indexes point correctly + assert primary_locs.loc[12:12, ColumnNames.LOCATION_NAME].values[0] == "Axminster Community Primary Academy" + assert primary_locs.loc[163:163, ColumnNames.LOCATION_NAME].values[0] == "Milton Abbot School" + + # Second to last person lives in 'E02004138' which will be the last area recorded in Mark's file + assert list(m.individuals.loc[9:9, "area"])[0] == "E02004159" + venue_ids = list(m.individuals.loc[9:9, f"{ColumnNames.Activities.SECONDARY}{ColumnNames.ACTIVITY_VENUES}"])[0] + raw_venues = sorted([335, 346]) + # Mark counts from 1, so these should be 1 greater than the ids + assert [x - 1 for x in raw_venues] == venue_ids + # Check these are both secondary schools + for idx in venue_ids: + assert secondary_locs.loc[idx, "PhaseOfEducation_name"] == "Secondary" + # Check the indexes point correctly + assert secondary_locs.loc[335:335, ColumnNames.LOCATION_NAME].values[0] == "South Dartmoor Community College" + + # Finished initialising the model. Pass it to other tests who need it. + yield m # (this could be 'return' but 'yield' means that any cleaning can be done here + + print("Cleaning up .... 
(actually nothing to clean up at the moment)")
+
+
+# Test the home flows on the dummy data
+def test_add_home_flows(test_microsim):
+    ind = test_microsim.individuals  # save typing
+    # Using dummy data I know that there should be 2 people in household ID 0:
+    assert len(ind.loc[ind.House_ID == 0, :]) == 2
+    # And 3 people in house ID 1
+    assert len(ind.loc[ind.House_ID == 1, :]) == 3
+    # And 1 in house ID 7
+    assert len(ind.loc[ind.House_ID == 7, :]) == 1
+
+
+def test_read_school_flows_data(test_microsim):
+    """Check that flows to primary and secondary schools were read correctly"""
+    # Check primary and secondary have the same data (they're read together)
+    primary_schools = test_microsim.activity_locations[f"{ColumnNames.Activities.PRIMARY}"]._locations
+    secondary_schools = test_microsim.activity_locations[f"{ColumnNames.Activities.SECONDARY}"]._locations
+    assert primary_schools.equals(secondary_schools)
+    # But they don't point to the same dataframe
+    primary_schools["TestCol"] = 0
+    assert "TestCol" not in list(secondary_schools.columns)
+
+    schools = primary_schools  # Just refer to them with one name
+
+    # Check correct number of primary and secondary schools
+    # (these don't need to sum to total schools because there are a couple of nurseries in there)
+    assert len(schools) == 350
+    primary_schools = schools.loc[schools.PhaseOfEducation_name == "Primary"]
+    secondary_schools = schools.loc[schools.PhaseOfEducation_name == "Secondary"]
+    assert len(primary_schools) == 309
+    assert len(secondary_schools) == 39
+
+    # Check all primary flows go to primary schools and secondary flows go to secondary schools
+    primary_flows = test_microsim.activity_locations[f"{ColumnNames.Activities.PRIMARY}"]._flows
+    secondary_flows = test_microsim.activity_locations[f"{ColumnNames.Activities.SECONDARY}"]._flows
+    # The following slice gives the total flow to each of the 350 schools (sum across rows for each column and then
+    # drop the first two columns, which are area ID and Code)
+    for school_no, flow in enumerate(primary_flows.sum(0)[2:]):
+        if flow > 0:
+            assert schools.iloc[school_no].PhaseOfEducation_name == "Primary"
+    for school_no, flow in enumerate(secondary_flows.sum(0)[2:]):
+        if flow > 0:
+            assert schools.iloc[school_no].PhaseOfEducation_name == "Secondary"
+
+
+def test_read_msm_data(test_microsim):
+    """Checks the individual microsimulation data are read correctly"""
+    assert len(test_microsim.individuals) == 17
+    assert len(test_microsim.households) == 8
+    # Check correct number of 'homeless' (this is OK because of how I set up the data)
+    with pytest.raises(Exception) as e:
+        Microsim._check_no_homeless(test_microsim.individuals, test_microsim.households, warn=False)
+    # This should raise an exception. Get the number of homeless from the message. Should be 15
+    num_homeless = [int(s) for s in str(e.value).split() if s.isdigit()][0]
+    print(f"Correctly found homeless: {num_homeless}")
+    assert num_homeless == 15
+
+
+# No longer updating disease counts. This can be removed.
+# def test_update_disease_counts(test_microsim): +# """Check that disease counts for MSOAs and households are updated properly""" +# m = test_microsim # less typing +# # Make sure no one has the disease to start with +# m.individuals[ColumnNames.DISEASE_STATUS] = 0 +# # (Shouldn't use _PID any more, this is a hangover to old version, but works OK with dummy data) +# m.individuals.loc[9, ColumnNames.DISEASE_STATUS] = 1 # lives alone +# m.individuals.loc[13, ColumnNames.DISEASE_STATUS] = 1 # Lives with 3 people +# m.individuals.loc[11, ColumnNames.DISEASE_STATUS] = 1 # | Live +# m.individuals.loc[12, ColumnNames.DISEASE_STATUS] = 1 # | Together +# # m.individuals.loc[:, ["PID", "HID", "area", ColumnNames.DISEASE_STATUS, "MSOA_Cases", "HID_Cases"]] +# m.update_disease_counts() +# # This person has the disease +# assert m.individuals.at[9, "MSOA_Cases"] == 1 +# assert m.individuals.at[9, "HID_Cases"] == 1 +# # These people live with someone who has the disease +# for p in [13, 14, 15]: +# assert m.individuals.at[p, "MSOA_Cases"] == 1 +# assert m.individuals.at[p, "HID_Cases"] == 1 +# # Two people in this house have the disease +# for p in [11, 12]: +# assert m.individuals.at[p, "MSOA_Cases"] == 2 +# assert m.individuals.at[p, "HID_Cases"] == 2 +# +# # Note: Can't fully test MSOA cases because I don't have any examples of people from different +# # households living in the same MSOA in the test data + + +def test_change_behaviour_with_disease(test_microsim): + """Check that individuals behaviour changed correctly with the disease status""" + m = copy.deepcopy(test_microsim) # less typing and so as not to interfere with other tests + # Give some people the disease (these two chosen because they both spend a bit of time in retail + p1 = 1 + p2 = 6 + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC # Behaviour change + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.PRESYMPTOMATIC # No change + + m.step() + m.change_behaviour_with_disease() # (this isn't called by default when testing) + + # Nothing should have happended as we hadn't indicated a change in disease status + for p, act in zip([p1, p1, p2, p2], [ColumnNames.Activities.HOME, ColumnNames.Activities.RETAIL, + ColumnNames.Activities.HOME, ColumnNames.Activities.RETAIL]): + assert m.individuals.loc[p, f"{act}{ColumnNames.ACTIVITY_DURATION}"] == \ + m.individuals.loc[p, f"{act}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + + # Mark behaviour changed then try again + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS_CHANGED] = True + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS_CHANGED] = True + + m.step() + m.change_behaviour_with_disease() # (this isn't called by default when testing) + + # First person should spend more time at home and less at work + assert m.individuals.loc[p1, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION}"] < m.individuals.loc[ + p1, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + assert m.individuals.loc[p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] > m.individuals.loc[ + p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + # Second person should be unchanged + assert m.individuals.loc[p2, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION}"] == m.individuals.loc[ + p2, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + assert m.individuals.loc[p2, 
f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] == m.individuals.loc[ + p2, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + + # Mark behaviour changed then try again + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS_CHANGED] = True + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS_CHANGED] = True + + m.step() + m.change_behaviour_with_disease() # (this isn't called by default when testing) + + # First person should spend more time at home and less at work + assert m.individuals.loc[p1, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION}"] < m.individuals.loc[ + p1, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + assert m.individuals.loc[p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] > m.individuals.loc[ + p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + + # Second person should be unchanged + assert m.individuals.loc[p2, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION}"] == m.individuals.loc[ + p2, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + assert m.individuals.loc[p2, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] == m.individuals.loc[ + p2, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + + # First person no longer infectious, behaviour should go back to normal + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.RECOVERED + m.step() + m.change_behaviour_with_disease() # (this isn't called by default when testing) + assert m.individuals.loc[p1, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION}"] == m.individuals.loc[ + p1, f"{ColumnNames.Activities.RETAIL}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + assert m.individuals.loc[p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] == m.individuals.loc[ + p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION_INITIAL}"] + + +def test_update_venue_danger_and_risks(test_microsim): + """Check that the current risk is updated properly""" + # This is actually tested as part of test_step + assert True + +def test_hazard_multipliers(test_microsim): + """ + This tests whether hazards for particular disease statuses or locations are multiplied properly. + The relevant code is in update_venue_danger_and_risks(). + + :param test_microsim: This is a pointer to the initialised model. Dummy data will have been read in, + but no stepping has taken place yet.""" + m = copy.deepcopy(test_microsim) # For less typing and so as not to interfere with other functions use test_microsim + + # Note: the following is a useul way to get relevant info about the individuals + # m.individuals.loc[:, ["ID", "PID", "HID", "area", ColumnNames.DISEASE_STATUS, "MSOA_Cases", "HID_Cases"]] + + # Set the hazard-related parameters. 
+ + # As we don't specify them when the tests are set up, they should be empty dictionaries + assert not m.hazard_location_multipliers + assert not m.hazard_individual_multipliers + + # Manually create some hazards for individuals and locationsas per the parameters file + m.hazard_individual_multipliers["presymptomatic"] = 1.0 + m.hazard_individual_multipliers["asymptomatic"] = 2.0 + m.hazard_individual_multipliers["symptomatic"] = 3.0 + for act in ColumnNames.Activities.ALL: + m.hazard_location_multipliers[act] = 1.0 + + # Step 0 (initialisation): + + # Everyone should start without the disease (they will have been assigned a status as part of initialisation) + m.individuals[ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SUSCEPTIBLE + + # + # Person 1: lives with one other person (p2). Both people spend all their time at home doing nothing else + # + p1 = 0 + p2 = 1 + + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.PRESYMPTOMATIC # Give p1 the disease + for p in [p1, p2]: # Set their activity durations to 0 except for home + for name, activity in m.activity_locations.items(): + m.individuals.at[p, f"{name}{ColumnNames.ACTIVITY_DURATION}"] = 0.0 + m.individuals.at[p, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = 1.0 + + m.step() + + # Check the disease has spread to the house with a multiplier of 1.0, but nowhere else + _check_hazard_spread(p1, p2, m.individuals, m.households, 1.0) + + # If the person is asymptomatic, we said the hazard should be doubled, so the risk should be doubled + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.ASYMPTOMATIC # Give p1 the disease + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SUSCEPTIBLE # Make sure p2 is clean + + m.step() + _check_hazard_spread(p1, p2, m.individuals, m.households, 2.0) + + # And for symptomatic we said 3.0 + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC # Give p1 the disease + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SUSCEPTIBLE # Make sure p2 is clean + + m.step() + _check_hazard_spread(p1, p2, m.individuals, m.households, 3.0) + + + # But if they both get sick then double danger and risk) + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC + m.step() + _check_hazard_spread(p1, p2, m.individuals, m.households, 6.0) + + # + # Now see if the hazards for locations work. Check houses and schools + # + + # Both people are symptomatic. And double the hazard for home. So in total the new risk should + # be 3 * 2 * 5 = 30 + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC + m.hazard_location_multipliers[ColumnNames.Activities.HOME] = 5.0 + + m.step() + _check_hazard_spread(p1, p2, m.individuals, m.households, 30.0) + + + # Check for school as well. Now give durations for home and school as 0.5. Make them asymptomatic so the additional + # hazard is 2.0 (set above). And make the risks for home 5.35 and for school 2.9. 
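+    # A hand-checkable sketch (assuming danger = n_infected_visitors * status_multiplier *
+    # duration * location_multiplier) of the arithmetic the assertions further down rely on:
+    #     def expected_danger(n_people, status_mult, duration, location_mult):
+    #         return n_people * status_mult * duration * location_mult
+    #     expected_danger(2, 2.0, 0.5, 5.35)  # -> 10.7 for HOME
+    #     expected_danger(2, 2.0, 0.5, 2.9)   # -> 5.8 for PRIMARY
+    # Each person's risk is then 0.5 * 10.7 + 0.5 * 5.8 = 8.25.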
+ + # Make sure all *other* individuals go to a different school (school 1), then make p1 and p2 go to the same school + # (school 0) below + # (annoying apply is because pandas doesn't like a list being assigned to a value in a cell) + m.individuals[f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"] = \ + m.individuals.loc[:, f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"].apply(lambda x: [1]) + m.individuals.loc[[p1, p2], f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"] = \ + m.individuals.loc[[p1, p2], f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"].apply(lambda x: [0]) + # All school flows need to be 1 (don't want the people to go to more than 1 school + m.individuals[f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_FLOWS}"] = \ + m.individuals.loc[:, f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_VENUES}"].apply(lambda x: [1.0]) + + for p in [p1, p2]: # Set their activity durations to 0.5 for home and school + for name, activity in m.activity_locations.items(): + m.individuals.at[p, f"{name}{ColumnNames.ACTIVITY_DURATION}"] = 0.0 + m.individuals.at[p, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = 0.5 + m.individuals.at[p, f"{ColumnNames.Activities.PRIMARY}{ColumnNames.ACTIVITY_DURATION}"] = 0.5 + # Make them asymptomatic + m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.ASYMPTOMATIC + m.individuals.loc[p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.ASYMPTOMATIC + # Set hazards for home and school + m.hazard_location_multipliers[ColumnNames.Activities.HOME] = 5.35 + m.hazard_location_multipliers[ColumnNames.Activities.PRIMARY] = 2.9 + + m.step() + + # Can't use _check_hazard_spread because it assumes only one activity (HOME) + # Current risks are: + # For home. 2 people * 2.0 asymptomatic hazard * 0.5 duration * 5.35 HOME risk = 10.7 + # For school. 2 people * 2.0 asymptomatic hazard * 0.5 duration * 2.9 PRIMARY risk = 5.8 + # Total risk for individuals: 10.7*0.5 + 5.8*0.5 = 8.25 + + # Individuals + for p in [p1, p2]: + assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 8.25 + for p in range(2, len(m.individuals)): + assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 0.0 + + # Households + assert m.households.at[0, ColumnNames.LOCATION_DANGER] == 10.7 + # (the self.households dataframe should be the same as the one stored in the activity_locations) + assert m.activity_locations[ColumnNames.Activities.HOME]._locations.at[0, ColumnNames.LOCATION_DANGER] == 10.7 + for h in range(1, len(m.households)): # all others are 0 + assert m.households.at[h, ColumnNames.LOCATION_DANGER] == 0.0 + + # Schools + assert m.activity_locations[ColumnNames.Activities.PRIMARY]._locations.at[0, ColumnNames.LOCATION_DANGER] == 5.8 + for h in range(1, len( m.activity_locations[ColumnNames.Activities.PRIMARY]._locations)): # all others are 0 + assert m.activity_locations[ColumnNames.Activities.PRIMARY]._locations.at[h, ColumnNames.LOCATION_DANGER] == 0.0 + + print("End of test hazard multipliers") + + +def _check_hazard_spread(p1, p2, individuals, households, risk): + """Checks how the disease is spreading. 
To save code repetition in test_hazard_multipliers"""
+    for p in [p1, p2]:
+        assert individuals.at[p, ColumnNames.CURRENT_RISK] == risk
+    for p in range(2, len(individuals)):
+        assert individuals.at[p, ColumnNames.CURRENT_RISK] == 0.0
+    assert households.at[0, ColumnNames.LOCATION_DANGER] == risk
+    for h in range(1, len(households)):  # all others are 0
+        assert households.at[h, ColumnNames.LOCATION_DANGER] == 0.0
+
+
+
+def test_step(test_microsim):
+    """
+    Test the step method. This is the main test of the model. Simulate a deterministic run through and
+    make sure that the model runs as expected.
+
+    The one thing it doesn't do is check, for retail, shopping, etc., that danger and risk increase by the correct
+    amount. It just checks that they go above 0 (or not). It does do more precise checks for home activities though.
+
+    :param test_microsim: This is a pointer to the initialised model. Dummy data will have been read in,
+    but no stepping has taken place yet."""
+    m = copy.deepcopy(test_microsim)  # For less typing and so as not to interfere with other functions use test_microsim
+
+    # Note: the following is a useful way to get relevant info about the individuals
+    # m.individuals.loc[:, ["ID", "PID", "HID", "area", ColumnNames.DISEASE_STATUS, "MSOA_Cases", "HID_Cases"]]
+
+    # Step 0 (initialisation):
+
+    # Everyone should start without the disease (they will have been assigned a status as part of initialisation)
+    m.individuals[ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SUSCEPTIBLE
+
+    #
+    # Person 1: lives with one other person (p2). Both people spend all their time at home doing nothing else
+    #
+    p1 = 0
+    p2 = 1
+
+    m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC  # Give them the disease
+    for p in [p1, p2]:  # Set their activity durations to 0
+        for name, activity in m.activity_locations.items():
+            m.individuals.at[p, f"{name}{ColumnNames.ACTIVITY_DURATION}"] = 0.0
+        m.individuals.at[p, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = 1.0  # Spend all their time at home
+
+    m.step()
+
+    # Check the disease has spread to the house but nowhere else
+    for p in [p1, p2]:
+        assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 1.0
+    for p in range(2, len(m.individuals)):
+        assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 0.0
+    assert m.households.at[0, ColumnNames.LOCATION_DANGER] == 1.0
+    for h in range(1, len(m.households)):  # all others are 0
+        assert m.households.at[h, ColumnNames.LOCATION_DANGER] == 0.0
+
+    m.step()
+
+    # Risk and danger stay the same (they do not accumulate over days)
+    for p in [p1, p2]:
+        assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 1.0
+    for p in range(2, len(m.individuals)):
+        assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 0.0
+    assert m.households.at[0, ColumnNames.LOCATION_DANGER] == 1.0
+    for h in range(1, len(m.households)):
+        assert m.households.at[h, ColumnNames.LOCATION_DANGER] == 0.0
+
+    # If the infected person doesn't go home (in this test they do absolutely nothing) then danger and risks should go
+    # back to 0
+    m.individuals.at[p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = 0.0
+    m.step()
+    for p in range(len(m.individuals)):
+        assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 0.0
+    for h in range(0, len(m.households)):
+        assert m.households.at[h, ColumnNames.LOCATION_DANGER] == 0.0
+
+    # But if they both get sick then they should be 2.0 (double danger and risk)
+    m.individuals.loc[p1:p2, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC  # Give them the disease
+    m.individuals.at[p1, f"{ColumnNames.Activities.HOME}{ColumnNames.ACTIVITY_DURATION}"] = 1.0  # Make the duration normal again
+    m.step()
+    for p in [p1, p2]:
+        assert m.individuals.at[p, ColumnNames.CURRENT_RISK] == 2.0
+    assert m.households.at[0, ColumnNames.LOCATION_DANGER] == 2.0
+    for h in range(1, len(m.households)):  # All other houses are danger free
+        assert m.households.at[h, ColumnNames.LOCATION_DANGER] == 0.0
+
+    #
+    # Now see what happens when one person gets the disease and spreads it to schools, shops and work
+    #
+    del p1, p2
+    p1 = 4  # The infected person is now the one with index 4
+    # Make everyone better except for that one person
+    m.individuals[ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SUSCEPTIBLE
+    m.individuals.loc[p1, ColumnNames.DISEASE_STATUS] = ColumnNames.DiseaseStatuses.SYMPTOMATIC
+    # Assign everyone equal time doing all activities
+    for name, activity in m.activity_locations.items():
+        m.individuals[f"{name}{ColumnNames.ACTIVITY_DURATION}"] = 1.0 / len(m.activity_locations)
+
+    m.step()
+
+    # Now check that the danger has propagated to locations and risk to people
+    # TODO Also check that the total risks and danger scores sum correctly
+    for name, activity in m.activity_locations.items():
+        # Indices of the locations that this person visited
+        visited_idx = m.individuals.at[p1, f"{name}{ColumnNames.ACTIVITY_VENUES}"]
+        not_visited_idx = list(set(range(len(activity._locations))) - set(visited_idx))
+        # Dangers should be >0.0 (or not if the person didn't visit there)
+        assert False not in list(activity._locations.loc[visited_idx, "Danger"].values > 0)
+        assert False not in list(activity._locations.loc[not_visited_idx, "Danger"].values == 0)
+        # Individuals should have an associated risk
+        for index, row in m.individuals.iterrows():
+            for idx in visited_idx:
+                if idx in row[f"{name}{ColumnNames.ACTIVITY_VENUES}"]:
+                    assert row[ColumnNames.CURRENT_RISK] > 0
+        # Note: can't check if risk is equal to 0 because it might come from another activity
+
+    print("End of test step")
+
+
+# ********************************************************
+# Other (unit) tests
+# ********************************************************
+
+def _get_rand(microsim, N=100):
+    """Get a random number using the Microsimulation object's random number generator"""
+    for _ in range(N):
+        microsim.random.random()
+    return microsim.random.random()
+
+
+def test_random():
+    """
+    Checks that the random classes produce different (or the same!) numbers when they should
+    :return:
+    """
+    m1 = Microsim(**microsim_args, read_data=False)
+    m2 = Microsim(**microsim_args, random_seed=2.0, read_data=False)
+    m3 = Microsim(**microsim_args, random_seed=2.0, read_data=False)
+
+    # Generate a random number from each model. The second two numbers should be the same
+    r1, r2, r3 = [_get_rand(x) for x in [m1, m2, m3]]
+
+    assert r1 != r2
+    assert r2 == r3
+
+    # Check that this still happens even if they are executed in pools.
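+    # (Presumably each Microsim seeds its own generator when it is constructed, before the pool
+    #  forks, so unseeded models should keep distinct random streams across worker processes.)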
+ # Create a large number of microsims and check that all random numbers are unique + pool = multiprocessing.Pool() + num_reps = 1000 + m = [Microsim(**microsim_args, read_data=False) for _ in range(num_reps)] + r = pool.map(_get_rand, m) + assert len(r) == len(set(r)) + + +def test_extract_msoas_from_indiviuals(): + """Check that a list of areas can be successfully extracted from a DataFrame of indviduals""" + individuals = pd.DataFrame(data={"area": ["C", "A", "F", "A", "A", "F"]}) + areas = Microsim.extract_msoas_from_indiviuals(individuals) + assert len(areas) == 3 + # Check the order is correct too + assert False not in [x == y for (x, y) in zip(areas, ["A", "C", "F"])] + + +@pytest.mark.skip(reason="No longer running this test as the functionality to select by study area has been removed") +def test_check_study_area(): + all_msoa_list = ["C", "A", "F", "B", "D", "E"] + individuals = pd.DataFrame( + data={"PID": [1, 2, 3, 4, 5, 6], "HID": [1, 1, 2, 2, 2, 3], "area": ["B", "B", "A", "A", "A", "D"], + "House_OA": ["B", "B", "A", "A", "A", "D"]}) + households = pd.DataFrame(data={"HID": [1, 2, 3], "area": ["B", "A", "D"]}) + + with pytest.raises(Exception): + # Check that it catches duplicate areas + assert Microsim.check_study_area(all_msoa_list, ["A", "A", "B"], individuals, households) + assert Microsim.check_study_area(all_msoa_list + ["A"], ["A", "B"], individuals, households) + # Check that it catches subset areas that aren't in the whole dataset + assert Microsim.check_study_area(all_msoa_list, ["A", "B", "G"], individuals, households) + + # Should return whole dataset if no subset is provided + assert Microsim.check_study_area(all_msoa_list, None, individuals, households)[0] == all_msoa_list + assert Microsim.check_study_area(all_msoa_list, [], individuals, households)[0] == all_msoa_list + + with pytest.raises(Exception): + # No individuals in area "E" so this should fail: + assert Microsim.check_study_area(all_msoa_list, ["A", "B", "E"], individuals, households) + + # Correctly subset and remove individuals + x = Microsim.check_study_area(all_msoa_list, ["A", "D"], individuals, households) + assert x[0] == ["A", "D"] # List of areas + assert list(x[1].PID.unique()) == [3, 4, 5, 6] # List of individuals + assert list(x[2].HID.unique()) == [2, 3] # List of households + + +def test__add_location_columns(): + df = pd.DataFrame(data={"Name": ['a', 'b', 'c', 'd']}) + with pytest.raises(Exception): # Should fail if lists are wrong length + Microsim._add_location_columns(df, location_names=["a", "b"], location_ids=None) + Microsim._add_location_columns(df, location_names=df.Name, location_ids=[1, 2]) + with pytest.raises(TypeError): # Can't get the length of None + Microsim._add_location_columns(df, location_names=None) + + # Call the function + x = Microsim._add_location_columns(df, location_names=df.Name) + assert x is None # Function shouldn't return anything. 
Does things inplace + # Default behaviour is just add columns + assert False not in (df.columns.values == ["Name", "ID", "Location_Name", "Danger"]) + assert False not in list(df.Location_Name == df.Name) + assert False not in list(df.ID == range(0, 4)) + assert False not in list(df.index == range(0, 4)) + # Adding columns again shouldn't change anything + Microsim._add_location_columns(df, location_names=df.Name) + assert False not in (df.columns.values == ["Name", "ID", "Location_Name", "Danger"]) + assert False not in list(df.Location_Name == df.Name) + assert False not in list(df.ID == range(0, 4)) + assert False not in list(df.index == range(0, 4)) + # See what happens if we give it IDs + Microsim._add_location_columns(df, location_names=df.Name, location_ids=[5, 7, 10, -1]) + assert False not in (df.columns.values == ["Name", "ID", "Location_Name", "Danger"]) + assert False not in list(df.ID == [5, 7, 10, -1]) + assert False not in list(df.index == range(0, 4)) # Index shouldn't change + # Shouldn't matter if IDs are Dataframes or Series + Microsim._add_location_columns(df, location_names=pd.Series(df.Name)) + assert False not in list(df.Location_Name == df.Name) + assert False not in list(df.index == range(0, 4)) # Index shouldn't change + Microsim._add_location_columns(df, location_names=df.Name, location_ids=np.array([5, 7, 10, -1])) + assert False not in list(df.ID == [5, 7, 10, -1]) + assert False not in list(df.index == range(0, 4)) # Index shouldn't change + # Set a weird index, the function should replace it with the row number + df = pd.DataFrame(data={"Name": ['a', 'b', 'c', 'd'], "Col2": [4, -6, 8, 1.4]}, ) + df.set_index("Col2") + Microsim._add_location_columns(df, location_names=df.Name) + assert False not in list(df.ID == range(0, 4)) + assert False not in list(df.index == range(0, 4)) + + # TODO dest that the _add_location_columns function correctly adds the required standard columns + # to a locaitons dataframe, and does appropriate checks for correct lengths of input lists etc. + + +def test__normalise(): + # TODO test the 'decimals' argument too. 
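+    # A sketch only (assuming 'decimals' simply rounds the normalised values) of roughly what
+    # the function under test is expected to do:
+    #     def normalise_sketch(l, decimals=8):
+    #         total = sum(l)
+    #         return [round(x / total, decimals) for x in l]
+    #     normalise_sketch([4, 6])  # -> [0.4, 0.6]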
+ # Should normalise so that the input list sums to 1 + # Fail if aa single number is given + for l in [2, 1]: + with pytest.raises(Exception): + Microsim._normalise(l) + + # 1-item lists should return [1.0] + for l in [[0.1], [5.3]]: + assert Microsim._normalise(l) == [1.0] + + # If numbers are the same (need to work out why these tests fail,the function seems OK) + # for l in [ [2, 2], [0, 0], [-1, -1], [1, 1] ]: + # assert Microsim._normalise(l) == [0.5, 0.5] + + # Other examples + assert Microsim._normalise([4, 6]) == [0.4, 0.6] + assert Microsim._normalise([40, 60]) == [0.4, 0.6] + assert Microsim._normalise([6, 6, 6, 6, 6]) == [0.2, 0.2, 0.2, 0.2, 0.2] + diff --git a/dist/RAMP_UA-0.2.1.dev0-py3.8.egg b/dist/RAMP_UA-0.2.1.dev0-py3.8.egg new file mode 100644 index 00000000..c266940c Binary files /dev/null and b/dist/RAMP_UA-0.2.1.dev0-py3.8.egg differ diff --git a/model_parameters/default.yml b/model_parameters/default.yml index 298d6090..a32b2c76 100644 --- a/model_parameters/default.yml +++ b/model_parameters/default.yml @@ -45,6 +45,11 @@ disease: # This is where the parameter for the disease model (the R part) can go asymp_rate: 0.4 output_switch: TRUE rank_assign: FALSE + local_outbreak_timestep: 7 + local_outbreak: TRUE + msoa_infect: "E02004161" + number_people_local: 100 + local_prob_increase: 1.0 overweight_sympt_mplier: 1.46 obesity_40: 1.9 obesity_35: 1.48 @@ -53,5 +58,4 @@ disease: # This is where the parameter for the disease model (the R part) can go cvd: 1 diabetes: 1 bloodpressure: 1 - improve_health: FALSE - + improve_health: FALSE \ No newline at end of file