Skip to content

Commit

Permalink
Merge pull request #22 from reichlab/Splines_code
Browse files Browse the repository at this point in the history
Adding code for splines models.
  • Loading branch information
IsaacMacarthur authored Jan 28, 2025
2 parents 9aa20f4 + 013241c commit 5005847
Show file tree
Hide file tree
Showing 10 changed files with 728 additions and 103 deletions.
45 changes: 45 additions & 0 deletions model-metadata/UMass-HMLR_FDSDN.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
team_name: "UMass Amherst"
team_abbr: "UMass"
model_name: "Hierarchical Multinomial Logistic Regression FDSDN"
model_abbr: "HMLR_FDSDN"
model_version: "1.0"
model_contributors: [
{
"name": "Isaac MacArthur",
"affiliation": "Department of Mathematics and Statistics, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Maryclare Griffin",
"affiliation": "Department of Mathematics and Statistics, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Evan L. Ray",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Nick G. Reich",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Thomas Robacker",
"affiliation": "Department of Mathematics and Statistics, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Benjamin W. Rogers",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachusetts Amherst",
"email": "[email protected]"
},
]
license: "CC-BY-4.0"
team_funding: "This work has been supported by the National Institutes of General Medical Sciences (R35GM119582) and the US Centers for Disease Control and Prevention’s Center for Forecasting and Outbreak Analytics (NU38FT000008). The content is solely the responsibility of the authors and does not necessarily represent the official views of NIGMS, the National Institutes of Health, or CDC."
methods: "A version of our model that has a shared sd between varaiants (sd_structure = F), uses time from start of dataset (time_treatment = D), uses a splines model for time, technically cubic-polynomials at the second (time_model = S), uses a Dirichlet-multinomial model (distribution = D), and has no spacial elements (spacial_correlation = N), it also assumes that the time coeffients for each variant are drawn from a N(bloc_v, bsd_v^2) distrubution, and bloc_v are idd N(0, 400) and bsd_v are idd draws from a trucated normal(1, 400) distrubution. The intercepts are drawn from a N(aloc_v, asd^2) distrubution, and aloc_v are idd N(0, 400) and asd is a draw from from a trucated normal(1, 400) distrubution. Here the prior on kappa is trucated normal(1,5)."
methods_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/blob/main/Model%20Details.pdf"
data_sources: "NextStrain"
ensemble_of_models: false
ensemble_of_hub_models: false
repo_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/tree/main"
45 changes: 45 additions & 0 deletions model-metadata/UMass-HMLR_FDSMN.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
team_name: "UMass Amherst"
team_abbr: "UMass"
model_name: "Hierarchical Multinomial Logistic Regression FDSMN"
model_abbr: "HMLR_FDSMN"
model_version: "1.0"
model_contributors: [
{
"name": "Isaac MacArthur",
"affiliation": "Department of Mathematics and Statistics, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Maryclare Griffin",
"affiliation": "Department of Mathematics and Statistics, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Evan L. Ray",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Nick G. Reich",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Thomas Robacker",
"affiliation": "Department of Mathematics and Statistics, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Benjamin W. Rogers",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachussetts Amherst",
"email": "[email protected]"
},
]
license: "CC-BY-4.0"
team_funding: "This work has been supported by the National Institutes of General Medical Sciences (R35GM119582) and the US Centers for Disease Control and Prevention’s Center for Forecasting and Outbreak Analytics (NU38FT000008). The content is solely the responsibility of the authors and does not necessarily represent the official views of NIGMS, the National Institutes of Health, or CDC."
methods: "A version of our model that has a shared sd between varaiants (sd_structure = F), uses time from start of dataset (time_treatment = D), uses a spline model for time, technically cubic-polynomials at the second (time_model = S), uses a multinomial model (distribution = M), and has no spacial elements (spacial_correlation = N), it also assumes that the time coeffients for each variant are drawn from a N(bloc_v, bsd_v^2) distrubution, and bloc_v are idd N(0, 400) and bsd_v are idd draws from a trucated normal(1, 400) distrubution. The intercepts are drawn from a N(aloc_v, asd^2) distrubution, and aloc_v are idd N(0, 400) and asd is a draw from from a trucated normal(1, 400) distrubution."
methods_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/blob/main/Model%20Details.pdf"
data_sources: "NextStrain"
ensemble_of_models: false
ensemble_of_hub_models: false
repo_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/tree/main"
45 changes: 45 additions & 0 deletions model-metadata/UMass-HMLR_TDSDN.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
team_name: "UMass Amherst"
team_abbr: "UMass"
model_name: "Hierarchical Multinomial Logistic Regression TDSDN"
model_abbr: "HMLR_TDSDN"
model_version: "1.0"
model_contributors: [
{
"name": "Isaac MacArthur",
"affiliation": "Department of Mathematics and Statistics, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Maryclare Griffin",
"affiliation": "Department of Mathematics and Statistics, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Evan L. Ray",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Nick G. Reich",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Thomas Robacker",
"affiliation": "Department of Mathematics and Statistics, University of Massachusetts Amherst",
"email": "[email protected]"
},
{
"name": "Benjamin W. Rogers",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachusetts Amherst",
"email": "[email protected]"
},
]
license: "CC-BY-4.0"
team_funding: "This work has been supported by the National Institutes of General Medical Sciences (R35GM119582) and the US Centers for Disease Control and Prevention’s Center for Forecasting and Outbreak Analytics (NU38FT000008). The content is solely the responsibility of the authors and does not necessarily represent the official views of NIGMS, the National Institutes of Health, or CDC."
methods: "A version of our model that has different sd for each varaiant (sd_structure = T), uses time from start of dataset (time_treatment = D), uses a spline model for time, technically cubic-polynomials at the second (time_model = S), uses a Dirichlet-multinomial model (distribution = D), and has no spacial elements (spacial_correlation = N), it also assumes that the time coeffients for each variant are drawn from a N(bloc_v, bsd_v^2) distrubution, and bloc_v are idd N(0, 400) and bsd_v are idd draws from a trucated normal(1, 400) distrubution. The intercepts are drawn from a N(aloc_v, asd_v^2) distrubution, and aloc_v are idd N(0, 400) and asd_v are idd draws from a trucated normal(1, 400) distrubution. Here the prior on kappa is trucated normal(1,5)"
methods_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/blob/main/Model%20Details.pdf"
data_sources: "NextStrain"
ensemble_of_models: false
ensemble_of_hub_models: false
repo_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/tree/main"
45 changes: 45 additions & 0 deletions model-metadata/UMass-HMLR_TDSMN.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
team_name: "UMass Amherst"
team_abbr: "UMass"
model_name: "Hierarchical Multinomial Logistic Regression TDSMN"
model_abbr: "HMLR_TDSMN"
model_version: "1.0"
model_contributors: [
{
"name": "Isaac MacArthur",
"affiliation": "Department of Mathematics and Statistics, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Maryclare Griffin",
"affiliation": "Department of Mathematics and Statistics, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Evan L. Ray",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Nick G. Reich",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Thomas Robacker",
"affiliation": "Department of Mathematics and Statistics, University of Massachussetts Amherst",
"email": "[email protected]"
},
{
"name": "Benjamin W. Rogers",
"affiliation": "Department of Biostatistics and Epidemiology, University of Massachussetts Amherst",
"email": "[email protected]"
},
]
license: "CC-BY-4.0"
team_funding: "This work has been supported by the National Institutes of General Medical Sciences (R35GM119582) and the US Centers for Disease Control and Prevention’s Center for Forecasting and Outbreak Analytics (NU38FT000008). The content is solely the responsibility of the authors and does not necessarily represent the official views of NIGMS, the National Institutes of Health, or CDC."
methods: "A version of our model that has different sd for each varaiant (sd_structure = T), uses time from start of dataset (time_treatment = D), uses a spline model for time, technically cubic-polynomials at the second (time_model = S), uses a multinomial model (distribution = M), and has no spacial elements (spacial_correlation = N), it also assumes that the time coeffients for each variant are drawn from a N(bloc_v, bsd_v^2) distrubution, and bloc_v are idd N(0, 400) and bsd_v are idd draws from a trucated normal(1, 400) distrubution. The intercepts are drawn from a N(aloc_v, asd_v^2) distrubution, and aloc_v are idd N(0, 400) and asd_v are idd draws from a trucated normal(1, 400) distrubution."
methods_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/blob/main/Model%20Details.pdf"
data_sources: "NextStrain"
ensemble_of_models: false
ensemble_of_hub_models: false
repo_url: "https://github.com/IsaacMacarthur/Umass-Virus_modeling-code/tree/main"
128 changes: 128 additions & 0 deletions src-model/HMLR_FDSDN.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
functions {
// function for dirichlet_multinomial_lpmf comes from https://discourse.mc-stan.org/t/transforming-a-multinomial-model-into-a-dirichlet-multinomial/26399/2
real dirichlet_multinomial_lpmf(int[] y, vector alpha) {
real sum_alpha = sum(alpha);
return lgamma(sum_alpha) - lgamma(sum(y) + sum_alpha)
+ sum(lgamma(to_vector(y) + alpha)) - sum(lgamma(alpha));
}

vector map_rect_likelihood(vector phi,vector params, real[] data_r, int[] data_i) {
int K = data_i[1]; // Number of clades
int B = data_i[2]; // Degrees of freedom for spline
int N = data_i[3]; // Number of days
int y[N, K];
{
int idx = 1;
for (n in 1:N) {
for (k in 1:K) {
y[n, k] = data_i[3 + idx];
idx += 1;
}
}
}

// Extract shared data
matrix[B, N] x = to_matrix(data_r[1:(B * N)], B, N);
vector[K-1] aloc = phi[1:K-1];
vector[K-1] bloc = phi[K:(2*K-2)];
real asd = phi[2*K -1];
real bsd = phi[2*K];
real kappa = phi[2*K+1];

// Extract location-specific parameters
vector[K-1] alpha_nc = segment(params, 1, K-1);
matrix[K-1, B] beta_nc = to_matrix(segment(params, K, (K-1)*B), K-1, B);

// Compute raw alpha and beta
vector[K] alpha = append_row(0, aloc + asd * alpha_nc);
matrix[K, B] beta;
for (b in 1:B) {
beta[:, b] = append_row(0, bloc + bsd * beta_nc[:, b]);
}

// Calculate log-likelihood
real acc = 0;
for (n in 1:N) {
if (sum(y[n, :]) > 0) {
vector[K] lambda = kappa * exp(alpha + beta * x[:, n]) / sum(exp(alpha + beta * x[:, n]));
acc += dirichlet_multinomial_lpmf(y[n, :] | lambda);
}
}

return [acc]'; // Return log-likelihood as a vector
}
}

data {
int<lower=0> N; // Number of days of samples
int<lower=1> L; // Number of locations
int<lower=1> K; // Number of clades
int<lower=3> B; // Degrees of freedom for the spline
int<lower=0> y[N, L, K]; // Count data
matrix[B, N] x; // Spline matrix
}

transformed data{
// Prepare shared data indices
int shared_data_i[L, 4 + N * K -1];
real shared_data_r[L,(B * N)];
for (l in 1:L) {
shared_data_r[l] = to_array_1d(x);
shared_data_i[l,1] = K;
shared_data_i[l,2] = B;
shared_data_i[l, 3] = N;
shared_data_i[l,(4):(4 + (N * K)-1)] = to_array_1d(y[:, l, :]);
}
}
parameters {
real<lower=0> bsd; // Prior sd for betas
real<lower=0> asd; // Prior sd for the alphas
real<lower=0> kappa; // Scale for the Dirichlet
real bloc[K-1]; // Prior means for betas
real aloc[K-1]; // Prior means for the alphas
vector[K-1] alpha_noncentered[L]; // Non-centered alpha parameters
matrix[K-1, B] beta_noncentered[L]; // Non-centered beta parameters
}

model {
// Priors
bsd ~ normal(1, 400);
asd ~ normal(1, 400);
bloc ~ normal(0, 400);
aloc ~ normal(0, 400);
kappa ~ normal(1, 5);

for (l in 1:L) {
alpha_noncentered[l] ~ normal(0, 1);
to_vector(beta_noncentered[l]) ~ normal(0, 1);
}

// Vectorize parameters for map_rect
vector[(K-1) + (K-1)*B] param_shards[L];
for (l in 1:L) {
param_shards[l] = append_row(alpha_noncentered[l], to_vector(beta_noncentered[l]));
}

// Prepare shared data
vector[K-1] aloc_vec = to_vector(aloc);
vector[K-1] bloc_vec = to_vector(bloc);
vector[3] scalar_vec = [asd, bsd, kappa]';
vector[2*K + 1] phi = append_row(aloc_vec, append_row(bloc_vec, scalar_vec));

// Call map_rect with the correct arguments
target += sum(map_rect(map_rect_likelihood,phi, param_shards, shared_data_r, shared_data_i));

}
generated quantities {
// Declare raw_alpha and raw_beta
vector[K-1] raw_alpha[L];
matrix[K-1, B] raw_beta[L];
for (l in 1:L) {
for (k in 1:(K-1)) {
raw_alpha[l, k] = aloc[k] + asd * alpha_noncentered[l, k]; // Reparameterized alpha
for (b in 1:B) {
raw_beta[l, k, b] = bloc[k] + bsd * beta_noncentered[l, k, b]; // Reparameterized beta
}
}
}
}
71 changes: 71 additions & 0 deletions src-model/HMLR_FDSMN.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
functions {
real partial_sum_lpmf(int[] slice_y, int start, int end, int[] ll, matrix x,
row_vector weights, vector[] raw_alpha, matrix[] raw_beta,
int K, int B) {
real acc = 0;
for (n in 1:size(slice_y)) {
vector[K] alpha = append_row(0, raw_alpha[ll[start + n - 1]]);
matrix[K, B] beta;
for (b in 1:B) {
beta[:, b] = append_row(0, raw_beta[ll[start + n - 1], :, b]);
}
acc += weights[start + n - 1] * categorical_logit_lpmf(slice_y[n] | alpha + beta * x[:, start + n - 1]);
}
return acc;
}
}

data {
int<lower=0> N; // the number of samples
int<lower=1> L; // number of locations
int<lower=1> K; // number of clades
int<lower=1> B; // the number of df
array[N] int<lower=1, upper=K> y; // the clades
array[N] int<lower=1, upper=L> ll; // locations
matrix[B, N] x; // the spline over the days of the samples
row_vector[N] weights; // number of seq per location
}

parameters {
real<lower=0> bsd; // prior sd for betas
real<lower=0> asd; // prior sd for alphas
array[K-1] real aloc; // prior means for the alpha's
array[K-1] real bloc; // prior means for betas

// Non-centered parameterization
vector[K-1] alpha_nc[L]; // raw alpha's
matrix[K-1, B] beta_nc[L]; // raw betas
}

transformed parameters {
// Apply non-centered parameterization
vector[K-1] raw_alpha[L];
matrix[K-1, B] raw_beta[L];
for (l in 1:L) {
for (k in 1:(K-1)) {
raw_alpha[l, k] = aloc[k] + asd * alpha_nc[l, k]; // Reparameterized alpha
for (b in 1:B) {
raw_beta[l, k, b] = bloc[k] + bsd * beta_nc[l, k, b]; // Reparameterized beta
}
}
}
}

model {
// Priors
bsd ~ normal(1, 400); // prior for the sd
bloc ~ normal(0, 400); // priors for the betas
asd ~ normal(1, 400); // prior for the alpha sd
aloc ~ normal(0, 400); // prior for the alpha means

// Priors on raw parameters (standard normal)
for (l in 1:L) {
alpha_nc[l] ~ normal(0, 1); // standard normal for alpha_nc
for (k in 1:(K-1)) {
beta_nc[l, k] ~ normal(0, 1); // standard normal for beta_nc
}
}

// Likelihood using reduce_sum
target += reduce_sum(partial_sum_lpmf, y, 1, ll, x, weights, raw_alpha, raw_beta, K, B);
}
Loading

0 comments on commit 5005847

Please sign in to comment.