From 766badc86c875799b13bf274797a8e2d16dc48e2 Mon Sep 17 00:00:00 2001
From: Taras Savchyn
Date: Mon, 13 Nov 2023 00:17:34 +0100
Subject: [PATCH 1/6] Clean models

---
 flexynesis/models/direct_pred.py     | 1 -
 flexynesis/models/supervised_vae.py  | 1 -
 flexynesis/models/triplet_encoder.py | 1 -
 3 files changed, 3 deletions(-)
 delete mode 100644 flexynesis/models/direct_pred.py
 delete mode 100644 flexynesis/models/supervised_vae.py
 delete mode 100644 flexynesis/models/triplet_encoder.py

diff --git a/flexynesis/models/direct_pred.py b/flexynesis/models/direct_pred.py
deleted file mode 100644
index 79ad19b..0000000
--- a/flexynesis/models/direct_pred.py
+++ /dev/null
@@ -1 +0,0 @@
-from ..model_DirectPred import DirectPred
diff --git a/flexynesis/models/supervised_vae.py b/flexynesis/models/supervised_vae.py
deleted file mode 100644
index dd93f28..0000000
--- a/flexynesis/models/supervised_vae.py
+++ /dev/null
@@ -1 +0,0 @@
-from ..model_SVAE import supervised_vae as SupervisedVAE
diff --git a/flexynesis/models/triplet_encoder.py b/flexynesis/models/triplet_encoder.py
deleted file mode 100644
index af5dc8e..0000000
--- a/flexynesis/models/triplet_encoder.py
+++ /dev/null
@@ -1 +0,0 @@
-from ..model_TripletEncoder import MultiTripletNetwork

From 2852eaa2fffcd0c4dfdb7797c39c6f4749bb6612 Mon Sep 17 00:00:00 2001
From: Taras Savchyn
Date: Mon, 13 Nov 2023 00:33:51 +0100
Subject: [PATCH 2/6] Move to models

---
 flexynesis/models/__init__.py                              | 4 ++--
 flexynesis/{model_DirectPred.py => models/direct_pred.py}  | 0
 flexynesis/{model_SVAE.py => models/supervised_vae.py}     | 0
 .../{model_TripletEncoder.py => models/triplet_encoder.py} | 0
 4 files changed, 2 insertions(+), 2 deletions(-)
 rename flexynesis/{model_DirectPred.py => models/direct_pred.py} (100%)
 rename flexynesis/{model_SVAE.py => models/supervised_vae.py} (100%)
 rename flexynesis/{model_TripletEncoder.py => models/triplet_encoder.py} (100%)

diff --git a/flexynesis/models/__init__.py b/flexynesis/models/__init__.py
index 2eda149..a4d3dfb 100644
--- a/flexynesis/models/__init__.py
+++ b/flexynesis/models/__init__.py
@@ -1,6 +1,6 @@
 from .direct_pred import DirectPred
 from .direct_pred_cnn import DirectPredCNN
-from .supervised_vae import SupervisedVAE
+from .supervised_vae import supervised_vae
 from .triplet_encoder import MultiTripletNetwork
 
-__all__ = ["DirectPred", "DirectPredCNN", "SupervisedVAE", "MultiTripletNetwork"]
+__all__ = ["DirectPred", "DirectPredCNN", "supervised_vae", "MultiTripletNetwork"]
diff --git a/flexynesis/model_DirectPred.py b/flexynesis/models/direct_pred.py
similarity index 100%
rename from flexynesis/model_DirectPred.py
rename to flexynesis/models/direct_pred.py
diff --git a/flexynesis/model_SVAE.py b/flexynesis/models/supervised_vae.py
similarity index 100%
rename from flexynesis/model_SVAE.py
rename to flexynesis/models/supervised_vae.py
diff --git a/flexynesis/model_TripletEncoder.py b/flexynesis/models/triplet_encoder.py
similarity index 100%
rename from flexynesis/model_TripletEncoder.py
rename to flexynesis/models/triplet_encoder.py

From 8df4643150bdae92b24aebb87221cde0287b471a Mon Sep 17 00:00:00 2001
From: Taras Savchyn
Date: Mon, 13 Nov 2023 00:35:57 +0100
Subject: [PATCH 3/6] Remove tmp modules

---
 flexynesis/modules.py | 34 ----------------------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 flexynesis/modules.py

diff --git a/flexynesis/modules.py b/flexynesis/modules.py
deleted file mode 100644
index d5c0169..0000000
--- a/flexynesis/modules.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-from torch import nn
-
-from .models_shared import Encoder, Decoder, MLP, EmbeddingNetwork, Classifier
-from .model_TripletEncoder import MultiEmbeddingNetwork
-
-__all__ = ["Encoder", "Decoder", "MLP", "EmbeddingNetwork", "MultiEmbeddingNetwork", "Classifier", "CNN"]
-
-
-class CNN(nn.Module):
-    def __init__(self, input_dim, hidden_dim, output_dim):
-        super().__init__()
-
-        self.layer_1 = nn.Conv1d(input_dim, hidden_dim, kernel_size=1)
-        self.batchnorm = nn.BatchNorm1d(hidden_dim)
-        self.relu = nn.ReLU()
-        self.dropout = nn.Dropout(p=0.1)
-        self.layer_out = nn.Conv1d(hidden_dim, output_dim, kernel_size=1)
-
-    def forward(self, x):
-        """(N, C) -> (N, C, L) -> (N, C).
-        """
-        x = x.unsqueeze(-1)
-
-        x = self.layer_1(x)
-        # TODO: BatchNorm1d fails for batch size 1 at train time
-        x = self.batchnorm(x)
-        x = self.relu(x)
-        x = self.dropout(x)
-
-        x = self.layer_out(x)
-
-        x = x.squeeze(-1)
-        return x

From 0eca186966734f565806fbf624a2c6b9d9c49357 Mon Sep 17 00:00:00 2001
From: Taras Savchyn
Date: Mon, 13 Nov 2023 01:01:22 +0100
Subject: [PATCH 4/6] Move shared code to modules

---
 flexynesis/models/direct_pred.py            | 2 +-
 flexynesis/models/supervised_vae.py         | 2 +-
 flexynesis/models/triplet_encoder.py        | 4 ++--
 flexynesis/{models_shared.py => modules.py} | 0
 4 files changed, 4 insertions(+), 4 deletions(-)
 rename flexynesis/{models_shared.py => modules.py} (100%)

diff --git a/flexynesis/models/direct_pred.py b/flexynesis/models/direct_pred.py
index 88926fd..431c58b 100644
--- a/flexynesis/models/direct_pred.py
+++ b/flexynesis/models/direct_pred.py
@@ -12,7 +12,7 @@
 
 from captum.attr import IntegratedGradients
 
-from .models_shared import *
+from ..modules import *
 
 
 
diff --git a/flexynesis/models/supervised_vae.py b/flexynesis/models/supervised_vae.py
index 3ef9160..e7f9441 100644
--- a/flexynesis/models/supervised_vae.py
+++ b/flexynesis/models/supervised_vae.py
@@ -12,7 +12,7 @@
 
 from captum.attr import IntegratedGradients
 
-from .models_shared import *
+from ..modules import *
 
 # Supervised Variational Auto-encoder that can train one or more layers of omics datasets
 # num_layers: number of omics layers in the input
diff --git a/flexynesis/models/triplet_encoder.py b/flexynesis/models/triplet_encoder.py
index 7a6bb77..f342ef9 100644
--- a/flexynesis/models/triplet_encoder.py
+++ b/flexynesis/models/triplet_encoder.py
@@ -9,8 +9,8 @@
 
 import pytorch_lightning as pl
 
-from .models_shared import *
-from .data import TripletMultiOmicDataset
+from ..modules import *
+from ..data import TripletMultiOmicDataset
 
 from captum.attr import IntegratedGradients
 
diff --git a/flexynesis/models_shared.py b/flexynesis/modules.py
similarity index 100%
rename from flexynesis/models_shared.py
rename to flexynesis/modules.py

From 5d2b568f60ba73549728ee0b1c7fd8b0d5697e3f Mon Sep 17 00:00:00 2001
From: Taras Savchyn
Date: Mon, 13 Nov 2023 01:22:16 +0100
Subject: [PATCH 5/6] Fix modules module

---
 flexynesis/__init__.py |  7 ++-----
 flexynesis/modules.py  | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/flexynesis/__init__.py b/flexynesis/__init__.py
index a08068d..eb92695 100644
--- a/flexynesis/__init__.py
+++ b/flexynesis/__init__.py
@@ -50,14 +50,11 @@
 Bora Uyar, bora.uyar@mdc-berlin.de
 """
 
-from .models_shared import *
+from .modules import *
 from .data import *
 from .main import *
-from .model_DirectPred import *
-from .model_SVAE import *
-from .model_TripletEncoder import *
+from .models import *
 from .feature_selection import *
 from .data_augmentation import *
 from .utils import *
 from .config import *
-from . import models
diff --git a/flexynesis/modules.py b/flexynesis/modules.py
index 900887e..ee0ff49 100644
--- a/flexynesis/modules.py
+++ b/flexynesis/modules.py
@@ -3,6 +3,8 @@
 import torch
 from torch import nn
 
+__all__ = ["Encoder", "Decoder", "MLP", "EmbeddingNetwork", "Classifier", "CNN"]
+
 
 class Encoder(nn.Module):
     """
@@ -220,4 +222,31 @@ def forward(self, x):
         for layer in self.layers[:-1]:
             x = torch.relu(layer(x))
         x = self.layers[-1](x)
-        return x
\ No newline at end of file
+        return x
+
+
+class CNN(nn.Module):
+    def __init__(self, input_dim, hidden_dim, output_dim):
+        super().__init__()
+
+        self.layer_1 = nn.Conv1d(input_dim, hidden_dim, kernel_size=1)
+        self.batchnorm = nn.BatchNorm1d(hidden_dim)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.1)
+        self.layer_out = nn.Conv1d(hidden_dim, output_dim, kernel_size=1)
+
+    def forward(self, x):
+        """(N, C) -> (N, C, L) -> (N, C).
+        """
+        x = x.unsqueeze(-1)
+
+        x = self.layer_1(x)
+        # TODO: BatchNorm1d fails for batch size 1 at train time
+        x = self.batchnorm(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+
+        x = self.layer_out(x)
+
+        x = x.squeeze(-1)
+        return x

From ebd753c92e660cf4865711c684c7f379f6eb1000 Mon Sep 17 00:00:00 2001
From: Taras Savchyn
Date: Mon, 13 Nov 2023 01:41:58 +0100
Subject: [PATCH 6/6] Extract a base class from DirectPred class

---
 flexynesis/models/base_direct_pred.py | 231 ++++++++++++++++++++++++++
 flexynesis/models/direct_pred_cnn.py  |  14 +-
 2 files changed, 239 insertions(+), 6 deletions(-)
 create mode 100644 flexynesis/models/base_direct_pred.py

diff --git a/flexynesis/models/base_direct_pred.py b/flexynesis/models/base_direct_pred.py
new file mode 100644
index 0000000..dd72a0c
--- /dev/null
+++ b/flexynesis/models/base_direct_pred.py
@@ -0,0 +1,231 @@
+import torch
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+
+import numpy as np
+import pandas as pd
+
+import pytorch_lightning as pl
+
+from captum.attr import IntegratedGradients
+
+
+class BaseDirectPred(pl.LightningModule):
+    def __init__(self, config, dataset, target_variables, batch_variables=None, val_size=0.2):
+        super().__init__()
+        self.config = config
+        self.dataset = dataset
+        self.target_variables = target_variables
+        self.batch_variables = batch_variables
+        self.variables = target_variables + batch_variables if batch_variables else target_variables
+        self.val_size = val_size
+        self.dat_train, self.dat_val = self.prepare_data()
+        self.feature_importances = {}
+        # Instantiate layers.
+        self._init_encoders()
+        self._init_output_layers()
+
+    def _init_encoders(self):
+        raise NotImplementedError
+
+    def _init_output_layers(self):
+        raise NotImplementedError
+
+    def forward(self, x_list):
+        embeddings_list = []
+        # Process each input matrix with its corresponding Encoder
+        for i, x in enumerate(x_list):
+            embeddings_list.append(self.encoders[i](x))
+        embeddings_concat = torch.cat(embeddings_list, dim=1)
+
+        outputs = {}
+        for var, mlp in self.MLPs.items():
+            outputs[var] = mlp(embeddings_concat)
+        return outputs
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.config["lr"])
+        return optimizer
+
+    def compute_loss(self, var, y, y_hat):
+        if self.dataset.variable_types[var] == "numerical":
+            # Ignore instances with missing labels for numerical variables
+            valid_indices = ~torch.isnan(y)
+            if valid_indices.sum() > 0:  # only calculate loss if there are valid targets
+                y_hat = y_hat[valid_indices]
+                y = y[valid_indices]
+                loss = F.mse_loss(torch.flatten(y_hat), y.float())
+            else:
+                loss = 0  # if no valid labels, set loss to 0
+        else:
+            # Ignore instances with missing labels for categorical variables
+            # Assuming that missing values were encoded as -1
+            valid_indices = (y != -1) & (~torch.isnan(y))
+            if valid_indices.sum() > 0:  # only calculate loss if there are valid targets
+                y_hat = y_hat[valid_indices]
+                y = y[valid_indices]
+                loss = F.cross_entropy(y_hat, y.long())
+            else:
+                loss = 0
+        return loss
+
+    def training_step(self, train_batch, batch_idx):
+        dat, y_dict = train_batch
+        layers = dat.keys()
+        x_list = [dat[x] for x in layers]
+        outputs = self.forward(x_list)
+        losses = {}
+        for var in self.target_variables:
+            y_hat = outputs[var]
+            y = y_dict[var]
+            loss = self.compute_loss(var, y, y_hat)
+            losses[var] = loss
+        total_loss = sum(losses.values())
+        losses["train_loss"] = total_loss
+        self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True)
+        return total_loss
+
+    def validation_step(self, val_batch, batch_idx):
+        dat, y_dict = val_batch
+        layers = dat.keys()
+        x_list = [dat[x] for x in layers]
+        outputs = self.forward(x_list)
+        losses = {}
+        for var in self.target_variables:
+            y_hat = outputs[var]
+            y = y_dict[var]
+            loss = self.compute_loss(var, y, y_hat)
+            losses[var] = loss
+        total_loss = sum(losses.values())
+        losses["val_loss"] = total_loss
+        self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True)
+        return total_loss
+
+    def prepare_data(self):
+        lt = int(len(self.dataset) * (1 - self.val_size))
+        lv = len(self.dataset) - lt
+        dat_train, dat_val = random_split(self.dataset, [lt, lv], generator=torch.Generator().manual_seed(42))
+        return dat_train, dat_val
+
+    def train_dataloader(self):
+        return DataLoader(
+            self.dat_train,
+            batch_size=int(self.config["batch_size"]),
+            num_workers=0,
+            pin_memory=True,
+            shuffle=True,
+            drop_last=True,
+        )
+
+    def val_dataloader(self):
+        return DataLoader(
+            self.dat_val, batch_size=int(self.config["batch_size"]), num_workers=0, pin_memory=True, shuffle=False
+        )
+
+    def predict(self, dataset):
+        self.eval()
+        layers = dataset.dat.keys()
+        x_list = [dataset.dat[x] for x in layers]
+        outputs = self.forward(x_list)
+
+        predictions = {}
+        for var in self.target_variables:
+            y_pred = outputs[var].detach().numpy()
+            if self.dataset.variable_types[var] == "categorical":
+                predictions[var] = np.argmax(y_pred, axis=1)
+            else:
+                predictions[var] = y_pred
+        return predictions
+
+    def transform(self, dataset):
+        self.eval()
+        embeddings_list = []
+        # Process each input matrix with its corresponding Encoder
+        for i, x in enumerate(dataset.dat.values()):
+            embeddings_list.append(self.encoders[i](x))
+        embeddings_concat = torch.cat(embeddings_list, dim=1)
+
+        # Converting tensor to numpy array and then to DataFrame
+        embeddings_df = pd.DataFrame(
+            embeddings_concat.detach().numpy(),
+            index=dataset.samples,
+            columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])],
+        )
+        return embeddings_df
+
+    # Adaptor forward function for captum integrated gradients.
+    def forward_target(self, *args):
+        input_data = list(args[:-2])  # one or more tensors (one per omics layer)
+        target_var = args[-2]  # target variable of interest
+        steps = args[-1]  # number of steps for IntegratedGradients().attribute
+        outputs_list = []
+        for i in range(steps):
+            # get list of tensors for each step into a list of tensors
+            x_step = [input_data[j][i] for j in range(len(input_data))]
+            out = self.forward(x_step)
+            outputs_list.append(out[target_var])
+        return torch.cat(outputs_list, dim=0)
+
+    def compute_feature_importance(self, target_var, steps=5):
+        x_list = [self.dataset.dat[x] for x in self.dataset.dat.keys()]
+
+        # Initialize the Integrated Gradients method
+        ig = IntegratedGradients(self.forward_target)
+
+        input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
+
+        # Define a baseline (you might need to adjust this depending on your actual data)
+        baseline = tuple([torch.zeros_like(data) for data in input_data])
+
+        # Get the number of classes for the target variable
+        if self.dataset.variable_types[target_var] == "numerical":
+            num_class = 1
+        else:
+            num_class = len(np.unique(self.dataset.ann[target_var]))
+
+        # Compute the feature importance for each class
+        attributions = []
+        if num_class > 1:
+            for target_class in range(num_class):
+                attributions.append(
+                    ig.attribute(
+                        input_data,
+                        baseline,
+                        additional_forward_args=(target_var, steps),
+                        target=target_class,
+                        n_steps=steps,
+                    )
+                )
+        else:
+            attributions.append(
+                ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), n_steps=steps)
+            )
+
+        # summarize feature importances
+        # Compute absolute attributions
+        abs_attr = [[torch.abs(a) for a in attr_class] for attr_class in attributions]
+        # average over samples
+        imp = [[a.mean(dim=1) for a in attr_class] for attr_class in abs_attr]
+
+        # combine into a single data frame
+        df_list = []
+        layers = list(self.dataset.dat.keys())
+        for i in range(num_class):
+            for j in range(len(layers)):
+                features = self.dataset.features[layers[j]]
+                importances = imp[i][j][0].detach().numpy()
+                df_list.append(
+                    pd.DataFrame(
+                        {
+                            "target_variable": target_var,
+                            "target_class": i,
+                            "layer": layers[j],
+                            "name": features,
+                            "importance": importances,
+                        }
+                    )
+                )
+        df_imp = pd.concat(df_list, ignore_index=True)
+
+        # save the computed scores in the model
+        self.feature_importances[target_var] = df_imp
diff --git a/flexynesis/models/direct_pred_cnn.py b/flexynesis/models/direct_pred_cnn.py
index 6aeb9f5..8ae9846 100644
--- a/flexynesis/models/direct_pred_cnn.py
+++ b/flexynesis/models/direct_pred_cnn.py
@@ -1,18 +1,20 @@
 import numpy as np
 from torch import nn
 
-from .direct_pred import DirectPred
 from ..modules import CNN
+from .base_direct_pred import BaseDirectPred
 
 
-class DirectPredCNN(DirectPred):
+class DirectPredCNN(BaseDirectPred):
     def _init_encoders(self):
         layers = list(self.dataset.dat.keys())
         input_dims = [len(self.dataset.features[layers[i]]) for i in range(len(layers))]
-        self.encoders = nn.ModuleList([
-            CNN(input_dim=input_dims[i], hidden_dim=self.config["hidden_dim"], output_dim=self.config["latent_dim"])
-            for i in range(len(layers))
-        ])
+        self.encoders = nn.ModuleList(
+            [
+                CNN(input_dim=input_dims[i], hidden_dim=self.config["hidden_dim"], output_dim=self.config["latent_dim"])
+                for i in range(len(layers))
+            ]
+        )
 
     def _init_output_layers(self):
         layers = list(self.dataset.dat.keys())
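
A quick way to sanity-check PATCH 5 and PATCH 6 together is to exercise the new CNN encoder directly. The sketch below assumes the patched package is importable; the dimensions are made up, and the batch size is kept above 1 because BatchNorm1d rejects single-sample batches in training mode:

    # Minimal smoke test for the CNN added to flexynesis/modules.py (hypothetical dims).
    import torch
    from flexynesis.modules import CNN

    encoder = CNN(input_dim=2000, hidden_dim=128, output_dim=64)
    x = torch.randn(16, 2000)  # (N, C), one row per sample, as the forward docstring expects
    z = encoder(x)             # unsqueezed to (N, C, 1) internally, then squeezed back
    assert z.shape == (16, 64)

DirectPredCNN builds one such encoder per omics layer inside _init_encoders(), so this check covers the only piece of DirectPredCNN that is not inherited from BaseDirectPred.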