Skip to content

Commit

Permalink
Repo Restructuring (#10)
Browse files: browse the repository at this point in the history
* Clean models

* Move to models

* Remove tmp modules

* Move shared to modules

* Fix modules module

* Extract a base class from DirectPred class
  • Loading branch information
trsvchn authored Nov 13, 2023
1 parent b80c972 commit 5091dc0
Show file tree
Hide file tree
Showing 11 changed files with 1,380 additions and 1,158 deletions.
7 changes: 2 additions & 5 deletions flexynesis/__init__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,14 +50,11 @@
Bora Uyar, [email protected]
"""

from .models_shared import *
from .modules import *
from .data import *
from .main import *
from .model_DirectPred import *
from .model_SVAE import *
from .model_TripletEncoder import *
from .models import *
from .feature_selection import *
from .data_augmentation import *
from .utils import *
from .config import *
from . import models
410 changes: 0 additions & 410 deletions flexynesis/model_SVAE.py

This file was deleted.

368 changes: 0 additions & 368 deletions flexynesis/model_TripletEncoder.py

This file was deleted.

4 changes: 2 additions & 2 deletions flexynesis/models/__init__.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
from .direct_pred import DirectPred
from .direct_pred_cnn import DirectPredCNN
from .supervised_vae import SupervisedVAE
from .supervised_vae import supervised_vae
from .triplet_encoder import MultiTripletNetwork

__all__ = ["DirectPred", "DirectPredCNN", "SupervisedVAE", "MultiTripletNetwork"]
__all__ = ["DirectPred", "DirectPredCNN", "supervised_vae", "MultiTripletNetwork"]
215 changes: 77 additions & 138 deletions flexynesis/model_DirectPred.py → flexynesis/models/base_direct_pred.py
Original file line number | Diff line number | Diff line change
@@ -1,68 +1,37 @@
import torch
from torch import nn
from torch.nn import functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data import DataLoader, random_split

import pandas as pd
import numpy as np
import os, argparse
from scipy import stats
from functools import reduce

from captum.attr import IntegratedGradients
import pandas as pd

from .models_shared import *
import pytorch_lightning as pl

from captum.attr import IntegratedGradients


class DirectPred(pl.LightningModule):
def __init__(self, config, dataset, target_variables, batch_variables = None, val_size = 0.2):
super(DirectPred, self).__init__()
class BaseDirectPred(pl.LightningModule):
def __init__(self, config, dataset, target_variables, batch_variables=None, val_size=0.2):
super().__init__()
self.config = config
self.dataset = dataset
self.target_variables = target_variables
self.batch_variables = batch_variables
self.variables = target_variables + batch_variables if batch_variables else target_variables
self.val_size = val_size
self.dat_train, self.dat_val = self.prepare_data()
self.feature_importances = {}
self.feature_importances = {}
# Instantiate layers.
self._init_encoders()
self._init_output_layers()

def _init_encoders(self):
layers = list(self.dataset.dat.keys())
input_dims = [len(self.dataset.features[layers[i]]) for i in range(len(layers))]
self.encoders = nn.ModuleList([
MLP(input_dim=input_dims[i], hidden_dim=self.config["hidden_dim"], output_dim=self.config["latent_dim"])
for i in range(len(layers))
])
raise NotImplementedError

def _init_output_layers(self):
layers = list(self.dataset.dat.keys())
self.MLPs = nn.ModuleDict() # using ModuleDict to store multiple MLPs
for var in self.target_variables:
if self.dataset.variable_types[var] == "numerical":
num_class = 1
else:
num_class = len(np.unique(self.dataset.ann[var]))
self.MLPs[var] = MLP(
input_dim=self.config["latent_dim"] * len(layers),
hidden_dim=self.config["hidden_dim"],
output_dim=num_class,
)
raise NotImplementedError

def forward(self, x_list):
"""
Forward pass of the DirectPred model.
Args:
x_list (list of torch.Tensor): A list of input matrices (omics layers), one for each layer.
Returns:
dict: A dictionary where each key-value pair corresponds to the target variable name and its predicted output respectively.
"""
embeddings_list = []
# Process each input matrix with its corresponding Encoder
for i, x in enumerate(x_list):
Expand All @@ -72,30 +41,22 @@ def forward(self, x_list):
outputs = {}
for var, mlp in self.MLPs.items():
outputs[var] = mlp(embeddings_concat)
return outputs


def configure_optimizers(self):
"""
Configure the optimizer for the DirectPred model.
Returns:
torch.optim.Optimizer: The configured optimizer.
"""
return outputs

optimizer = torch.optim.Adam(self.parameters(), lr=self.config['lr'])
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.config["lr"])
return optimizer

def compute_loss(self, var, y, y_hat):
if self.dataset.variable_types[var] == 'numerical':
if self.dataset.variable_types[var] == "numerical":
# Ignore instances with missing labels for numerical variables
valid_indices = ~torch.isnan(y)
if valid_indices.sum() > 0: # only calculate loss if there are valid targets
y_hat = y_hat[valid_indices]
y = y[valid_indices]
loss = F.mse_loss(torch.flatten(y_hat), y.float())
else:
loss = 0 # if no valid labels, set loss to 0
loss = 0 # if no valid labels, set loss to 0
else:
# Ignore instances with missing labels for categorical variables
# Assuming that missing values were encoded as -1
Expand All @@ -104,21 +65,12 @@ def compute_loss(self, var, y, y_hat):
y_hat = y_hat[valid_indices]
y = y[valid_indices]
loss = F.cross_entropy(y_hat, y.long())
else:
else:
loss = 0
return loss

def training_step(self, train_batch, batch_idx):
"""
Perform a single training step.
Args:
train_batch (tuple): A tuple containing the input data and labels for the current batch.
batch_idx (int): The index of the current batch.
Returns:
torch.Tensor: The total loss for the current training step.
"""

dat, y_dict = train_batch
dat, y_dict = train_batch
layers = dat.keys()
x_list = [dat[x] for x in layers]
outputs = self.forward(x_list)
Expand All @@ -129,23 +81,12 @@ def training_step(self, train_batch, batch_idx):
loss = self.compute_loss(var, y, y_hat)
losses[var] = loss
total_loss = sum(losses.values())
losses['train_loss'] = total_loss
losses["train_loss"] = total_loss
self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True)
return total_loss


def validation_step(self, val_batch, batch_idx):
"""
Perform a single validation step.
Args:
val_batch (tuple): A tuple containing the input data and labels for the current batch.
batch_idx (int): The index of the current batch.
Returns:
torch.Tensor: The total loss for the current validation step.
"""
dat, y_dict = val_batch
dat, y_dict = val_batch
layers = dat.keys()
x_list = [dat[x] for x in layers]
outputs = self.forward(x_list)
Expand All @@ -156,34 +97,32 @@ def validation_step(self, val_batch, batch_idx):
loss = self.compute_loss(var, y, y_hat)
losses[var] = loss
total_loss = sum(losses.values())
losses['val_loss'] = total_loss
losses["val_loss"] = total_loss
self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True)
return total_loss


def prepare_data(self):
lt = int(len(self.dataset)*(1-self.val_size))
lv = len(self.dataset)-lt
dat_train, dat_val = random_split(self.dataset, [lt, lv],
generator=torch.Generator().manual_seed(42))
lt = int(len(self.dataset) * (1 - self.val_size))
lv = len(self.dataset) - lt
dat_train, dat_val = random_split(self.dataset, [lt, lv], generator=torch.Generator().manual_seed(42))
return dat_train, dat_val

def train_dataloader(self):
return DataLoader(self.dat_train, batch_size=int(self.config['batch_size']), num_workers=0, pin_memory=True, shuffle=True, drop_last=True)
return DataLoader(
self.dat_train,
batch_size=int(self.config["batch_size"]),
num_workers=0,
pin_memory=True,
shuffle=True,
drop_last=True,
)

def val_dataloader(self):
return DataLoader(self.dat_val, batch_size=int(self.config['batch_size']), num_workers=0, pin_memory=True, shuffle=False)

def predict(self, dataset):
"""
Evaluate the DirectPred model on a given dataset.
Args:
dataset: The dataset to evaluate the model on.
return DataLoader(
self.dat_val, batch_size=int(self.config["batch_size"]), num_workers=0, pin_memory=True, shuffle=False
)

Returns:
A dictionary where each key is a target variable and the corresponding value is the predicted output for that variable.
"""
def predict(self, dataset):
self.eval()
layers = dataset.dat.keys()
x_list = [dataset.dat[x] for x in layers]
Expand All @@ -192,24 +131,13 @@ def predict(self, dataset):
predictions = {}
for var in self.target_variables:
y_pred = outputs[var].detach().numpy()
if self.dataset.variable_types[var] == 'categorical':
if self.dataset.variable_types[var] == "categorical":
predictions[var] = np.argmax(y_pred, axis=1)
else:
predictions[var] = y_pred
return predictions

def transform(self, dataset):
"""
Transform the input data into a lower-dimensional space using the trained encoders.
Args:
dataset: The input dataset containing the omics data.
Returns:
pd.DataFrame: A dataframe of embeddings where the row indices are
dataset.samples and the column names are created by appending
the substring "E" to each dimension index.
"""
self.eval()
embeddings_list = []
# Process each input matrix with its corresponding Encoder
Expand All @@ -218,36 +146,29 @@ def transform(self, dataset):
embeddings_concat = torch.cat(embeddings_list, dim=1)

# Converting tensor to numpy array and then to DataFrame
embeddings_df = pd.DataFrame(embeddings_concat.detach().numpy(),
index=dataset.samples,
columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
embeddings_df = pd.DataFrame(
embeddings_concat.detach().numpy(),
index=dataset.samples,
columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])],
)
return embeddings_df
# Adaptor forward function for captum integrated gradients.

# Adaptor forward function for captum integrated gradients.
def forward_target(self, *args):
input_data = list(args[:-2]) # one or more tensors (one per omics layer)
target_var = args[-2] # target variable of interest
steps = args[-1] # number of steps for IntegratedGradients().attribute
steps = args[-1] # number of steps for IntegratedGradients().attribute
outputs_list = []
for i in range(steps):
# get list of tensors for each step into a list of tensors
x_step = [input_data[j][i] for j in range(len(input_data))]
out = self.forward(x_step)
outputs_list.append(out[target_var])
return torch.cat(outputs_list, dim = 0)

def compute_feature_importance(self, target_var, steps = 5):
"""
Compute the feature importance.
return torch.cat(outputs_list, dim=0)

Args:
input_data (torch.Tensor): The input data to compute the feature importance for.
target_var (str): The target variable to compute the feature importance for.
Returns:
attributions (list of torch.Tensor): The feature importances for each class.
"""
def compute_feature_importance(self, target_var, steps=5):
x_list = [self.dataset.dat[x] for x in self.dataset.dat.keys()]

# Initialize the Integrated Gradients method
ig = IntegratedGradients(self.forward_target)

Expand All @@ -257,7 +178,7 @@ def compute_feature_importance(self, target_var, steps = 5):
baseline = tuple([torch.zeros_like(data) for data in input_data])

# Get the number of classes for the target variable
if self.dataset.variable_types[target_var] == 'numerical':
if self.dataset.variable_types[target_var] == "numerical":
num_class = 1
else:
num_class = len(np.unique(self.dataset.ann[target_var]))
Expand All @@ -266,27 +187,45 @@ def compute_feature_importance(self, target_var, steps = 5):
attributions = []
if num_class > 1:
for target_class in range(num_class):
attributions.append(ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), target=target_class, n_steps=steps))
attributions.append(
ig.attribute(
input_data,
baseline,
additional_forward_args=(target_var, steps),
target=target_class,
n_steps=steps,
)
)
else:
attributions.append(ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), n_steps=steps))
attributions.append(
ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), n_steps=steps)
)

# summarize feature importances
# Compute absolute attributions
abs_attr = [[torch.abs(a) for a in attr_class] for attr_class in attributions]
# average over samples
# average over samples
imp = [[a.mean(dim=1) for a in attr_class] for attr_class in abs_attr]

# combine into a single data frame
# combine into a single data frame
df_list = []
layers = list(self.dataset.dat.keys())
for i in range(num_class):
for j in range(len(layers)):
features = self.dataset.features[layers[j]]
importances = imp[i][j][0].detach().numpy()
df_list.append(pd.DataFrame({'target_variable': target_var, 'target_class': i, 'layer': layers[j], 'name': features, 'importance': importances}))
df_imp = pd.concat(df_list, ignore_index = True)

df_list.append(
pd.DataFrame(
{
"target_variable": target_var,
"target_class": i,
"layer": layers[j],
"name": features,
"importance": importances,
}
)
)
df_imp = pd.concat(df_list, ignore_index=True)

# save the computed scores in the model
self.feature_importances[target_var] = df_imp


Loading

0 comments on commit 5091dc0

Please sign in to comment.