Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repo Restructuring #10

Merged
merged 6 commits into from
Nov 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions flexynesis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,11 @@
Bora Uyar, [email protected]
"""

from .models_shared import *
from .modules import *
from .data import *
from .main import *
from .model_DirectPred import *
from .model_SVAE import *
from .model_TripletEncoder import *
from .models import *
from .feature_selection import *
from .data_augmentation import *
from .utils import *
from .config import *
from . import models
410 changes: 0 additions & 410 deletions flexynesis/model_SVAE.py

This file was deleted.

368 changes: 0 additions & 368 deletions flexynesis/model_TripletEncoder.py

This file was deleted.

4 changes: 2 additions & 2 deletions flexynesis/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .direct_pred import DirectPred
from .direct_pred_cnn import DirectPredCNN
from .supervised_vae import SupervisedVAE
from .supervised_vae import supervised_vae
from .triplet_encoder import MultiTripletNetwork

__all__ = ["DirectPred", "DirectPredCNN", "SupervisedVAE", "MultiTripletNetwork"]
__all__ = ["DirectPred", "DirectPredCNN", "supervised_vae", "MultiTripletNetwork"]
215 changes: 77 additions & 138 deletions flexynesis/model_DirectPred.py → flexynesis/models/base_direct_pred.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,37 @@
import torch
from torch import nn
from torch.nn import functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data import DataLoader, random_split

import pandas as pd
import numpy as np
import os, argparse
from scipy import stats
from functools import reduce

from captum.attr import IntegratedGradients
import pandas as pd

from .models_shared import *
import pytorch_lightning as pl

from captum.attr import IntegratedGradients


class DirectPred(pl.LightningModule):
def __init__(self, config, dataset, target_variables, batch_variables = None, val_size = 0.2):
super(DirectPred, self).__init__()
class BaseDirectPred(pl.LightningModule):
def __init__(self, config, dataset, target_variables, batch_variables=None, val_size=0.2):
    """Store the run settings, split the dataset and build the layers.

    Args:
        config: Mapping of hyperparameters. Other methods of this class
            read ``config["lr"]`` and ``config["batch_size"]``.
        dataset: Dataset that ``prepare_data`` splits into training and
            validation subsets.
        target_variables: Names of the variables to predict (combined
            with ``batch_variables`` using ``+``).
        batch_variables: Optional batch variable names. When given and
            non-empty they are appended to ``target_variables`` to form
            ``self.variables``.
        val_size: Fraction of ``dataset`` held out for validation.
    """
    super().__init__()
    self.config = config
    self.dataset = dataset
    self.target_variables = target_variables
    self.batch_variables = batch_variables
    self.variables = target_variables + batch_variables if batch_variables else target_variables
    self.val_size = val_size
    self.dat_train, self.dat_val = self.prepare_data()
    # Filled per target variable by compute_feature_importance().
    self.feature_importances = {}
    # Instantiate layers. Both hooks raise NotImplementedError in this
    # base class, so a subclass has to override them.
    self._init_encoders()
    self._init_output_layers()

def _init_encoders(self):
    """Create the encoders for the input layers; subclasses must override.

    The base class builds no encoders itself.
    NOTE(review): the implementation that used to live here assigned an
    ``nn.ModuleList`` with one encoder per data layer to ``self.encoders``;
    overrides presumably need to do the same -- confirm against the
    concrete model classes.

    Raises:
        NotImplementedError: Always, unless the hook is overridden.
    """
    raise NotImplementedError(f"{type(self).__name__} must implement _init_encoders()")

def _init_output_layers(self):
    """Create the per-target output heads; subclasses must override.

    The base class builds no output layers itself. ``forward`` iterates
    ``self.MLPs.items()`` and stores one output per key, so an override
    has to populate ``self.MLPs`` with a mapping from target variable
    name to its prediction module.

    Raises:
        NotImplementedError: Always, unless the hook is overridden.
    """
    raise NotImplementedError(f"{type(self).__name__} must implement _init_output_layers()")

def forward(self, x_list):
"""
Forward pass of the DirectPred model.

Args:
x_list (list of torch.Tensor): A list of input matrices (omics layers), one for each layer.

Returns:
dict: A dictionary where each key-value pair corresponds to the target variable name and its predicted output respectively.
"""
embeddings_list = []
# Process each input matrix with its corresponding Encoder
for i, x in enumerate(x_list):
Expand All @@ -72,30 +41,22 @@ def forward(self, x_list):
outputs = {}
for var, mlp in self.MLPs.items():
outputs[var] = mlp(embeddings_concat)
return outputs


def configure_optimizers(self):
"""
Configure the optimizer for the DirectPred model.

Returns:
torch.optim.Optimizer: The configured optimizer.
"""
return outputs

optimizer = torch.optim.Adam(self.parameters(), lr=self.config['lr'])
def configure_optimizers(self):
    """Build the optimizer used for training.

    Returns:
        torch.optim.Adam: Adam over ``self.parameters()`` with the
        learning rate taken from ``self.config["lr"]``.
    """
    return torch.optim.Adam(self.parameters(), lr=self.config["lr"])

def compute_loss(self, var, y, y_hat):
if self.dataset.variable_types[var] == 'numerical':
if self.dataset.variable_types[var] == "numerical":
# Ignore instances with missing labels for numerical variables
valid_indices = ~torch.isnan(y)
if valid_indices.sum() > 0: # only calculate loss if there are valid targets
y_hat = y_hat[valid_indices]
y = y[valid_indices]
loss = F.mse_loss(torch.flatten(y_hat), y.float())
else:
loss = 0 # if no valid labels, set loss to 0
loss = 0 # if no valid labels, set loss to 0
else:
# Ignore instances with missing labels for categorical variables
# Assuming that missing values were encoded as -1
Expand All @@ -104,21 +65,12 @@ def compute_loss(self, var, y, y_hat):
y_hat = y_hat[valid_indices]
y = y[valid_indices]
loss = F.cross_entropy(y_hat, y.long())
else:
else:
loss = 0
return loss

def training_step(self, train_batch, batch_idx):
"""
Perform a single training step.
Args:
train_batch (tuple): A tuple containing the input data and labels for the current batch.
batch_idx (int): The index of the current batch.
Returns:
torch.Tensor: The total loss for the current training step.
"""

dat, y_dict = train_batch
dat, y_dict = train_batch
layers = dat.keys()
x_list = [dat[x] for x in layers]
outputs = self.forward(x_list)
Expand All @@ -129,23 +81,12 @@ def training_step(self, train_batch, batch_idx):
loss = self.compute_loss(var, y, y_hat)
losses[var] = loss
total_loss = sum(losses.values())
losses['train_loss'] = total_loss
losses["train_loss"] = total_loss
self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True)
return total_loss


def validation_step(self, val_batch, batch_idx):
"""
Perform a single validation step.

Args:
val_batch (tuple): A tuple containing the input data and labels for the current batch.
batch_idx (int): The index of the current batch.

Returns:
torch.Tensor: The total loss for the current validation step.
"""
dat, y_dict = val_batch
dat, y_dict = val_batch
layers = dat.keys()
x_list = [dat[x] for x in layers]
outputs = self.forward(x_list)
Expand All @@ -156,34 +97,32 @@ def validation_step(self, val_batch, batch_idx):
loss = self.compute_loss(var, y, y_hat)
losses[var] = loss
total_loss = sum(losses.values())
losses['val_loss'] = total_loss
losses["val_loss"] = total_loss
self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True)
return total_loss


def prepare_data(self):
    """Split ``self.dataset`` into training and validation subsets.

    The training size is ``len(dataset) * (1 - val_size)`` rounded down;
    the remainder goes to validation. The generator is seeded with 42 so
    repeated calls produce the same split.

    Returns:
        tuple: ``(dat_train, dat_val)`` as produced by ``random_split``.
    """
    n_total = len(self.dataset)
    n_train = int(n_total * (1 - self.val_size))
    n_val = n_total - n_train
    generator = torch.Generator().manual_seed(42)
    dat_train, dat_val = random_split(self.dataset, [n_train, n_val], generator=generator)
    return dat_train, dat_val

def train_dataloader(self):
    """Return the DataLoader over the training split ``self.dat_train``.

    Samples are shuffled and a trailing incomplete batch is dropped.
    ``self.config["batch_size"]`` is cast to ``int`` before use.
    """
    return DataLoader(
        self.dat_train,
        batch_size=int(self.config["batch_size"]),
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

def val_dataloader(self):
    """Return the DataLoader over the validation split ``self.dat_val``.

    Samples are served in order (no shuffling) and the last, possibly
    smaller, batch is kept. ``self.config["batch_size"]`` is cast to
    ``int`` before use.
    """
    return DataLoader(
        self.dat_val,
        batch_size=int(self.config["batch_size"]),
        num_workers=0,
        pin_memory=True,
        shuffle=False,
    )

def predict(self, dataset):
"""
Evaluate the DirectPred model on a given dataset.

Args:
dataset: The dataset to evaluate the model on.
return DataLoader(
self.dat_val, batch_size=int(self.config["batch_size"]), num_workers=0, pin_memory=True, shuffle=False
)

Returns:
A dictionary where each key is a target variable and the corresponding value is the predicted output for that variable.
"""
def predict(self, dataset):
self.eval()
layers = dataset.dat.keys()
x_list = [dataset.dat[x] for x in layers]
Expand All @@ -192,24 +131,13 @@ def predict(self, dataset):
predictions = {}
for var in self.target_variables:
y_pred = outputs[var].detach().numpy()
if self.dataset.variable_types[var] == 'categorical':
if self.dataset.variable_types[var] == "categorical":
predictions[var] = np.argmax(y_pred, axis=1)
else:
predictions[var] = y_pred
return predictions

def transform(self, dataset):
"""
Transform the input data into a lower-dimensional space using the trained encoders.

Args:
dataset: The input dataset containing the omics data.

Returns:
pd.DataFrame: A dataframe of embeddings where the row indices are
dataset.samples and the column names are created by appending
the substring "E" to each dimension index.
"""
self.eval()
embeddings_list = []
# Process each input matrix with its corresponding Encoder
Expand All @@ -218,36 +146,29 @@ def transform(self, dataset):
embeddings_concat = torch.cat(embeddings_list, dim=1)

# Converting tensor to numpy array and then to DataFrame
embeddings_df = pd.DataFrame(embeddings_concat.detach().numpy(),
index=dataset.samples,
columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
embeddings_df = pd.DataFrame(
embeddings_concat.detach().numpy(),
index=dataset.samples,
columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])],
)
return embeddings_df
# Adaptor forward function for captum integrated gradients.

# Adaptor forward function for captum integrated gradients.
def forward_target(self, *args):
    """Adaptor around ``forward`` for captum's IntegratedGradients.

    Args:
        *args: One or more input tensors (one per omics layer), followed
            by the target variable name and the number of steps, in that
            order. Each input tensor is indexed along its first axis by
            step.

    Returns:
        torch.Tensor: The ``forward`` outputs for the target variable at
        every step, concatenated along dimension 0.
    """
    # The last two positional arguments are the extra forward args that
    # compute_feature_importance passes via additional_forward_args.
    *input_data, target_var, steps = args
    outputs_list = []
    for i in range(steps):
        # Gather the i-th slice of every layer into one forward() input.
        x_step = [layer[i] for layer in input_data]
        out = self.forward(x_step)
        outputs_list.append(out[target_var])
    return torch.cat(outputs_list, dim=0)

def compute_feature_importance(self, target_var, steps = 5):
"""
Compute the feature importance.
return torch.cat(outputs_list, dim=0)

Args:
input_data (torch.Tensor): The input data to compute the feature importance for.
target_var (str): The target variable to compute the feature importance for.
Returns:
attributions (list of torch.Tensor): The feature importances for each class.
"""
def compute_feature_importance(self, target_var, steps=5):
x_list = [self.dataset.dat[x] for x in self.dataset.dat.keys()]

# Initialize the Integrated Gradients method
ig = IntegratedGradients(self.forward_target)

Expand All @@ -257,7 +178,7 @@ def compute_feature_importance(self, target_var, steps = 5):
baseline = tuple([torch.zeros_like(data) for data in input_data])

# Get the number of classes for the target variable
if self.dataset.variable_types[target_var] == 'numerical':
if self.dataset.variable_types[target_var] == "numerical":
num_class = 1
else:
num_class = len(np.unique(self.dataset.ann[target_var]))
Expand All @@ -266,27 +187,45 @@ def compute_feature_importance(self, target_var, steps = 5):
attributions = []
if num_class > 1:
for target_class in range(num_class):
attributions.append(ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), target=target_class, n_steps=steps))
attributions.append(
ig.attribute(
input_data,
baseline,
additional_forward_args=(target_var, steps),
target=target_class,
n_steps=steps,
)
)
else:
attributions.append(ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), n_steps=steps))
attributions.append(
ig.attribute(input_data, baseline, additional_forward_args=(target_var, steps), n_steps=steps)
)

# summarize feature importances
# Compute absolute attributions
abs_attr = [[torch.abs(a) for a in attr_class] for attr_class in attributions]
# average over samples
# average over samples
imp = [[a.mean(dim=1) for a in attr_class] for attr_class in abs_attr]

# combine into a single data frame
# combine into a single data frame
df_list = []
layers = list(self.dataset.dat.keys())
for i in range(num_class):
for j in range(len(layers)):
features = self.dataset.features[layers[j]]
importances = imp[i][j][0].detach().numpy()
df_list.append(pd.DataFrame({'target_variable': target_var, 'target_class': i, 'layer': layers[j], 'name': features, 'importance': importances}))
df_imp = pd.concat(df_list, ignore_index = True)

df_list.append(
pd.DataFrame(
{
"target_variable": target_var,
"target_class": i,
"layer": layers[j],
"name": features,
"importance": importances,
}
)
)
df_imp = pd.concat(df_list, ignore_index=True)

# save the computed scores in the model
self.feature_importances[target_var] = df_imp


Loading
Loading