Exploring-Count-Matrix.Rmd

--- 
title: "Profiling Beta cell heterogeneity in T2D"
author: "Jonathan Anzules"
date: "2023-12-29"
output: html_document
---

```{r}
library(Seurat)
library(dplyr)
library(ggplot2)
library(slingshot) # Pseudotime analysis
library(grDevices) # Pseudotime colors
library(RColorBrewer) # Pseudotime colors
library(monocle3) # Pseudotime 2
library(SeuratWrappers) # For cell_data_set
```


# Data Prep
```{r - Loading, normalizing, identifying variability, and scaling}
# Read the count matrix from CSV; the first column supplies the row names.
HQ_counts <- read.csv(
  "/Documents and Settings/jonan/Documents/1Work/scWork/Beta Cell Study/Data/FirstCountMatrix.csv",
  row.names = 1
)

# Wrap the counts in a Seurat object and log-normalize (scale factor 10000).
pancr_cells <- CreateSeuratObject(counts = HQ_counts) %>%
  NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000)

# Scale every feature (not only the variable ones), then flag the 2000 most
# variable features with the "vst" method.
all.genes <- rownames(pancr_cells)
pancr_cells <- pancr_cells %>%
  ScaleData(features = all.genes) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 2000)
```
```{r - Exploring values}

# Peek at the first 10 rows and first 3 columns of each layer of the RNA assay:
# counts, data, and scale.data, in that order.
head(pancr_cells[["RNA"]]$counts, n = 10)[,1:3] 
head(pancr_cells[["RNA"]]$data, n = 10)[,1:3]
head(pancr_cells[["RNA"]]$scale.data, n = 10)[,1:3]
# Open question: why does an entry with no counts get a non-zero value once scaled?
# NOTE(review): presumably because ScaleData() centers each gene, so a zero
# becomes a negative offset from that gene's mean — confirm against the docs.
```

```{r - Visualizing variable genes}

# The ten most variable genes, used as point labels below.
top10_hvg <- head(VariableFeatures(pancr_cells), 10)

# Variable-feature plot, without and with the top-10 labels.
plot1 <- VariableFeaturePlot(pancr_cells)
plot2 <- LabelPoints(plot = plot1, points = top10_hvg, repel = TRUE)

# Place the unlabelled and labelled plots side by side.
plot_hvg <- plot1 + plot2

# Non-variable count: 38605
# Variable count: 2000

# `&` applies the theme to every panel of the combined plot; `+` would only
# restyle the last panel (plot2) and leave plot1's legend and text untouched.
plot_hvg &
  theme(
    axis.text.x = element_text(size = 2),
    axis.text.y = element_text(size = 2),
    axis.title = element_text(size = 6),
    plot.title = element_text(size = 13),
    legend.position = "none"
  )
```

# Dimensionality Reduction (PCA)

```{r}
# PCA restricted to the variable features selected earlier.
pancr_cells <- RunPCA(
  object = pancr_cells,
  features = VariableFeatures(pancr_cells)
)

# Show the top 6 features for each of the first five components.
print(x = pancr_cells[["pca"]], dims = 1:5, nfeatures = 6)
```

```{r - Visualizing the PCA}

# Feature loadings for PCs 1-2, 3-4 and 5.
VizDimLoadings(object = pancr_cells, dims = 1:2, reduction = "pca")
VizDimLoadings(object = pancr_cells, dims = 3:4, reduction = "pca")
VizDimLoadings(object = pancr_cells, dims = 5, reduction = "pca")

# Cells in PCA space, legend hidden.
DimPlot(object = pancr_cells, reduction = "pca") + NoLegend()

# Heatmap of PC 1 over 500 cells, balanced between positive and negative loadings.
DimHeatmap(object = pancr_cells, dims = 1, cells = 500, balanced = TRUE)

```


```{r = PCA Plot}

# Cells in PCA space, legend hidden.
DimPlot(object = pancr_cells, reduction = "pca") + NoLegend()

```

```{r - Heatmaps }
# Heatmaps for PCs 1-6 over 500 cells. The argument is spelled out as
# `balanced` (as in the single-PC call earlier in this file) instead of
# relying on partial matching of `balance`.
DimHeatmap(pancr_cells, dims = 1:6, cells = 500, balanced = TRUE)
```

```{r - Elbow plot}
# Elbow plot used to choose how many PCs to keep downstream.
ElbowPlot(object = pancr_cells)

```

# Cluster Analysis

```{r - Identifying clusters}
# Build the neighbour graph on the first 10 PCs, then cluster at resolution 0.4.
pancr_cells <- pancr_cells %>%
  FindNeighbors(dims = 1:10) %>%
  FindClusters(resolution = 0.4)

# Cluster identity of the first five cells.
head(Idents(pancr_cells), 5)
```

```{r - Visualizing}
# UMAP embedding from the first 10 PCs, plotted with the current identities.
pancr_cells <- RunUMAP(object = pancr_cells, dims = 1:10)
DimPlot(object = pancr_cells, reduction = "umap")

# Saving results
# saveRDS(pancr_cells, file = "/Documents and Settings/jonan/Documents/1Work/scWork/Data/pancr_cells_first.rds")
```
## Listing markers
```{r}
# Install presto only when it is missing, and before the marker tests run.
# An unconditional install_github() would hit the network on every execution.
# NOTE(review): Seurat is expected to use presto for faster Wilcoxon tests when
# it is installed — confirm for the Seurat version in use.
if (!requireNamespace("presto", quietly = TRUE)) {
  devtools::install_github("immunogenomics/presto")
}

# Markers of each cluster versus all remaining cells.
clstr_0 <- FindMarkers(pancr_cells, ident.1 = 0)
clstr_1 <- FindMarkers(pancr_cells, ident.1 = 1)
clstr_2 <- FindMarkers(pancr_cells, ident.1 = 2)
clstr_3 <- FindMarkers(pancr_cells, ident.1 = 3)
clstr_4 <- FindMarkers(pancr_cells, ident.1 = 4)
clstr_5 <- FindMarkers(pancr_cells, ident.1 = 5)
clstr_6 <- FindMarkers(pancr_cells, ident.1 = 6)

```
```{r}
# Positive markers for every cluster.
pancr_cells.markers <- FindAllMarkers(object = pancr_cells, only.pos = TRUE)

# Show, grouped by cluster, the markers with avg_log2FC above 1.
dplyr::filter(group_by(pancr_cells.markers, cluster), avg_log2FC > 1)
```


```{r}
# First five marker rows for the candidate beta-cell clusters (2, 4 and 5);
# the other clusters are kept commented out for quick toggling.
# head(clstr_0, 5)
# head(clstr_1, 5)
head(clstr_2, 5) # Most likely to be the beta cell cluster
# head(clstr_3, 5)
head(clstr_4, 5) # Most likely to be the beta cell cluster
head(clstr_5, 5) # Most likely to be the beta cell cluster
# head(clstr_6, 5)
```
# Visualizing cluster markers
```{r - }

# TODO: explore RidgePlot(), CellScatter(), and DotPlot()

# Gene panels plotted below, one vector per cell type or study.
mad_beta_study <- c("AMY2A", "KLF6", "GARIN4", "PPM1N", "ATF3", "CLPS")
beta_features <- c("INS", "PDX1", "NKX6-1", "MAFA", "NEUROD1", "ADCYAP1")
alpha_features <- c("GCG", "ARX", "TTR")
delta_features <- c("SST", "HHEX")

# Violin plots per cluster for each panel, then INS on its own.
VlnPlot(object = pancr_cells, features = mad_beta_study)
VlnPlot(object = pancr_cells, features = beta_features)
VlnPlot(object = pancr_cells, features = alpha_features)
VlnPlot(object = pancr_cells, features = delta_features)
VlnPlot(object = pancr_cells, features = "INS")


```

```{r - Featureplot}
# Panel-wide feature plots, kept commented out for quick toggling.
# FeaturePlot(object = pancr_cells, features = beta_features)
# FeaturePlot(object = pancr_cells, features = alpha_features)
# FeaturePlot(object = pancr_cells, features = delta_features)

# Single-gene expression overlays.
FeaturePlot(object = pancr_cells, features = "INS")
FeaturePlot(object = pancr_cells, features = "AMY2A") # Could these be signs of dedifferentiated beta cells?
FeaturePlot(object = pancr_cells, features = "KLF6") # Could these be signs of dedifferentiated beta cells?

# All genes in the mad_beta_study panel.
FeaturePlot(object = pancr_cells, features = mad_beta_study)
```

```{r - Heat map}

# First ten marker rows per cluster, in the order FindAllMarkers returned them.
top10 <- pancr_cells.markers %>%
  group_by(cluster) %>%
  slice_head(n = 10) %>%
  ungroup()

# Build the heatmap once, then both display it and keep it for saving.
heatmap_plot <- DoHeatmap(pancr_cells, features = top10$gene) + NoLegend()
heatmap_plot

# Saving image
# ggsave("/Documents and Settings/jonan/Documents/1Work/scWork/Figures/MarkersClustersHeatmap.png", plot = heatmap_plot, width = 20, height = 15, dpi = 300)

```

```{r - Visualizing donor grouping}

# Per-cell metadata table.
metadata_df <- read.csv(
  "/Documents and Settings/jonan/Documents/1Work/scWork/Beta Cell Study/Data/PANCDB/Cleaned-N-Aligned-hpapdata/metadata_T2D.csv"
)

# Drop the quant-file location column.
metadata_df$quant_file <- NULL

# Use the "X"-prefixed cell id as the row name, then drop the id column.
# NOTE(review): the "X" prefix presumably mirrors the column names read.csv()
# produced for the count matrix — confirm against colnames(pancr_cells).
rownames(metadata_df) <- paste0("X", metadata_df$cell)
metadata_df$cell <- NULL

# Attach the metadata to the Seurat object.
pancr_cells <- AddMetaData(object = pancr_cells, metadata = metadata_df)

head(metadata_df)
head(pancr_cells@meta.data)

# Embedding coloured by donor.
DimPlot(object = pancr_cells, group.by = "donor")
```

# Pseudotime of Betacells

```{r}
# Keep only clusters 2, 4 and 5 (the candidate beta-cell clusters).
beta_cells <- subset(x = pancr_cells, idents = c("2", "4", "5"))

# Cell labels are the metadata row names with their first character dropped.
beta_cell_labels <- substring(rownames(beta_cells@meta.data), 2)
write.csv(beta_cell_labels, "../Data/beta_cell_labels_prototype.csv", row.names = FALSE)

# Sanity checks on the subset: metadata, PCA reduction and UMAP coordinates.
head(beta_cells@meta.data)
head(beta_cells@reductions$pca)
head(Embeddings(beta_cells, "umap"))
```

```{r - Pseudotime - UMAP}
# Convert the beta-cell subset to a SingleCellExperiment and run slingshot on
# the UMAP embedding, using the Seurat clusters as cluster labels.
sce_pancreas <- slingshot(
  as.SingleCellExperiment(beta_cells),
  clusterLabels = "seurat_clusters",
  reducedDim = "UMAP",
  useNames = TRUE
)

# 100-step Spectral palette (middle colour dropped); each cell is coloured by
# its bin along the first pseudotime.
colors <- colorRampPalette(brewer.pal(11, 'Spectral')[-6])(100)
plotcol <- colors[
  cut(sce_pancreas$slingPseudotime_1, breaks = 100)
]

plot(
  reducedDims(sce_pancreas)$UMAP,
  col = plotcol,
  pch = 16,
  asp = 1
)
```


```{r}
# UMAP of the beta cells coloured by Seurat cluster, with the slingshot
# lineages drawn on top.
# The stray `sce_pancreas$` line is removed (it fused with the plot() call
# below it), and the lineages come from `sce_pancreas`; `sce` is not defined
# until later in the file.
# NOTE(review): the original coloured by `sce_pancreas$GMM`, but no GMM column
# is created anywhere in this file; the cluster labels passed to slingshot()
# are used instead — confirm this is the intended colouring.
plot(
  reducedDims(sce_pancreas)$UMAP,
  col = brewer.pal(9, "Set1")[as.integer(factor(sce_pancreas$seurat_clusters))],
  pch = 16,
  asp = 1
)
lines(SlingshotDataSet(sce_pancreas), lwd = 2, type = "lineages", col = "black")
```


# Converting to a SingleCellExperiment
```{r - Pseudotime PCA}
# Convert the beta-cell subset to a SingleCellExperiment and run slingshot on
# the PCA embedding, using the Seurat clusters as cluster labels.
sce_pancreas <- slingshot(
  as.SingleCellExperiment(beta_cells),
  clusterLabels = "seurat_clusters",
  reducedDim = "PCA"
)

# 100-step Spectral palette (middle colour dropped); each cell is coloured by
# its bin along the first pseudotime.
colors <- colorRampPalette(brewer.pal(11, 'Spectral')[-6])(100)
plotcol <- colors[
  cut(sce_pancreas$slingPseudotime_1, breaks = 100)
]

plot(
  reducedDims(sce_pancreas)$PCA,
  col = plotcol,
  pch = 16,
  asp = 1
)
```


```{r}
# Print the summary of the SingleCellExperiment object.
print(sce_pancreas)
```


```{r}
# PCA of the beta cells coloured by Seurat cluster.
# NOTE(review): the original coloured by `sce_pancreas$GMM`, but no GMM column
# is created anywhere in this file; the cluster labels passed to slingshot()
# are used instead — confirm this is the intended colouring.
plot(
  reducedDims(sce_pancreas)$PCA,
  col = brewer.pal(9, "Set1")[as.integer(factor(sce_pancreas$seurat_clusters))],
  pch = 16,
  asp = 1
)

# Same embedding coloured by pseudotime bin, with the slingshot lineages on top.
plot(reducedDims(sce_pancreas)$PCA, col = plotcol, pch = 16, asp = 1)
lines(SlingshotDataSet(sce_pancreas), lwd = 2, type = "lineages", col = "black")
```


# Pseudotime with monocle

```{r}
# Re-read the count matrix so this chunk can be run on its own.
HQ_counts <- read.csv("/Documents and Settings/jonan/Documents/1Work/scWork/Beta Cell Study/Data/FirstCountMatrix.csv", row.names = 1)

# Build the monocle input from a fresh Seurat object under its own name.
# Reassigning `pancr_cells` here would discard the normalised, scaled and
# clustered object that the chunks after this one still use.
pancr_raw <- CreateSeuratObject(counts = HQ_counts)

# Convert the Seurat object to a monocle3 cell_data_set.
cds <- as.cell_data_set(pancr_raw)

# Preprocess the data with 50 dimensions.
cds <- preprocess_cds(cds, num_dim = 50)

# Reduce the dimensionality using UMAP.
cds <- reduce_dimension(cds, reduction_method = "UMAP")

# Cluster the cells.
cds <- cluster_cells(cds)

# Learn the trajectory graph.
cds <- learn_graph(cds)

# Order cells in pseudotime.
# NOTE(review): no root cells or nodes are passed, so order_cells() presumably
# falls back to interactive root selection and will not work when knitting
# non-interactively — confirm and pass a root explicitly if so.
cds <- order_cells(cds)

# Plot the pseudotime trajectory.
plot_cells(cds, color_cells_by = "pseudotime")
```


# From other script 

```{r}
# Get the row names (gene names) from the Seurat object.
gene_names <- rownames(pancr_cells)

# Case-insensitive search for gene names containing "ALDH". The original
# pattern was "ALDH " with a trailing space, which cannot match a gene symbol
# and therefore always returned an empty result.
grep("ALDH", gene_names, ignore.case = TRUE, value = TRUE)


```


```{r}

# List of genes
genes <- c("PDX1", "NKX6-1", "NANOG", "MYCL")

# Create multiple violin plots (one panel per gene).
p <- VlnPlot(pancr_cells, features = genes)

# Shrink the tick-label, axis-title and plot-title fonts on every panel.
small_text <- theme(
  axis.text.x = element_text(size = 2),
  axis.text.y = element_text(size = 2),
  axis.title = element_text(size = 6),
  plot.title = element_text(size = 13)
)
p1 <- lapply(seq_along(p), function(i) p[[i]] + small_text)

# Display all restyled violin plots in a two-column grid. plot_grid() is
# called through its namespace because cowplot is never attached in this file.
cowplot::plot_grid(plotlist = p1, ncol = 2)
# p <- p + labs(x = "", y = "") +
# theme(
#   axis.title.x = element_blank(),
#   axis.title.y = element_blank()
# )
#
# print(p)

# Second variant: smaller points and a minimal theme. `&` applies the theme to
# every panel; with `+` only the last panel would be restyled. The legend is
# removed through `legend.position`, so the extra NoLegend() call and the
# no-op `p$legend <- NULL` assignment are dropped.
p <- VlnPlot(pancr_cells, features = genes, pt.size = 0.1) &
  theme_minimal(base_size = 10) &
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 8),
    legend.position = "none"
  )

print(p)


```

```{r}
# Re-run PCA on the variable features.
pancr_cells <- RunPCA(
  object = pancr_cells,
  features = VariableFeatures(pancr_cells)
)

# Expression overlays for INS, PDX1 and NKX6-1.
FeaturePlot(object = pancr_cells, features = c("INS", "PDX1", "NKX6-1"))
```

```{r - t-SNE plot highlighting INS}
# t-SNE embedding from the first 15 PCs.
pancr_cells <- RunTSNE(object = pancr_cells, dims = 1:15)

# Cells in t-SNE space.
DimPlot(object = pancr_cells, reduction = "tsne")

# INS expression on the t-SNE embedding, with a lower colour cutoff of 4.
FeaturePlot(
  object = pancr_cells,
  features = "INS",
  min.cutoff = 4,
  reduction = "tsne"
)

```


```{r}

# t-SNE overlays for the gene list plus INS.
FeaturePlot(
  object = pancr_cells,
  features = c(genes, "INS"),
  reduction = "tsne"
)
```


```{r}
# Load necessary libraries
# NOTE(review): this attaches `monocle` although `monocle3` is already loaded
# at the top of the file; having both attached may mask functions — confirm
# which package is intended.
library(monocle)
library(Seurat)

# Convert the Seurat object with as.cell_data_set()
# Make sure to use the correct assay if you have more than one (e.g., "RNA", "SCT", etc.)
# NOTE(review): elsewhere in this file as.cell_data_set() feeds monocle3
# functions (preprocess_cds, reduce_dimension, ...). The camelCase calls below
# look like the older monocle API — TODO confirm they accept this object class.
cds <- as.cell_data_set(pancr_cells)

# Reduce dimensionality with DDRTree, then cluster the cells
cds <- reduceDimension(cds, reduction_method = 'DDRTree')
cds <- clusterCells(cds)

# Order cells in pseudotime
cds <- orderCells(cds)

# Plot the pseudotime trajectory
# NOTE(review): verify that "pseudotime" (lowercase) is the column name this
# plotting function expects.
plot_cell_trajectory(cds, color_by = "pseudotime")

```

```{r}
library(Seurat)
library(slingshot)
library(SingleCellExperiment)

# Assumes the Seurat object `pancr_cells` already has PCA and clustering computed.
# Threshold on INS expression above which a cell is called a beta cell.
ins_cutoff <- 4  # Adjust this based on your t-SNE plot and analysis

# Label cells as "Beta" or "Non-Beta". FetchData() is used (as in the chunk
# after this one) instead of reaching into `@assays$RNA@data`, which does not
# match the layer access (`[["RNA"]]$data`) used earlier in this file.
ins_values <- FetchData(pancr_cells, vars = "INS")[, 1]
pancr_cells$beta_cell <- ifelse(ins_values > ins_cutoff, "Beta", "Non-Beta")

# Convert the Seurat object to a SingleCellExperiment object.
sce <- as.SingleCellExperiment(pancr_cells)

# Run slingshot once on the PCA embedding. The argument is `reducedDim`, as in
# the beta-cell slingshot calls earlier in this file; `reducedDimNames` is not
# a slingshot() argument.
sce <- slingshot(sce, clusterLabels = "seurat_clusters", reducedDim = "PCA")

# Plot the cells coloured by beta-cell label and overlay the lineages, in the
# same way as the beta-cell PCA plot earlier in this file.
plot(
  reducedDims(sce)$PCA,
  col = brewer.pal(9, "Set1")[as.integer(factor(sce$beta_cell))],
  pch = 16,
  asp = 1
)
lines(SlingshotDataSet(sce), lwd = 2, type = "lineages", col = "black")

```

```{r}
# INS expression threshold for calling a cell a beta cell.
ins_cutoff <- 4  # Adjust this based on your t-SNE plot and analysis

# Pull the INS expression for every cell.
ins_expression <- FetchData(object = pancr_cells, vars = "INS")

# Store a "Beta" / "Non-Beta" label in the object's metadata.
pancr_cells$beta_cell <- ifelse(
  test = ins_expression > ins_cutoff,
  yes = "Beta",
  no = "Non-Beta"
)

```


```{r - Exploring Seurat object}
# Metadata - cell-level data (two equivalent accessors).
pancr_cells@meta.data
pancr_cells[[]]

# Assay data: list the assays, then pull the "data" layer of the RNA assay.
# `layer =` replaces the deprecated `slot =` argument and matches the
# layer-based access (`[["RNA"]]$data`) used earlier in this file.
names(pancr_cells@assays)
GetAssayData(pancr_cells, assay = "RNA", layer = "data")

# Cell embeddings.
# NOTE(review): no reduction is named, so this presumably returns the default
# ("pca") — confirm.
Embeddings(pancr_cells)
```