Creating-tx2gene.Rmd

---
title: "Creating Tx2Gene file"
author: "Jonathan Anzules"
date: "2023-09-27"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(tximport) # Importing quant_sf files
library(GenomicFeatures) #Useful genomics tools
library(BiocManager)
library(rhdf5) #To make an hdf5 files
library(Rtsne)
library(DESeq2) # For normalizing
library(scran) # normalization based on cell clusters
library(SingleCellExperiment) #For holding single-cell RNA_seq data
library(biomaRt) # Switching names from ensemble ID to gene symbol
library(Seurat)
library(ggplot2)
library(cowplot) # For violin plots
```

**To Do**

- Create metadata of hdf5 file
- Learn how to work with the txi data
- Change ensembl_ID to gene symbols in the hdf5 file and make code smoother
    in this aspect
- Remove low quality reads based on MT statistics
- Normalize data
- Cluster the cells and visualize with t-SNE and UMAP
- Identify beta cell clusters
- Perform a pseudotime analysis
- Investigate why there are empty and duplicate rows


# Making count matrix
```{r}
# One-off step, kept commented out: builds the transcript-to-gene table from
# the GENCODE GTF and saves it as a CSV. Later chunks read the saved file
# instead of rebuilding it.
#
# txdb <- makeTxDbFromGFF("/Documents and Settings/jonan/Documents/Genomes/Homo-Sapiens/GRCh38/gencode.v44.chr_patch_hapl_scaff.annotation.gtf.gz")
# 
# # keytypes(txdb)
# k <- keys(txdb, keytype = "TXNAME")
# 
# tx2gene <- select(txdb, k, "GENEID", "TXNAME") # only works on 4.3.0 # Restart and clear output if this doesn't work
# # I think code is getting confused with another select() in another package, I'm not sure which one it is yet.
# # NOTE(review): if select() is being masked, calling AnnotationDbi::select()
# # explicitly may resolve it - confirm.
# tx2gene <- as.data.frame(tx2gene)
# # Keep the columns in (transcript, gene) order.
# tx2gene <- tx2gene[, c("TXNAME", "GENEID")]
# 
# # Saving the data frame
# write.csv(tx2gene, "/Documents and Settings/jonan/Documents/Genomes/Homo-Sapiens/GRCh38/tx2gene-Mappings", row.names=FALSE)
```

```{r - Preparing quantfiles}
# Load the metadata table; its `quant_file` column holds the path of each
# quant.sf file.
T2D_files <- read.csv(
  "/Documents and Settings/jonan/Documents/1Work/scWork/Data/PANCDB/Cleaned-N-Aligned-hpapdata/metadata_T2D.csv"
)

# Rewrite a leading "/mnt/c" as "C:" so the paths work from R.
# The pattern is anchored at the start, so at most one replacement per path.
T2D_files$quant_file <- sub("^/mnt/c", "C:", T2D_files$quant_file)

# Load the saved transcript-to-gene mapping.
tx2gene <- read.csv(
  "/Documents and Settings/jonan/Documents/Genomes/Homo-Sapiens/GRCh38/tx2gene-Mappings"
)
```

```{r - Removing files that have failed}
# All quant file paths listed in the metadata
files <- T2D_files$quant_file

# TRUE for every path that exists on disk
file_found <- file.exists(files)

# Paths that are listed in the metadata but absent on disk
missing_files <- files[!file_found]

# Keep only the metadata rows whose quant file is present
T2D_files <- T2D_files[file_found, ]

# Report how many rows survive and how many files were missing
dim(T2D_files)
length(missing_files)
```

```{r - Creating the HDF5 file}

# Import the Salmon quantifications and summarise them to gene level, then
# store the count matrix in the HDF5 file.

# Grabbing files address
files <- T2D_files$quant_file

txi <- tximport(files, type = "salmon", tx2gene = tx2gene)

# Assign cell IDs from metadata as column names of the count matrix
colnames(txi$counts) <- T2D_files$cell
str(txi)


txi$length

# Save the count matrix as an HDF5 file. One path variable is used for both
# the create and the write call: previously the file was created as
# "txi_data_T2D.h5" but the counts were written to "txi_data.h5", leaving an
# empty target file and a stray second file.
hdf5_filename <- "/Documents and Settings/jonan/Documents/1Work/scWork/Data/PANCDB/Cleaned-N-Aligned-hpapdata/txi_data_T2D.h5"
h5createFile(hdf5_filename)
h5write(txi$counts, hdf5_filename, "counts")
```

```{r}
# Target HDF5 file for the tximport results
hdf5_filename <- "/Documents and Settings/jonan/Documents/1Work/scWork/Data/PANCDB/Cleaned-N-Aligned-hpapdata/txi_data_T2D.h5"
h5createFile(hdf5_filename)

# Write each element of the txi list under a dataset of the same name
for (component in c("abundance", "counts", "length", "countsFromAbundance")) {
  h5write(txi[[component]], hdf5_filename, component)
}

# Dimension names of the count matrix are stored as separate datasets
h5write(colnames(txi$counts), hdf5_filename, "counts_colnames")
h5write(rownames(txi$counts), hdf5_filename, "counts_rownames")


```
# Loading count matrix and some processing

```{r - Reading in the count matrix}

hdf5_filename <- "/Documents and Settings/jonan/Documents/1Work/scWork/Data/PANCDB/Cleaned-N-Aligned-hpapdata/txi_data_T2D.h5"

# Rebuild the txi-style list: one h5read() per stored component, with each
# list element named after its dataset
component_names <- c("abundance", "counts", "length", "countsFromAbundance")
txi_read <- lapply(
  setNames(component_names, component_names),
  function(dataset) h5read(hdf5_filename, dataset)
)

# Restore the row and column names of the count matrix (only `counts` gets
# dimnames; the other components are left as read)
dimnames(txi_read$counts) <- list(
  h5read(hdf5_filename, "counts_rownames"),
  h5read(hdf5_filename, "counts_colnames")
)

str(txi_read)
```


```{r}
# Preview the first few row names of the count matrix
head(rownames(txi_read$counts))
```
```{r}
# Replace the Ensembl gene IDs used as row names of the count matrix with
# gene symbols looked up through biomaRt.

# Take the current row names and remove the trailing version number
# (".<digits>"). This vector must exist before it is used anywhere below.
ensembl_ids <- gsub("\\.\\d+$", "", rownames(txi_read$counts))

# Connecting to Ensembl database
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Look up the gene symbol for each Ensembl gene ID
gene_symbols_ids <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
                          filters = "ensembl_gene_id",
                          values = ensembl_ids,
                          mart = ensembl)

# The returned table is not guaranteed to have one row per queried ID or to
# follow the query order, so align it to the matrix rows by ID instead of
# assigning the column positionally.
row_symbols <- gene_symbols_ids$external_gene_name[
  match(ensembl_ids, gene_symbols_ids$ensembl_gene_id)
]

# IDs with no match get an empty name, which is the same marker an ID without
# a symbol gets; empty names can then be found with `== ""`.
row_symbols[is.na(row_symbols)] <- ""

rownames(txi_read$counts) <- row_symbols

str(txi_read)
```


```{r - Testing}
# Sanity checks on the biomaRt lookup: did it return one entry per matrix row,
# and in the same order as the queried Ensembl IDs?

# Named vector (names = Ensembl IDs, values = symbols). It has to be built
# before the checks below use it.
gene_symbols <- setNames(gene_symbols_ids$external_gene_name, gene_symbols_ids$ensembl_gene_id)
head(gene_symbols)

if (nrow(txi_read$counts) == length(gene_symbols)) {
  print("numbers ok")
} else {
  # Lengths differ: the lookup dropped or duplicated some IDs
  warning("The number of genes in txi_read$counts does not match the number of gene symbols retrieved.")
}

# Extracting Ensembl IDs from the named vector
ensembl_ids_from_vector <- names(gene_symbols)

# Compare against the queried Ensembl IDs (`ensembl_ids`). The matrix row
# names have already been replaced by symbols at this point, so comparing
# against them would always report a mismatch. The length test comes first so
# `==` never recycles vectors of different length.
is_order_matching <- length(ensembl_ids_from_vector) == length(ensembl_ids) &&
  all(ensembl_ids_from_vector == ensembl_ids)

# Print the result
if (is_order_matching) {
  print("The order of Ensembl IDs matches.")
} else {
  print("The order of Ensembl IDs does not match.")
}




```

# Analyzing count matrix before normalization

```{r - density plots of expression data}

# Density of the log1p-transformed mean count per row. It is computed once
# and drawn twice: full range on the left, zoomed view on the right.
avg_count_density <- density(log1p(rowMeans(txi_read$counts)))

# Two panels side by side
par(mfrow = c(1, 2))

plot(
  avg_count_density,
  main = "Density of Log-transformed Average Counts",
  xlim = c(0, 10)
)

plot(
  avg_count_density,
  main = "zoomed in",
  xlim = c(1, 10),
  ylim = c(0, 0.2)
)

# Back to a single-panel layout
par(mfrow = c(1, 1))
```


```{r - PCA visualization}

# PCA on the log1p counts, transposed so that each column of the count matrix
# becomes one observation
log_counts_transposed <- t(log1p(txi_read$counts))
pca_res <- prcomp(log_counts_transposed)

# Scatter of the first two principal components
plot(
  x = pca_res$x[, 1],
  y = pca_res$x[, 2],
  xlab = "PC1",
  ylab = "PC2",
  main = "PCA of Counts Data"
)
```
```{r - Troubleshooting duplicates and empty}
# Row names of the count matrix, fetched once for both checks
row_ids <- rownames(txi_read$counts)

# Positions of rows whose name is the empty string
empty_row_names <- which(row_ids == "")
num_empty_row_names <- length(empty_row_names)

print(sprintf("Number of empty row names: %d", num_empty_row_names))

# Names that occur more than once (every repeat after the first occurrence),
# then the number of distinct names among them
duplicate_row_names <- row_ids[duplicated(row_ids)]
num_duplicate_row_names <- length(unique(duplicate_row_names))

print(sprintf("Number of duplicate row names: %d", num_duplicate_row_names))


```

```{r - Clearing duplicates and empties}

# Drop rows of the count matrix whose name is empty, and every repeat of a
# row name after its first occurrence.
# Note: the counts in the dropped duplicate rows are discarded, not summed
# into the row that is kept.
row_ids <- rownames(txi_read$counts)

# Keep a row when its name is non-empty and has not been seen before
keep_rows <- nzchar(row_ids) & !duplicated(row_ids)

# drop = FALSE keeps the result a matrix even if only one row survives
txi_read$counts <- txi_read$counts[keep_rows, , drop = FALSE]




```


```{r - Looking at mitochondrial gene statistics}

# Build a Seurat object from the count matrix and add the percentage of
# counts coming from mitochondrial genes ("MT-" prefix) as a per-cell
# metadata column.

# Assuming txi_read$counts is your count matrix
seurat_object <- CreateSeuratObject(counts = txi_read$counts)

# Genes treated as mitochondrial; kept for inspection
mitochondrial_genes <- grep(pattern = "^MT-", x = rownames(seurat_object), value = TRUE)
if (length(mitochondrial_genes) == 0) {
  warning("No row names start with 'MT-'; percent.mt will be 0 for every cell.")
}

# PercentageFeatureSet() does the colSums ratio internally. It avoids the
# manual `[mitochondrial_genes, ]` subsetting, which collapses to a vector
# (and breaks colSums) when exactly one gene matches.
seurat_object[["percent.mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^MT-")


# Plotting
VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)




```
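Thresholds read off the violin plots above (panels left to right):

- A (`nFeature_RNA`) - Lower: 4500, Upper: 8900
- B (`nCount_RNA`) - Lower: 190k, Upper: 1.750M
- C (`percent.mt`) - Upper: 40 (note: the filtering chunk below currently uses 50)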


```{r - Removing low quality reads}

# Keep only the cells whose QC metrics fall strictly inside the thresholds
# below, then redraw the QC violin plots for the remaining cells.

# Define thresholds
# Bounds on nFeature_RNA
lower.threshold.nFeature <- 4500
upper.threshold.nFeature <- 8900
# Bounds on nCount_RNA
lower.threshold.nCount <- 190000 # 190k
upper.threshold.nCount <- 1750000 # 1.750M
# Upper bound on percent.mt
upper.threshold.percent.mt <- 50 # It's possible the beta cells have high mt

# Subset Seurat object based on these thresholds
# All five conditions must hold; comparisons are strict (> and <), so cells
# sitting exactly on a threshold are removed.
seurat_object <- subset(seurat_object, subset = nFeature_RNA > lower.threshold.nFeature & 
                                                  nFeature_RNA < upper.threshold.nFeature & 
                                                  nCount_RNA > lower.threshold.nCount & 
                                                  nCount_RNA < upper.threshold.nCount & 
                                                  percent.mt < upper.threshold.percent.mt)
# Same three QC metrics as before filtering, now on the filtered object
VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

```
  
```{r - Saving higher quality dataset}

# Pull the "counts" data out of the filtered Seurat object
high_quality_counts <- GetAssayData(object = seurat_object, slot = "counts")

# Write it to CSV; as.matrix() converts it to an ordinary matrix first
write.csv(
  x = as.matrix(high_quality_counts),
  file = "/Documents and Settings/jonan/Documents/1Work/scWork/Data/FirstCountMatrix.csv"
)


```

```{r - Visualizing the quality metrics}

# Two scatter plots against nCount_RNA: percent.mt first, then nFeature_RNA.
# They are combined with `+` and shown together.
plot1 <- FeatureScatter(
  object = seurat_object,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
plot2 <- FeatureScatter(
  object = seurat_object,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)

plot1 + plot2
```


# Normalizing and searching for beta cells

```{r - Loading, normalizing, identifying variaility,and scaling}
# Reload the filtered count matrix, rebuild the Seurat object, then normalize,
# pick highly variable genes and scale.

# check.names = FALSE keeps the cell IDs in the header exactly as written;
# the default would rewrite any ID that is not a syntactic R name (for
# example "-" becomes "." and a leading digit gets an "X" prefix).
HQ_counts <- read.csv("/Documents and Settings/jonan/Documents/1Work/scWork/Data/FirstCountMatrix.csv",
                      row.names = 1, check.names = FALSE)

# Creating a Seurat object; pass a matrix rather than a data frame
seurat_object <- CreateSeuratObject(counts = as.matrix(HQ_counts))

# Normalizing to 10000 counts per cell
seurat_object <- NormalizeData(seurat_object, normalization.method = "LogNormalize", scale.factor = 10000)

# Identifying variable features
seurat_object <- FindVariableFeatures(seurat_object, selection.method = "vst", nfeatures = 2000)

# Scaling. All genes are scaled, as before, but the feature set is now passed
# explicitly instead of relying on the fallback used when no variable
# features have been set yet.
seurat_object <- ScaleData(seurat_object, features = rownames(seurat_object))

# Ten most variable genes, kept for inspection
top10_hvg <- head(VariableFeatures(seurat_object), 10)

# Peek at the first rows/columns of the raw, normalized and scaled data
head(seurat_object[["RNA"]]$counts, n = 10)[,1:3]
head(seurat_object[["RNA"]]$data, n = 10)[,1:3]
head(seurat_object[["RNA"]]$scale.data, n = 10)[,1:3]




```


```{r - Preping data for cell identification}


```

```{r}
# Get the rownames from the Seurat object
gene_names <- rownames(seurat_object)

# List every gene whose name contains "ALDH" (case-insensitive).
# The pattern previously ended in a space ("ALDH "), which can never match a
# gene symbol, so the search always came back empty.
grep("ALDH", gene_names, ignore.case = TRUE, value = TRUE)


```


```{r}

# Violin plots for a handful of marker genes, drawn twice with different text
# sizes: once arranged with cowplot, once as a single combined plot.

# List of genes
genes <- c("PDX1", "NKX6-1", "NANOG", "MYCL")

# Create multiple violin plots
p <- VlnPlot(seurat_object, features = genes)

# Shrink the tick labels and titles of every panel
small_text_theme <- theme(axis.text.x = element_text(size = 2),
                          axis.text.y = element_text(size = 2),
                          axis.title = element_text(size = 6),
                          plot.title = element_text(size = 13))
p1 <- lapply(seq_along(p), function(i) p[[i]] + small_text_theme)

plot_grid(plotlist = p1, ncol = 2) #display all vlnplots. Notice that I used `p1`
# p <- p + labs(x = "", y = "") + 
# theme(
#   axis.title.x = element_blank(),
#   axis.title.y = element_blank()
# )
# 
# print(p)

# Second version: restyle the combined plot directly.
# With several features VlnPlot() returns a combined (patchwork) plot, and
# `+` would only restyle its last panel. `&` applies the theme to every panel.
# legend.position = "none" removes the legends, so no separate NoLegend()
# call or `p$legend <- NULL` is needed.
p <- VlnPlot(seurat_object, features = genes, pt.size = 0.1) &
  theme_minimal(base_size = 10) &
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 8),
    legend.position = "none"
  )

print(p)




```

```{r}
# PCA restricted to the variable features stored on the object
variable_genes <- VariableFeatures(object = seurat_object)
seurat_object <- RunPCA(object = seurat_object, features = variable_genes)

# Expression of three marker genes
marker_genes <- c("INS", "PDX1", "NKX6-1")
FeaturePlot(object = seurat_object, features = marker_genes)
```
```{r - t-SNE plot highlighting INS}
# t-SNE computed from dimensions 1 to 15
seurat_object <- RunTSNE(object = seurat_object, dims = seq_len(15))

# Plain t-SNE plot of the cells
DimPlot(object = seurat_object, reduction = "tsne")

# Same embedding colored by "INS" expression, with a minimum cutoff of 4
FeaturePlot(
  object = seurat_object,
  features = "INS",
  min.cutoff = 4,
  reduction = "tsne"
)

```
```{r}

# t-SNE feature plots for the marker genes in `genes`, followed by "INS"
FeaturePlot(object = seurat_object, features = c(genes, "INS"), reduction = "tsne")
```
```{r}
# Pseudotime analysis attempt with Monocle.
# NOTE(review): this chunk appears to mix two API generations and probably
# does not run as written - see the notes on the individual calls.

# Load necessary libraries
library(monocle)
library(Seurat)

# Convert Seurat object to Monocle's CellDataSet format
# Make sure to use the correct assay if you have more than one (e.g., "RNA", "SCT", etc.)
# NOTE(review): as.cell_data_set() does not appear to come from either package
# loaded above - presumably it is SeuratWrappers' converter, which returns a
# monocle3-style object; confirm which package is intended.
cds <- as.cell_data_set(seurat_object)

# Choose dimensionality reduction and clustering information from Seurat to use in Monocle
# NOTE(review): reduceDimension(), clusterCells(), orderCells() and
# plot_cell_trajectory() look like monocle (v2) functions; verify that they
# accept the object produced by the conversion above.
cds <- reduceDimension(cds, reduction_method = 'DDRTree')
cds <- clusterCells(cds)

# Order cells in pseudotime
cds <- orderCells(cds)

# Plot the pseudotime trajectory
# NOTE(review): check the exact name and capitalization of the pseudotime
# column expected by color_by.
plot_cell_trajectory(cds, color_by = "pseudotime")

```

```{r}
# Pseudotime with slingshot: label cells as beta / non-beta by INS expression,
# fit lineages on the PCA embedding, and plot them colored by that label.
# Requires a PCA reduction on `seurat_object` (RunPCA).
library(Seurat)
library(slingshot)
library(SingleCellExperiment)

# Define the threshold for INS expression
ins_cutoff <- 4  # Adjust this based on your t-SNE plot and analysis

# Create a metadata column for beta cell identification.
# FetchData() is used instead of reaching into @assays$RNA@data, so this does
# not depend on the internal slot layout of the assay object.
ins_expression <- FetchData(seurat_object, vars = "INS")
seurat_object$beta_cell <- setNames(
  ifelse(ins_expression[[1]] > ins_cutoff, "Beta", "Non-Beta"),
  rownames(ins_expression)
)

# slingshot needs cluster labels. Nothing earlier in this notebook creates
# `seurat_clusters`, so compute them here when they are missing (same PCA
# dimensions as the t-SNE step).
if (!"seurat_clusters" %in% colnames(seurat_object[[]])) {
  seurat_object <- FindNeighbors(seurat_object, dims = 1:15)
  seurat_object <- FindClusters(seurat_object)
}

# Convert Seurat object to SingleCellExperiment object
sce <- as.SingleCellExperiment(seurat_object)

# Run Slingshot once. The argument is `reducedDim` (not `reducedDimNames`);
# use reducedDimNames(sce) to list the embeddings available in the object.
sce <- slingshot(sce, clusterLabels = "seurat_clusters", reducedDim = "PCA")

# Plot the first two PCs colored by the beta cell label, then overlay the
# fitted lineages.
beta_colors <- c("Beta" = "firebrick", "Non-Beta" = "grey70")
plot(reducedDim(sce, "PCA")[, 1:2], col = beta_colors[sce$beta_cell], pch = 16, asp = 1)
lines(SlingshotDataSet(sce), lwd = 2, col = "black")
legend("topright", legend = names(beta_colors), col = beta_colors, pch = 16, bty = "n")


```

```{r}
# Label every cell as "Beta" or "Non-Beta" from its INS expression.

# Define the threshold for INS expression
ins_cutoff <- 4  # Adjust this based on your t-SNE plot and analysis

# Using FetchData to get the data for "INS" (a data frame with one row per
# cell)
ins_expression <- FetchData(seurat_object, vars = "INS")

# Compare the expression column itself, not the whole data frame: ifelse() on
# a data frame yields a one-column matrix rather than a per-cell vector.
# Naming the labels by cell keeps them aligned when stored as metadata.
beta_labels <- ifelse(ins_expression[[1]] > ins_cutoff, "Beta", "Non-Beta")
names(beta_labels) <- rownames(ins_expression)

seurat_object$beta_cell <- beta_labels


```


```{r - Exploring Seurat object}
# Scratch chunk for inspecting the Seurat object; every expression below just
# prints its result.

# Metadata - Cell-level data
# Two ways of printing the per-cell metadata table: direct slot access, then
# the `[[]]` accessor.
seurat_object@meta.data
seurat_object[[]]

# Assay data
# Names of the assays stored on the object
names(seurat_object@assays)
# "data" slot of the RNA assay
GetAssayData(seurat_object, assay = "RNA", slot = "data")


# Cell embeddings of a dimensional reduction
# NOTE(review): no reduction is named here, so a default is used - confirm
# which one.
Embeddings(seurat_object)
```