From 04019434d7daf70ea9372032c706c74578bf3b76 Mon Sep 17 00:00:00 2001 From: Ashley Richardson Date: Thu, 29 Feb 2024 10:23:58 -0500 Subject: [PATCH] Seurat Objects --- Fibril_Seurat.RMD | 9 +- Fibril_Seurat.html | 886 +++++++++++++++++++++++++++++++++++++++++++ Monomer_Seurat.RMD | 34 +- Monomer_Seurat.html | 894 +++++++++++++++++++++++++++++++++++++++++++ Unstim_Seurat.RMD | 38 +- Unstim_Seurat.html | 896 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 2688 insertions(+), 69 deletions(-) create mode 100644 Fibril_Seurat.html create mode 100644 Monomer_Seurat.html create mode 100644 Unstim_Seurat.html diff --git a/Fibril_Seurat.RMD b/Fibril_Seurat.RMD index 7ac20c8..e4b4dd5 100644 --- a/Fibril_Seurat.RMD +++ b/Fibril_Seurat.RMD @@ -15,8 +15,9 @@ knitr::opts_chunk$set(echo = TRUE) .libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths())) -library(Seurat) -library(patchwork, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library") +library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib") +library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib") +library(patchwork) library(dplyr) ``` @@ -170,6 +171,8 @@ head(pbmc.f@meta.data) # find markers for every cluster compared to all remaining cells, report only the positive # ones markers_f <- FindAllMarkers(pbmc.f, only.pos = TRUE) + +cluster5.markers <- FindMarkers(pbmc.f, ident.1 = 5, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) ``` @@ -197,7 +200,7 @@ FeaturePlot(pbmc.f, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCG ```{r} .libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths())) -library(patchwork, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.3.0/site-library") +library(patchwork) library(tidyr) f_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aF.best", header = T, stringsAsFactors = F, check.names = F) head(f_demuxlet) diff --git a/Fibril_Seurat.html b/Fibril_Seurat.html new file mode 100644 index 0000000..36dad92 --- /dev/null +++ b/Fibril_Seurat.html @@ -0,0 +1,886 @@ + + + + + + + + + + + + + + + +Parkinson Disease Stimulation Exp - Unstim Samples + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Rmarkdown to pre-process scRNA seq from fibril stimulated +PBMCs.

+
+
+

Create the Seurat Object.

+
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
+
+library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib")
+library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib")
+
## Loading required package: SeuratObject
+
## Loading required package: sp
+
## 'SeuratObject' was built under R 4.2.0 but the current version is
+## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
+## for R may have changed
+
## 'SeuratObject' was built with package 'Matrix' 1.6.4 but the current
+## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
+## the ABI for 'Matrix' may have changed
+
## 
+## Attaching package: 'SeuratObject'
+
## The following object is masked from 'package:base':
+## 
+##     intersect
+
library(patchwork)
+library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/aF/outs/per_sample_outs/aF/count/sample_filtered_feature_bc_matrix")
+
+pbmc.f <- CreateSeuratObject(counts = pbmc.data, project = "fibril", min.cells = 3, min.features = 200)
+pbmc.f
+
## An object of class Seurat 
+## 23791 features across 18327 samples within 1 assay 
+## Active assay: RNA (23791 features, 0 variable features)
+##  1 layer present: counts
+

Lets check how many cells and features we are starting with.

+
length(colnames(pbmc.f)) #number of cells
+
## [1] 18327
+
length(rownames(pbmc.f)) #number of features 
+
## [1] 23791
+
+
+

Quality Control (QC)

+
a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT. 
+b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data. 
+
pbmc.f[["percent.mt"]] <- PercentageFeatureSet(pbmc.f, pattern = "^MT-")
+
+# Show QC metrics for the first 5 cells
+head(pbmc.f@meta.data, 5)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
+## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
+## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
+## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
+## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
+
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
+VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
+

+
plot1 <- FeatureScatter(pbmc.f, feature1 = "nCount_RNA", feature2 = "percent.mt")
+plot2 <- FeatureScatter(pbmc.f, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
+plot1 
+

+
plot2
+

+
# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells. 
+pbmc.f <- subset(pbmc.f, subset = nFeature_RNA > 200 & percent.mt < 10)
+head(pbmc.f@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
+## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
+## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
+## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
+## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
+## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
+
+
+

Normalize the data.

+

The data is normalized based on the feature (number of genes in a +cell) by the total expression. This number is multiplied by 10,000 and +then log transformed. The function to do this is “NormalizeData.” The +values specied below are the default values of this function.

+
pbmc.f <- NormalizeData(pbmc.f, normalization.method = "LogNormalize", scale.factor = 10000)
+
## Normalizing layer: counts
+
+
+

Identification of highly variable features

+
pbmc.f <- FindVariableFeatures(pbmc.f, selection.method = "vst", nfeatures = 2000)
+
## Finding variable features for layer counts
+
# Identify the 10 most highly variable genes
+top10 <- head(VariableFeatures(pbmc.f), 10)
+
+# plot variable features with and without labels
+plot1 <- VariableFeaturePlot(pbmc.f)
+plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
+plot1 
+

+
plot2
+

+## Scaling the Data The data is scaled by doing a linear transformation. +The ScaleData function does this by shifting the expression of genes so +that the mean expression becomes 0 and the variance is 1. By default, +only variable genes are scaled. This is changed by features = +all.genes

+
##Scaling RNA data, we only scale the variable features here for efficiency
+all.genes <- rownames(pbmc.f)
+pbmc.f <- ScaleData(pbmc.f, vars.to.regress = c("percent.mt"))
+
## Regressing out percent.mt
+
## Centering and scaling data matrix
+
+
+

Perform linear dimensional reduction

+

For the first principal components, RunPCA shows the most positive +(correlated) and most negative (anticorrelated) genes

+
pbmc.f <- RunPCA(pbmc.f, features = VariableFeatures(object = pbmc.f))
+
## PC_ 1 
+## Positive:  LTB, TSC22D3, IL7R, FP236383.3, TRIM22, TCF7, AQP3, IFITM1, TNFSF8, LEF1 
+##     PIM2, CCR7, JAML, C1orf162, LINC00861, CORO1B, NEAT1, MAL, MYC, CD5 
+##     NCDN, BEX3, TIMP1, CD4, SESN3, S100A6, BBC3, HERPUD1, AC139720.1, CYSLTR1 
+## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1, PCLAF, CDK1, UBE2C, TOP2A, KIFC1 
+##     NUSAP1, ASF1B, ZWINT, TUBB, CCNA2, ASPM, CENPF, GTSE1, PKMYT1, HIST1H1B 
+##     KNL1, CDCA8, CDKN3, SPC25, GGH, TPX2, CKAP2L, BIRC5, CENPU, CDCA2 
+## PC_ 2 
+## Positive:  CTSW, PRF1, GZMB, CCL4, NKG7, CCL3, KLRD1, TYROBP, GZMA, KLRC1 
+##     FCER1G, AOAH, CST7, IL2RB, CCL5, SRGN, TRDC, GNLY, HOPX, KLRK1 
+##     FCGR3A, GZMK, NCAM1, GSTP1, KIR2DL4, LYST, EOMES, KLRC2, CD38, SH2D1B 
+## Negative:  IL7R, TIMP1, AQP3, S100A6, LIME1, COTL1, S100A10, S100A4, CD5, TSC22D3 
+##     SNED1, IFITM1, GPR183, WDR86-AS1, CD4, ANP32B, CORO1B, LEF1, ITGB1, LMNA 
+##     MYC, CRIP1, STAT1, PLP2, PRDX1, SOS1, INPP4B, RGCC, UBXN11, IL9R 
+## PC_ 3 
+## Positive:  PLK1, CLIC3, PLEK, TYROBP, FCER1G, CEBPD, FGFBP2, CENPA, CX3CR1, CCNB1 
+##     DLGAP5, FCGR3A, ASPM, KLRD1, KIF20A, PSRC1, FGR, HMMR, CCNB2, CDKN2D 
+##     CD300A, PRF1, GZMK, SH2D1B, KIF14, CENPF, PIMREG, TROAP, NEK2, TRDC 
+## Negative:  GINS2, HSP90AB1, MCM2, MCM6, CSF2, MCM5, TRBV6-2, PLPP1, PAICS, CDC6 
+##     TRAV12-2, RANBP1, MCM7, UNG, MCM3, MCM4, HSPE1, NME1, MIF, CDC45 
+##     SRM, HSPD1, DCTPP1, E2F1, CD82, TCTN3, IL2RA, TPI1, PHLDA1, DUSP4 
+## PC_ 4 
+## Positive:  TRBV6-2, CD27, TRAV12-2, LRRN3, PLPP1, CD8A, PECAM1, CD8B, MYO1E, SNX9 
+##     SIRPG, REG4, RASGRF2, BCL2L11, CBLB, BACH2, CSF2, LAIR2, AL136962.1, PDE3B 
+##     APBB2, BHLHE40-AS1, CRTAM, LYST, DUSP2, TRBV5-4, CCR7, GNG8, ITGA1, TCF7 
+## Negative:  HLA-DRB1, HLA-DPA1, S100A4, LGALS1, S100A6, S100A10, HLA-DPB1, HLA-DRA, HLA-DQA1, PLEK 
+##     ANXA2, TXN, HLA-DQB1, LGALS3, HLA-DRB5, CRIP1, CD74, TMSB10, PFN1, CD300A 
+##     TAGLN2, FTL, TYROBP, CLU, GINS2, TBXAS1, IFITM1, LIME1, TIMP1, EFHD2 
+## PC_ 5 
+## Positive:  LGALS1, S100A4, S100A6, ANXA2, C12orf75, ACTB, S100A10, TXN, HOPX, PFN1 
+##     GAPDH, ACTG1, TMSB10, CFL1, CD74, LAG3, LINC02694, PRDX1, HLA-DPA1, VIM 
+##     PHLDA1, FLNA, HLA-DRB1, JPT1, AHNAK, HLA-DRA, THEMIS, ALOX5AP, CD82, TIMP1 
+## Negative:  CCR7, TCF7, LEF1, TRIM22, SESN3, TYROBP, CXCR4, FAM111B, FCER1G, PTMA 
+##     TXK, RNF130, PLAC8, MCM7, DHRS3, PTGDR, DTL, CYSLTR1, CDC6, CCNE2 
+##     CLSPN, SELL, TK1, MYBL2, SH2D1B, FEN1, LINC00861, FHIT, KCNQ5, NCAM1
+
print(pbmc.f[["pca"]], dims = 1:5, nfeatures = 5)
+
## PC_ 1 
+## Positive:  LTB, TSC22D3, IL7R, FP236383.3, TRIM22 
+## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1 
+## PC_ 2 
+## Positive:  CTSW, PRF1, GZMB, CCL4, NKG7 
+## Negative:  IL7R, TIMP1, AQP3, S100A6, LIME1 
+## PC_ 3 
+## Positive:  PLK1, CLIC3, PLEK, TYROBP, FCER1G 
+## Negative:  GINS2, HSP90AB1, MCM2, MCM6, CSF2 
+## PC_ 4 
+## Positive:  TRBV6-2, CD27, TRAV12-2, LRRN3, PLPP1 
+## Negative:  HLA-DRB1, HLA-DPA1, S100A4, LGALS1, S100A6 
+## PC_ 5 
+## Positive:  LGALS1, S100A4, S100A6, ANXA2, C12orf75 
+## Negative:  CCR7, TCF7, LEF1, TRIM22, SESN3
+

The PCA results can be visualized in different ways.

+
VizDimLoadings(pbmc.f, dims = 1:2, reduction = "pca")
+

+
DimPlot(pbmc.f, reduction = "pca") + NoLegend()
+

+
DimHeatmap(pbmc.f, dims = 1, cells = 500, balanced = TRUE)
+

+
DimHeatmap(pbmc.f, dims = 1:15, cells = 500, balanced = TRUE)
+

+
+
+

Determine the ‘dimensionality’ of the dataset

+

Cells will be clustered based on PCA. How many PC to use is dependent +on many factors. For example, if trying to analyze a rare cell subset, +you might want to add more PCs. Usually, the first 10 is good to see +dimensionality of the data.

+
ElbowPlot(pbmc.f)
+

+
+
+

Cluster the cells.

+
##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
+pbmc.f <- FindNeighbors(pbmc.f, reduction = "pca", dims = 1:15)
+
## Computing nearest neighbor graph
+
## Computing SNN
+
pbmc.f <- FindClusters(pbmc.f, resolution = 0.5, verbose = FALSE)
+
head(Idents(pbmc.f), 5)
+
## AAACCTGAGAAACCAT-1 AAACCTGAGAAGGACA-1 AAACCTGAGAATCTCC-1 AAACCTGAGAATGTGT-1 
+##                  4                  1                  9                  1 
+## AAACCTGAGACGCACA-1 
+##                  7 
+## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12
+
+
+

Run non-linear dimensional reduction (UMAP/tSNE)

+
pbmc.f <- RunUMAP(pbmc.f, dims = 1:15)
+
## 10:42:02 UMAP embedding parameters a = 0.9922 b = 1.112
+
## 10:42:02 Read 17726 rows and found 15 numeric columns
+
## 10:42:02 Using Annoy for neighbor search, n_neighbors = 30
+
## 10:42:02 Building Annoy index with metric = cosine, n_trees = 50
+
## 0%   10   20   30   40   50   60   70   80   90   100%
+
## [----|----|----|----|----|----|----|----|----|----|
+
## **************************************************|
+## 10:42:04 Writing NN index file to temp file /tmp/RtmpRlAiI7/filed09935ef3302
+## 10:42:04 Searching Annoy index using 1 thread, search_k = 3000
+## 10:42:09 Annoy recall = 100%
+## 10:42:10 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 10:42:11 Initializing from normalized Laplacian + noise (using RSpectra)
+## 10:42:11 Commencing optimization for 200 epochs, with 755922 positive edges
+## 10:42:33 Optimization finished
+
# note that you can set `label = TRUE` or use the LabelClusters function to help label
+# individual clusters
+DimPlot(pbmc.f, reduction = "umap", label = TRUE)
+

+
pbmc.f <- RunTSNE(pbmc.f, reduction = "pca", dims = 1:15)
+DimPlot(pbmc.f, reduction = "tsne", label = TRUE)
+

+

Lets check our metadata now of the seurat object to see what has been +added.

+
head(pbmc.f@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
+## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
+## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
+## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
+## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
+## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
+##                    RNA_snn_res.0.5 seurat_clusters
+## AAACCTGAGAAACCAT-1               4               4
+## AAACCTGAGAAGGACA-1               1               1
+## AAACCTGAGAATCTCC-1               9               9
+## AAACCTGAGAATGTGT-1               1               1
+## AAACCTGAGACGCACA-1               7               7
+## AAACCTGAGCTGAAAT-1               0               0
+
#We now have seurat clusters and RNA_snn_res.0.5 columns added. 
+
+
+

Finding differentially expressed features (cluster biomarkers)

+
# find markers for every cluster compared to all remaining cells, report only the positive
+# ones
+markers_f <- FindAllMarkers(pbmc.f, only.pos = TRUE)
+
## Calculating cluster 0
+
## Calculating cluster 1
+
## Calculating cluster 2
+
## Calculating cluster 3
+
## Calculating cluster 4
+
## Calculating cluster 5
+
## Calculating cluster 6
+
## Calculating cluster 7
+
## Calculating cluster 8
+
## Calculating cluster 9
+
## Calculating cluster 10
+
## Calculating cluster 11
+
## Calculating cluster 12
+

VlnPlot will display the differential expression across the clusters. +For example, I am looking here at CD8A and CD4 expression in the +clusters.

+
VlnPlot(pbmc.f, features = c("CD8A", "CD4"))
+

+

The raw counts can also be shown instead by adding some +parameters.

+
VlnPlot(pbmc.f, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)
+

+
FeaturePlot(pbmc.f, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
+    "CD8A"))
+

+
+

Combine Seurat object with demuxlet output.

+
+
+
+

First, load and edit the .best demuxlet output file to make it more +compatible.

+
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
+
+library(patchwork)
+library(tidyr)
+f_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aF.best", header = T, stringsAsFactors = F, check.names = F)
+head(f_demuxlet)
+
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP                    BEST
+## 1 AAACCTGAGAAACCAT-1   20230    2729    2289  1191 SNG-GSA8_0_NYUMD0334-01
+## 2 AAACCTGAGAAGGACA-1   28220    3441    2915  1485 SNG-GSA8_0_NYUMD0289-01
+## 3 AAACCTGAGAATCTCC-1   43192    6406    5064  2716 SNG-GSA8_0_NYUMD0327-01
+## 4 AAACCTGAGAATGTGT-1   23164    3096    2623  1317 SNG-GSA8_0_NYUMD0334-01
+## 5 AAACCTGAGACGCACA-1   80482   12257   10037  4265 SNG-GSA8_0_NYUMD0334-01
+## 6 AAACCTGAGCTGAAAT-1   18804    2299    1930  1096 SNG-GSA8_0_NYUMD0289-01
+##               SNG.1ST   SNG.LLK1             SNG.2ND  SNG.LLK2   SNG.LLK0
+## 1 GSA8_0_NYUMD0334-01  -436.0108 GSA8_0_NYUMD0289-01 -1144.530  -700.9854
+## 2 GSA8_0_NYUMD0289-01  -551.2455 GSA8_0_NYUMD0327-01 -1348.805  -849.1844
+## 3 GSA8_0_NYUMD0327-01  -978.9799 GSA8_0_NYUMD0334-01 -2477.158 -1538.9178
+## 4 GSA8_0_NYUMD0334-01  -501.0333 GSA8_0_NYUMD0315-01 -1271.823  -786.8994
+## 5 GSA8_0_NYUMD0334-01 -1514.3772 GSA8_0_NYUMD0289-01 -3999.847 -2477.8828
+## 6 GSA8_0_NYUMD0289-01  -422.6331 GSA8_0_NYUMD0334-01 -1018.341  -640.8195
+##               DBL.1ST             DBL.2ND ALPHA      LLK12       LLK1
+## 1 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5  -650.5111 -1179.6639
+## 2 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0327-01   0.5  -798.6921  -551.2455
+## 3 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5 -1423.6142  -978.9799
+## 4 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5  -735.1817 -1273.6766
+## 5 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5 -2273.7404 -3999.8471
+## 6 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5  -599.9499  -422.6331
+##         LLK2      LLK10      LLK20      LLK00   PRB.DBL PRB.SNG1
+## 1  -436.0108 -1007.9044  -658.1725  -717.9619  2.33e-94        1
+## 2 -1348.8051  -555.0604  -798.6921  -875.2910 1.14e-108        1
+## 3 -2477.1579 -1464.5035 -2200.8787 -1583.0830 2.63e-194        1
+## 4  -501.0333 -1136.8613  -738.3541  -809.4175 8.21e-103        1
+## 5 -1514.3772 -4005.0462 -2273.7404 -2537.6268  0.00e+00        1
+## 6 -1018.3409  -424.8764  -599.9499  -659.8594  3.29e-78        1
+

To edit: I will split the Best column into multiple columns.

+
f_demuxlet_edit = f_demuxlet %>% 
+  mutate(BEST = gsub("-01","", BEST)) %>%
+  separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
+  separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
+  separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
+  select(-contains("garbage"))
+
## Warning: Expected 3 pieces. Additional pieces discarded in 2522 rows [13, 18,
+## 20, 25, 27, 29, 35, 48, 63, 66, 69, 71, 79, 82, 100, 102, 103, 107, 116, 124,
+## ...].
+
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 15896 rows [1,
+## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, ...].
+
head(f_demuxlet_edit)
+
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
+## 1 AAACCTGAGAAACCAT-1   20230    2729    2289  1191                       SNG
+## 2 AAACCTGAGAAGGACA-1   28220    3441    2915  1485                       SNG
+## 3 AAACCTGAGAATCTCC-1   43192    6406    5064  2716                       SNG
+## 4 AAACCTGAGAATGTGT-1   23164    3096    2623  1317                       SNG
+## 5 AAACCTGAGACGCACA-1   80482   12257   10037  4265                       SNG
+## 6 AAACCTGAGCTGAAAT-1   18804    2299    1930  1096                       SNG
+##   DMX_maxID DMX_secondID             SNG.1ST   SNG.LLK1             SNG.2ND
+## 1 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -436.0108 GSA8_0_NYUMD0289-01
+## 2 NYUMD0289         <NA> GSA8_0_NYUMD0289-01  -551.2455 GSA8_0_NYUMD0327-01
+## 3 NYUMD0327         <NA> GSA8_0_NYUMD0327-01  -978.9799 GSA8_0_NYUMD0334-01
+## 4 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -501.0333 GSA8_0_NYUMD0315-01
+## 5 NYUMD0334         <NA> GSA8_0_NYUMD0334-01 -1514.3772 GSA8_0_NYUMD0289-01
+## 6 NYUMD0289         <NA> GSA8_0_NYUMD0289-01  -422.6331 GSA8_0_NYUMD0334-01
+##    SNG.LLK2   SNG.LLK0             DBL.1ST             DBL.2ND ALPHA      LLK12
+## 1 -1144.530  -700.9854 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5  -650.5111
+## 2 -1348.805  -849.1844 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0327-01   0.5  -798.6921
+## 3 -2477.158 -1538.9178 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5 -1423.6142
+## 4 -1271.823  -786.8994 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5  -735.1817
+## 5 -3999.847 -2477.8828 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5 -2273.7404
+## 6 -1018.341  -640.8195 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5  -599.9499
+##         LLK1       LLK2      LLK10      LLK20      LLK00   PRB.DBL PRB.SNG1
+## 1 -1179.6639  -436.0108 -1007.9044  -658.1725  -717.9619  2.33e-94        1
+## 2  -551.2455 -1348.8051  -555.0604  -798.6921  -875.2910 1.14e-108        1
+## 3  -978.9799 -2477.1579 -1464.5035 -2200.8787 -1583.0830 2.63e-194        1
+## 4 -1273.6766  -501.0333 -1136.8613  -738.3541  -809.4175 8.21e-103        1
+## 5 -3999.8471 -1514.3772 -4005.0462 -2273.7404 -2537.6268  0.00e+00        1
+## 6  -422.6331 -1018.3409  -424.8764  -599.9499  -659.8594  3.29e-78        1
+
table(f_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
+
## 
+##   AMB   DBL   SNG 
+##    17  2505 15896
+
table(f_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
+
## 
+## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334 
+##      6750      3559      3609      4500
+
table(f_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
+
##                          DMX_maxID
+## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
+##                       AMB         5         4         4         4
+##                       DBL      1481       610       399        15
+##                       SNG      5264      2945      3206      4481
+
+
+

Next, I will add this edited demuxlet to the Seurat object.

+
f_demuxlet_edit.subset <- f_demuxlet_edit[f_demuxlet_edit$BARCODE %in% colnames(pbmc.f),]
+
+pbmc.f@meta.data <- cbind(pbmc.f@meta.data,f_demuxlet_edit.subset$DMX_maxID,f_demuxlet_edit.subset$DMX_classification.global, f_demuxlet_edit.subset$BARCODE)
+
+head(pbmc.f@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
+## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
+## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
+## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
+## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
+## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
+##                    RNA_snn_res.0.5 seurat_clusters
+## AAACCTGAGAAACCAT-1               4               4
+## AAACCTGAGAAGGACA-1               1               1
+## AAACCTGAGAATCTCC-1               9               9
+## AAACCTGAGAATGTGT-1               1               1
+## AAACCTGAGACGCACA-1               7               7
+## AAACCTGAGCTGAAAT-1               0               0
+##                    f_demuxlet_edit.subset$DMX_maxID
+## AAACCTGAGAAACCAT-1                        NYUMD0334
+## AAACCTGAGAAGGACA-1                        NYUMD0289
+## AAACCTGAGAATCTCC-1                        NYUMD0327
+## AAACCTGAGAATGTGT-1                        NYUMD0334
+## AAACCTGAGACGCACA-1                        NYUMD0334
+## AAACCTGAGCTGAAAT-1                        NYUMD0289
+##                    f_demuxlet_edit.subset$DMX_classification.global
+## AAACCTGAGAAACCAT-1                                              SNG
+## AAACCTGAGAAGGACA-1                                              SNG
+## AAACCTGAGAATCTCC-1                                              SNG
+## AAACCTGAGAATGTGT-1                                              SNG
+## AAACCTGAGACGCACA-1                                              SNG
+## AAACCTGAGCTGAAAT-1                                              SNG
+##                    f_demuxlet_edit.subset$BARCODE
+## AAACCTGAGAAACCAT-1             AAACCTGAGAAACCAT-1
+## AAACCTGAGAAGGACA-1             AAACCTGAGAAGGACA-1
+## AAACCTGAGAATCTCC-1             AAACCTGAGAATCTCC-1
+## AAACCTGAGAATGTGT-1             AAACCTGAGAATGTGT-1
+## AAACCTGAGACGCACA-1             AAACCTGAGACGCACA-1
+## AAACCTGAGCTGAAAT-1             AAACCTGAGCTGAAAT-1
+
+
+

Pre-Processing the new object.

+

The Seurat Object has already been preprocessed in my case, so this +should be clean)

+
pbmc.f[["percent.mt"]] <- PercentageFeatureSet(pbmc.f, pattern = "^MT-")
+
+library(cowplot)
+
## 
+## Attaching package: 'cowplot'
+
## The following object is masked from 'package:patchwork':
+## 
+##     align_plots
+
# look at distribution of metrics by classification
+plot_grid(VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "f_demuxlet_edit.subset$DMX_maxID"))
+

+
VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "f_demuxlet_edit.subset$DMX_classification.global")
+

+

Now, select for MT content under 10% and for nfeatureRNA > 200 to +ensure getting alive cells. I previously did this, so should see no +change in cell numnbers.

+
pbmc.f <- subset(pbmc.f, subset = nFeature_RNA > 200 & percent.mt < 10)
+length(colnames(pbmc.f))
+
## [1] 17726
+
length(rownames(pbmc.f))
+
## [1] 23791
+

Add column to meta data to identify seurat object as Basline +condition. Also rename some columns for clarity purposes.

+
pbmc.f@meta.data$condition <- 'Fibril'
+names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
+names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
+names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$BARCODE"] <- "Barcode"
+head(pbmc.f@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
+## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
+## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
+## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
+## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
+## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
+##                    RNA_snn_res.0.5 seurat_clusters DMX_maxID
+## AAACCTGAGAAACCAT-1               4               4 NYUMD0334
+## AAACCTGAGAAGGACA-1               1               1 NYUMD0289
+## AAACCTGAGAATCTCC-1               9               9 NYUMD0327
+## AAACCTGAGAATGTGT-1               1               1 NYUMD0334
+## AAACCTGAGACGCACA-1               7               7 NYUMD0334
+## AAACCTGAGCTGAAAT-1               0               0 NYUMD0289
+##                    DMX_classification.global            Barcode condition
+## AAACCTGAGAAACCAT-1                       SNG AAACCTGAGAAACCAT-1    Fibril
+## AAACCTGAGAAGGACA-1                       SNG AAACCTGAGAAGGACA-1    Fibril
+## AAACCTGAGAATCTCC-1                       SNG AAACCTGAGAATCTCC-1    Fibril
+## AAACCTGAGAATGTGT-1                       SNG AAACCTGAGAATGTGT-1    Fibril
+## AAACCTGAGACGCACA-1                       SNG AAACCTGAGACGCACA-1    Fibril
+## AAACCTGAGCTGAAAT-1                       SNG AAACCTGAGCTGAAAT-1    Fibril
+
saveRDS(pbmc.f, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.fibril.final.RDS")
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/Monomer_Seurat.RMD b/Monomer_Seurat.RMD index 1ff4df4..deff9e4 100644 --- a/Monomer_Seurat.RMD +++ b/Monomer_Seurat.RMD @@ -16,7 +16,7 @@ knitr::opts_chunk$set(echo = TRUE) .libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths())) library(Seurat) -library(patchwork, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library") +library(patchwork) library(dplyr) ``` @@ -197,7 +197,7 @@ FeaturePlot(pbmc.m, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCG ```{r} .libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths())) -library(patchwork, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.3.0/site-library") +library(patchwork) library(tidyr) m_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aM.best", header = T, stringsAsFactors = F, check.names = F) head(m_demuxlet) @@ -271,36 +271,6 @@ names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$BARCO head(pbmc.m@meta.data) ``` - -## Remove douplets. -Note - It is a good idea to compare doublet determination by dumuxlet and another resource (i.e. DoupletFinder or scDblFinder). However, these packages do not work with SeuratV5, which is what I used to create my SeuratObject. -For now, I am just removing doublets defined by demuxlet. - -```{r} -#Idents(pbmc.m) <- pbmc.m$DMX_classification.global -#pbmc.m_sing <- subset(x = pbmc.m, idents = c("SNG"), invert = F) - -## confirm that there are no doublets left -#table(pbmc.m_sing$DMX_classification.global) -``` - -# Add metadata information. -```{r} -# Add patient data to Seurat object meta data. - -#pt_data <- data.frame(DMX_maxID = c("NYUMD0289", "NYUMD0315", "NYUMD0327", "NYUMD0334"), - #DX = c("PD", "PD", "CO", "CO"), - # Sex = c("Male", "Male", "Male", "Male"), - #Age = c(68, 65, 65, 60)) - -#merged_df <- merge(pbmc.m_sing@meta.data, pt_data, by = "DMX_maxID") - -#pbmc.m_sing@meta.data <- merged_df - -#head(pbmc.m_sing@meta.data) -``` - - ```{r Save as .finalRDS file} saveRDS(pbmc.m, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.monomer.final.RDS") ``` diff --git a/Monomer_Seurat.html b/Monomer_Seurat.html new file mode 100644 index 0000000..fa17bc9 --- /dev/null +++ b/Monomer_Seurat.html @@ -0,0 +1,894 @@ + + + + + + + + + + + + + + + +Parkinson Disease Stimulation Exp - Unstim Samples + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Rmarkdown to pre-process scRNA seq from monomer stimulated +PBMCs.

+
+
+

Create the Seurat Object.

+
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
+
+library(Seurat)
+
## Loading required package: SeuratObject
+
## Loading required package: sp
+
## 'SeuratObject' was built under R 4.2.0 but the current version is
+## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
+## for R may have changed
+
## 
+## Attaching package: 'SeuratObject'
+
## The following object is masked from 'package:base':
+## 
+##     intersect
+
library(patchwork)
+library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/aM/outs/per_sample_outs/aM/count/sample_filtered_feature_bc_matrix")
+
+pbmc.m <- CreateSeuratObject(counts = pbmc.data, project = "pbmc_m", min.cells = 3, min.features = 200)
+pbmc.m
+
## An object of class Seurat 
+## 24658 features across 18951 samples within 1 assay 
+## Active assay: RNA (24658 features, 0 variable features)
+##  1 layer present: counts
+

Lets check how many cells and features we are starting with.

+
length(colnames(pbmc.m)) #number of cells
+
## [1] 18951
+
length(rownames(pbmc.m)) #number of features 
+
## [1] 24658
+
+
+

Quality Control (QC)

+
a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT. 
+b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data. 
+
pbmc.m[["percent.mt"]] <- PercentageFeatureSet(pbmc.m, pattern = "^MT-")
+
+# Show QC metrics for the first 5 cells
+head(pbmc.m@meta.data, 5)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
+## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
+## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
+## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
+## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
+
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
+VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
+

+
plot1 <- FeatureScatter(pbmc.m, feature1 = "nCount_RNA", feature2 = "percent.mt")
+plot2 <- FeatureScatter(pbmc.m, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
+plot1 
+

+
plot2
+

+
# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells. 
+pbmc.m <- subset(pbmc.m, subset = nFeature_RNA > 200 & percent.mt < 10)
+head(pbmc.m@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
+## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
+## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
+## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
+## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
+## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
+
+
+

Normalize the data.

+

The data is normalized based on the feature (number of genes in a +cell) by the total expression. This number is multiplied by 10,000 and +then log transformed. The function to do this is “NormalizeData.” The +values specied below are the default values of this function.

+
pbmc.m <- NormalizeData(pbmc.m, normalization.method = "LogNormalize", scale.factor = 10000)
+
## Normalizing layer: counts
+
+
+

Identification of highly variable features

+
pbmc.m <- FindVariableFeatures(pbmc.m, selection.method = "vst", nfeatures = 2000)
+
## Finding variable features for layer counts
+
# Identify the 10 most highly variable genes
+top10 <- head(VariableFeatures(pbmc.m), 10)
+
+# plot variable features with and without labels
+plot1 <- VariableFeaturePlot(pbmc.m)
+plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
+plot1 
+

+
plot2
+

+## Scaling the Data The data is scaled by doing a linear transformation. +The ScaleData function does this by shifting the expression of genes so +that the mean expression becomes 0 and the variance is 1. By default, +only variable genes are scaled. This is changed by features = +all.genes

+
##Scaling RNA data, we only scale the variable features here for efficiency
+all.genes <- rownames(pbmc.m)
+pbmc.m <- ScaleData(pbmc.m, vars.to.regress = c("percent.mt"))
+
## Regressing out percent.mt
+
## Centering and scaling data matrix
+
+
+

Perform linear dimensional reduction

+

For the first principal components, RunPCA shows the most positive +(correlated) and most negative (anticorrelated) genes

+
pbmc.m <- RunPCA(pbmc.m, features = VariableFeatures(object = pbmc.m))
+
## PC_ 1 
+## Positive:  IL7R, FP236383.3, TCF7, LTB, AQP3, CD27, PIM2, JAML, CTSS, CCR7 
+##     C1orf162, NCDN, FP671120.4, SAT1, IGFLR1, TIMP1, SNED1, FTL, SESN3, MYC 
+##     CCR4, PASK, LINC00861, AC139720.1, CYSLTR1, BEX3, MAL, FAAH2, ACP5, HERPUD1 
+## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1, CDK1, TOP2A, NUSAP1, UBE2C, KIFC1 
+##     PCLAF, ZWINT, ASF1B, TUBB, KNL1, CENPF, ASPM, CCNA2, TPX2, HIST1H1B 
+##     PKMYT1, GTSE1, H2AFX, GGH, SPC25, CDCA2, CDCA8, CKAP2L, HJURP, AURKB 
+## PC_ 2 
+## Positive:  CTSW, PRF1, KLRD1, NKG7, GZMB, TYROBP, CCL4, CCL3, KLRC1, TRDC 
+##     FCER1G, PLEK, CST7, IL2RB, AOAH, CCL5, FCGR3A, NCAM1, GNLY, KIR2DL4 
+##     GZMA, SRGN, EOMES, TOX, KLRK1, MATK, CD300A, FGR, GSTP1, GZMK 
+## Negative:  IL7R, TIMP1, AQP3, S100A6, ANP32B, SNED1, COTL1, RGCC, MYC, IL9R 
+##     STAT1, TCF7, PDE3B, SELL, MAL, WDR86-AS1, SOS1, HSF4, INPP4B, LMNA 
+##     NME4, FHIT, BEX3, CCR4, CRIP1, KLF3, PRDX1, IMPDH2, LTB, HMGA1 
+## PC_ 3 
+## Positive:  PLK1, CCNB1, KIF20A, DLGAP5, CENPA, PSRC1, ASPM, CDC20, CCNB2, HMMR 
+##     KIF14, NEK2, CENPF, TROAP, ARL6IP1, CENPE, PIMREG, UBE2C, PIF1, CDCA8 
+##     TOP2A, AURKA, CDCA3, KIF2C, KNSTRN, TPX2, KPNA2, BIRC5, KIF23, GTSE1 
+## Negative:  GINS2, MCM2, MCM6, MCM5, CDC6, MCM7, MCM3, MCM4, PAICS, UNG 
+##     CDC45, HSP90AB1, E2F1, DCTPP1, SLBP, CDCA7, CHAF1B, NME1, DTL, PPP1R14B 
+##     FEN1, HELLS, CCNE2, CDK4, MIF, MSH6, SRM, DUT, C1QBP, POLD2 
+## PC_ 4 
+## Positive:  FCER1G, CCR7, TCF7, TXK, DHRS3, NCAM1, PTGDR, BACH2, SH2D1B, GSTM2 
+##     SYK, KIR2DL4, CXXC5, KLRF1, ID3, RNF130, XCL2, NDFIP2, CYSLTR1, CTBP2 
+##     LDB2, RAMP1, XCL1, TOX2, TYROBP, AFAP1L2, NCR1, ACTN1, ADGRG3, PTMA 
+## Negative:  S100A6, HLA-DPA1, HLA-DRB1, LAG3, CD74, LYAR, LGALS1, HLA-DQA1, HLA-DRA, GZMH 
+##     HLA-DPB1, FGFBP2, TRDV2, LINC02694, HPGD, THEMIS, FLNA, HOPX, AHNAK, MSC 
+##     TRGV9, ANXA2, CCL5, NKG7, CST7, ALOX5AP, HLA-DQB1, CXCR3, LGALS3, LTB 
+## PC_ 5 
+## Positive:  IL9R, LGALS1, TNFRSF18, IL2RA, TNFRSF4, TIMP1, CAPG, SOS1, TNFSF10, ANXA2 
+##     LMNA, IL17RB, GATA3, TXN, GAB2, TPM4, NDFIP2, LAT2, CSF2, ACTG1 
+##     CD82, PRDX1, HSPA5, AHI1, IL3RA, SNED1, RBPJ, CCNB1, NQO1, TBXAS1 
+## Negative:  GZMH, LINC00861, FGFBP2, GZMK, TRDV2, CCR7, C1orf21, TRGV9, KLRC4, TCF7 
+##     PLEK, FAM111B, LYAR, CX3CR1, LINC02446, PCNA, LAG3, CXCR4, PRSS23, MYBL2 
+##     CCL5, MCM7, CTNNAL1, PKMYT1, CLSPN, LIG1, TK1, CDT1, DUT, DTL
+
print(pbmc.m[["pca"]], dims = 1:5, nfeatures = 5)
+
## PC_ 1 
+## Positive:  IL7R, FP236383.3, TCF7, LTB, AQP3 
+## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1 
+## PC_ 2 
+## Positive:  CTSW, PRF1, KLRD1, NKG7, GZMB 
+## Negative:  IL7R, TIMP1, AQP3, S100A6, ANP32B 
+## PC_ 3 
+## Positive:  PLK1, CCNB1, KIF20A, DLGAP5, CENPA 
+## Negative:  GINS2, MCM2, MCM6, MCM5, CDC6 
+## PC_ 4 
+## Positive:  FCER1G, CCR7, TCF7, TXK, DHRS3 
+## Negative:  S100A6, HLA-DPA1, HLA-DRB1, LAG3, CD74 
+## PC_ 5 
+## Positive:  IL9R, LGALS1, TNFRSF18, IL2RA, TNFRSF4 
+## Negative:  GZMH, LINC00861, FGFBP2, GZMK, TRDV2
+

The PCA results can be visualized in different ways.

+
VizDimLoadings(pbmc.m, dims = 1:2, reduction = "pca")
+

+
DimPlot(pbmc.m, reduction = "pca") + NoLegend()
+

+
DimHeatmap(pbmc.m, dims = 1, cells = 500, balanced = TRUE)
+

+
DimHeatmap(pbmc.m, dims = 1:15, cells = 500, balanced = TRUE)
+

+
+
+

Determine the ‘dimensionality’ of the dataset

+

Cells will be clustered based on PCA. How many PC to use is dependent +on many factors. For example, if trying to analyze a rare cell subset, +you might want to add more PCs. Usually, the first 10 is good to see +dimensionality of the data.

+
ElbowPlot(pbmc.m)
+

+
+
+

Cluster the cells.

+
##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
+pbmc.m <- FindNeighbors(pbmc.m, reduction = "pca", dims = 1:15)
+
## Computing nearest neighbor graph
+
## Computing SNN
+
pbmc.m <- FindClusters(pbmc.m, resolution = 0.5, verbose = FALSE)
+
head(Idents(pbmc.m), 5)
+
## AAACCTGAGCGATATA-1 AAACCTGAGCGTAGTG-1 AAACCTGAGGCAGGTT-1 AAACCTGAGTCTTGCA-1 
+##                  2                  2                  1                  2 
+## AAACCTGAGTGCCATT-1 
+##                  9 
+## Levels: 0 1 2 3 4 5 6 7 8 9 10
+
+
+

Run non-linear dimensional reduction (UMAP/tSNE)

+
pbmc.m <- RunUMAP(pbmc.m, dims = 1:15)
+
## 10:56:03 UMAP embedding parameters a = 0.9922 b = 1.112
+
## 10:56:03 Read 18706 rows and found 15 numeric columns
+
## 10:56:03 Using Annoy for neighbor search, n_neighbors = 30
+
## 10:56:03 Building Annoy index with metric = cosine, n_trees = 50
+
## 0%   10   20   30   40   50   60   70   80   90   100%
+
## [----|----|----|----|----|----|----|----|----|----|
+
## **************************************************|
+## 10:56:05 Writing NN index file to temp file /tmp/RtmpLV7FNl/filedf1f675edde4
+## 10:56:05 Searching Annoy index using 1 thread, search_k = 3000
+## 10:56:11 Annoy recall = 100%
+## 10:56:12 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 10:56:13 Initializing from normalized Laplacian + noise (using RSpectra)
+## 10:56:13 Commencing optimization for 200 epochs, with 794186 positive edges
+## 10:56:36 Optimization finished
+
# note that you can set `label = TRUE` or use the LabelClusters function to help label
+# individual clusters
+DimPlot(pbmc.m, reduction = "umap", label = TRUE)
+

+
pbmc.m <- RunTSNE(pbmc.m, reduction = "pca", dims = 1:15)
+DimPlot(pbmc.m, reduction = "tsne", label = TRUE)
+

+

Lets check our metadata now of the seurat object to see what has been +added.

+
head(pbmc.m@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
+## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
+## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
+## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
+## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
+## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
+##                    RNA_snn_res.0.5 seurat_clusters
+## AAACCTGAGCGATATA-1               2               2
+## AAACCTGAGCGTAGTG-1               2               2
+## AAACCTGAGGCAGGTT-1               1               1
+## AAACCTGAGTCTTGCA-1               2               2
+## AAACCTGAGTGCCATT-1               9               9
+## AAACCTGCAACACCTA-1               0               0
+
#We now have seurat clusters and RNA_snn_res.0.5 columns added. 
+
+
+

Finding differentially expressed features (cluster biomarkers)

+
# find markers for every cluster compared to all remaining cells, report only the positive
+# ones
+markers_m <- FindAllMarkers(pbmc.m, only.pos = TRUE)
+
## Calculating cluster 0
+
## Calculating cluster 1
+
## Calculating cluster 2
+
## Calculating cluster 3
+
## Calculating cluster 4
+
## Calculating cluster 5
+
## Calculating cluster 6
+
## Calculating cluster 7
+
## Calculating cluster 8
+
## Calculating cluster 9
+
## Calculating cluster 10
+

VlnPlot will display the differential expression across the clusters. +For example, I am looking here at CD8A and CD4 expression in the +clusters.

+
VlnPlot(pbmc.m, features = c("CD8A", "CD4"))
+

+

The raw counts can also be shown instead by adding some +parameters.

+
VlnPlot(pbmc.m, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)
+

+
FeaturePlot(pbmc.m, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
+    "CD8A"))
+

+
+

Combine Seurat object with demuxlet output.

+
+
+
+

First, load and edit the .best demuxlet output file to make it more +compatible.

+
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
+
+library(patchwork)
+library(tidyr)
+m_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aM.best", header = T, stringsAsFactors = F, check.names = F)
+head(m_demuxlet)
+
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP
+## 1 AAACCTGAGCGATATA-1  108714   15696   12405  5336
+## 2 AAACCTGAGCGTAGTG-1   32770    4639    4030  1659
+## 3 AAACCTGAGGCAGGTT-1   17692    2237    1768  1172
+## 4 AAACCTGAGTCTTGCA-1   50476    6748    5402  2761
+## 5 AAACCTGAGTGCCATT-1   44258    6139    4879  2498
+## 6 AAACCTGCAACACCTA-1    2652     251     207   163
+##                                                BEST             SNG.1ST
+## 1 DBL-GSA8_0_NYUMD0327-01-GSA8_0_NYUMD0334-01-0.500 GSA8_0_NYUMD0334-01
+## 2                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
+## 3                           SNG-GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0327-01
+## 4                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
+## 5                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
+## 6                           SNG-GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0327-01
+##     SNG.LLK1             SNG.2ND   SNG.LLK2   SNG.LLK0             DBL.1ST
+## 1 -3624.4315 GSA8_0_NYUMD0327-01 -4035.0095 -3409.1657 GSA8_0_NYUMD0327-01
+## 2  -592.3739 GSA8_0_NYUMD0289-01 -1650.8367  -986.3261 GSA8_0_NYUMD0289-01
+## 3  -419.2268 GSA8_0_NYUMD0289-01  -965.4518  -628.2189 GSA8_0_NYUMD0315-01
+## 4  -925.1157 GSA8_0_NYUMD0289-01 -2551.6585 -1551.8233 GSA8_0_NYUMD0315-01
+## 5  -860.0781 GSA8_0_NYUMD0334-01 -2364.5970 -1443.7875 GSA8_0_NYUMD0315-01
+## 6   -55.5857 GSA8_0_NYUMD0315-01  -164.0652  -104.5352 GSA8_0_NYUMD0315-01
+##               DBL.2ND ALPHA      LLK12       LLK1       LLK2      LLK10
+## 1 GSA8_0_NYUMD0334-01   0.5 -2581.7203 -4035.0095 -3624.4315 -3643.5020
+## 2 GSA8_0_NYUMD0315-01   0.5  -933.2834 -1650.8367  -592.3739 -1677.4911
+## 3 GSA8_0_NYUMD0327-01   0.5  -564.1035  -994.3223  -419.2268  -840.8888
+## 4 GSA8_0_NYUMD0327-01   0.5 -1447.4708  -925.1157 -2570.0140 -1455.0858
+## 5 GSA8_0_NYUMD0334-01   0.5 -1359.2728  -860.0781 -2364.5970 -1390.1505
+## 6 GSA8_0_NYUMD0327-01   0.5   -82.1285  -164.0652   -55.5857  -141.7246
+##        LLK20      LLK00   PRB.DBL PRB.SNG1
+## 1 -3476.9482 -3089.7874  1.00e+00      NaN
+## 2  -933.2834 -1028.3060 2.94e-149        1
+## 3  -565.5310  -627.3371  4.98e-64        1
+## 4 -2278.0753 -1610.0639 4.65e-228        1
+## 5 -2162.2396 -1504.2012 5.32e-218        1
+## 6   -86.4408  -103.7608  1.00e-12        1
+

To edit: I will split the Best column into multiple columns.

+
m_demuxlet_edit = m_demuxlet %>% 
+  mutate(BEST = gsub("-01","", BEST)) %>%
+  separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
+  separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
+  separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
+  select(-contains("garbage"))
+
## Warning: Expected 3 pieces. Additional pieces discarded in 2409 rows [1, 8, 13,
+## 16, 17, 20, 43, 48, 63, 69, 73, 81, 95, 112, 114, 129, 136, 141, 172, 202,
+## ...].
+
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 16725 rows [2,
+## 3, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 18, 19, 21, 22, 23, 24, 25, 26, ...].
+
head(m_demuxlet_edit)
+
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
+## 1 AAACCTGAGCGATATA-1  108714   15696   12405  5336                       DBL
+## 2 AAACCTGAGCGTAGTG-1   32770    4639    4030  1659                       SNG
+## 3 AAACCTGAGGCAGGTT-1   17692    2237    1768  1172                       SNG
+## 4 AAACCTGAGTCTTGCA-1   50476    6748    5402  2761                       SNG
+## 5 AAACCTGAGTGCCATT-1   44258    6139    4879  2498                       SNG
+## 6 AAACCTGCAACACCTA-1    2652     251     207   163                       SNG
+##   DMX_maxID DMX_secondID             SNG.1ST   SNG.LLK1             SNG.2ND
+## 1 NYUMD0327    NYUMD0334 GSA8_0_NYUMD0334-01 -3624.4315 GSA8_0_NYUMD0327-01
+## 2 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -592.3739 GSA8_0_NYUMD0289-01
+## 3 NYUMD0327         <NA> GSA8_0_NYUMD0327-01  -419.2268 GSA8_0_NYUMD0289-01
+## 4 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -925.1157 GSA8_0_NYUMD0289-01
+## 5 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -860.0781 GSA8_0_NYUMD0334-01
+## 6 NYUMD0327         <NA> GSA8_0_NYUMD0327-01   -55.5857 GSA8_0_NYUMD0315-01
+##     SNG.LLK2   SNG.LLK0             DBL.1ST             DBL.2ND ALPHA
+## 1 -4035.0095 -3409.1657 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5
+## 2 -1650.8367  -986.3261 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01   0.5
+## 3  -965.4518  -628.2189 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
+## 4 -2551.6585 -1551.8233 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
+## 5 -2364.5970 -1443.7875 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5
+## 6  -164.0652  -104.5352 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
+##        LLK12       LLK1       LLK2      LLK10      LLK20      LLK00   PRB.DBL
+## 1 -2581.7203 -4035.0095 -3624.4315 -3643.5020 -3476.9482 -3089.7874  1.00e+00
+## 2  -933.2834 -1650.8367  -592.3739 -1677.4911  -933.2834 -1028.3060 2.94e-149
+## 3  -564.1035  -994.3223  -419.2268  -840.8888  -565.5310  -627.3371  4.98e-64
+## 4 -1447.4708  -925.1157 -2570.0140 -1455.0858 -2278.0753 -1610.0639 4.65e-228
+## 5 -1359.2728  -860.0781 -2364.5970 -1390.1505 -2162.2396 -1504.2012 5.32e-218
+## 6   -82.1285  -164.0652   -55.5857  -141.7246   -86.4408  -103.7608  1.00e-12
+##   PRB.SNG1
+## 1      NaN
+## 2        1
+## 3        1
+## 4        1
+## 5        1
+## 6        1
+
table(m_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
+
## 
+##   AMB   DBL   SNG 
+##    10  2399 16725
+
table(m_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
+
## 
+## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334 
+##      4972      4523      6209      3430
+
table(m_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
+
##                          DMX_maxID
+## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
+##                       AMB         2         2         3         3
+##                       DBL      1134       787       458        20
+##                       SNG      3836      3734      5748      3407
+
+
+

Next, I will add this edited demuxlet to the Seurat object.

+
m_demuxlet_edit.subset <- m_demuxlet_edit[m_demuxlet_edit$BARCODE %in% colnames(pbmc.m),]
+
+pbmc.m@meta.data <- cbind(pbmc.m@meta.data,m_demuxlet_edit.subset$DMX_maxID,m_demuxlet_edit.subset$DMX_classification.global, m_demuxlet_edit.subset$BARCODE)
+
+head(pbmc.m@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
+## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
+## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
+## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
+## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
+## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
+##                    RNA_snn_res.0.5 seurat_clusters
+## AAACCTGAGCGATATA-1               2               2
+## AAACCTGAGCGTAGTG-1               2               2
+## AAACCTGAGGCAGGTT-1               1               1
+## AAACCTGAGTCTTGCA-1               2               2
+## AAACCTGAGTGCCATT-1               9               9
+## AAACCTGCAACACCTA-1               0               0
+##                    m_demuxlet_edit.subset$DMX_maxID
+## AAACCTGAGCGATATA-1                        NYUMD0327
+## AAACCTGAGCGTAGTG-1                        NYUMD0315
+## AAACCTGAGGCAGGTT-1                        NYUMD0327
+## AAACCTGAGTCTTGCA-1                        NYUMD0315
+## AAACCTGAGTGCCATT-1                        NYUMD0315
+## AAACCTGCAACACCTA-1                        NYUMD0327
+##                    m_demuxlet_edit.subset$DMX_classification.global
+## AAACCTGAGCGATATA-1                                              DBL
+## AAACCTGAGCGTAGTG-1                                              SNG
+## AAACCTGAGGCAGGTT-1                                              SNG
+## AAACCTGAGTCTTGCA-1                                              SNG
+## AAACCTGAGTGCCATT-1                                              SNG
+## AAACCTGCAACACCTA-1                                              SNG
+##                    m_demuxlet_edit.subset$BARCODE
+## AAACCTGAGCGATATA-1             AAACCTGAGCGATATA-1
+## AAACCTGAGCGTAGTG-1             AAACCTGAGCGTAGTG-1
+## AAACCTGAGGCAGGTT-1             AAACCTGAGGCAGGTT-1
+## AAACCTGAGTCTTGCA-1             AAACCTGAGTCTTGCA-1
+## AAACCTGAGTGCCATT-1             AAACCTGAGTGCCATT-1
+## AAACCTGCAACACCTA-1             AAACCTGCAACACCTA-1
+
+
+

Pre-Processing the new object.

+

The Seurat Object has already been preprocessed in my case, so this +should be clean)

+
pbmc.m[["percent.mt"]] <- PercentageFeatureSet(pbmc.m, pattern = "^MT-")
+
+library(cowplot)
+
## 
+## Attaching package: 'cowplot'
+
## The following object is masked from 'package:patchwork':
+## 
+##     align_plots
+
# look at distribution of metrics by classification
+plot_grid(VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "m_demuxlet_edit.subset$DMX_maxID"))
+

+
VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "m_demuxlet_edit.subset$DMX_classification.global")
+

+

Now, select for MT content under 10% and for nfeatureRNA > 200 to +ensure getting alive cells. I previously did this, so should see no +change in cell numnbers.

+
pbmc.m <- subset(pbmc.m, subset = nFeature_RNA > 200 & percent.mt < 10)
+length(colnames(pbmc.m))
+
## [1] 18706
+
length(rownames(pbmc.m))
+
## [1] 24658
+

Add column to meta data to identify seurat object as Basline +condition. Also rename some columns for clarity purposes.

+
pbmc.m@meta.data$condition <- 'Monomer'
+names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
+names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
+names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$BARCODE"] <- "Barcode"
+head(pbmc.m@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
+## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
+## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
+## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
+## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
+## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
+##                    RNA_snn_res.0.5 seurat_clusters DMX_maxID
+## AAACCTGAGCGATATA-1               2               2 NYUMD0327
+## AAACCTGAGCGTAGTG-1               2               2 NYUMD0315
+## AAACCTGAGGCAGGTT-1               1               1 NYUMD0327
+## AAACCTGAGTCTTGCA-1               2               2 NYUMD0315
+## AAACCTGAGTGCCATT-1               9               9 NYUMD0315
+## AAACCTGCAACACCTA-1               0               0 NYUMD0327
+##                    DMX_classification.global            Barcode condition
+## AAACCTGAGCGATATA-1                       DBL AAACCTGAGCGATATA-1   Monomer
+## AAACCTGAGCGTAGTG-1                       SNG AAACCTGAGCGTAGTG-1   Monomer
+## AAACCTGAGGCAGGTT-1                       SNG AAACCTGAGGCAGGTT-1   Monomer
+## AAACCTGAGTCTTGCA-1                       SNG AAACCTGAGTCTTGCA-1   Monomer
+## AAACCTGAGTGCCATT-1                       SNG AAACCTGAGTGCCATT-1   Monomer
+## AAACCTGCAACACCTA-1                       SNG AAACCTGCAACACCTA-1   Monomer
+
saveRDS(pbmc.m, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.monomer.final.RDS")
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/Unstim_Seurat.RMD b/Unstim_Seurat.RMD index b3c2177..688ea88 100644 --- a/Unstim_Seurat.RMD +++ b/Unstim_Seurat.RMD @@ -15,8 +15,9 @@ knitr::opts_chunk$set(echo = TRUE) .libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths())) -library(Seurat) -library(patchwork, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library") +library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib") +library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib") +library(patchwork) library(dplyr) ``` @@ -107,7 +108,7 @@ VizDimLoadings(pbmc.unstim, dims = 1:2, reduction = "pca") ``` ```{r PC2 vs PC1 scatter plot, warning=FALSE} -DimPlot(pbmc.unstim, reduction = "pca") + NoLegend() +DimPlot(pbmc.unstim, reduction = "pca") ``` ```{r Heatmap of PC1, warning=FALSE} @@ -196,7 +197,6 @@ FeaturePlot(pbmc.unstim, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", ```{r} .libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths())) -library(patchwork, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.3.0/site-library") library(tidyr) u_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/Unstim.best", header = T, stringsAsFactors = F, check.names = F) head(u_demuxlet) @@ -270,36 +270,6 @@ names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.su head(pbmc.unstim@meta.data) ``` - -## Remove douplets. -Note - It is a good idea to compare doublet determination by dumuxlet and another resource (i.e. DoupletFinder or scDblFinder). However, these packages do not work with SeuratV5, which is what I used to create my SeuratObject. -For now, I am just removing doublets defined by demuxlet. - -```{r} -#Idents(pbmc.unstim) <- pbmc.unstim$DMX_classification.global -#pbmc.unstim_sing <- subset(x = pbmc.unstim, idents = c("SNG"), invert = F) - -## confirm that there are no doublets left -#table(pbmc.unstim_sing$DMX_classification.global) -``` - -# Add metadata information. -```{r} -# Add patient data to Seurat object meta data. - -#pt_data <- data.frame(DMX_maxID = c("NYUMD0289", "NYUMD0315", "NYUMD0327", "NYUMD0334"), - #DX = c("PD", "PD", "CO", "CO"), - # Sex = c("Male", "Male", "Male", "Male"), - #Age = c(68, 65, 65, 60)) - -#merged_df <- merge(pbmc.unstim_sing@meta.data, pt_data, by = "DMX_maxID") - -#pbmc.unstim_sing@meta.data <- merged_df - -#head(pbmc.unstim_sing@meta.data) -``` - - ```{r Save as .finalRDS file} saveRDS(pbmc.unstim, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.unstim.final.RDS") ``` diff --git a/Unstim_Seurat.html b/Unstim_Seurat.html new file mode 100644 index 0000000..0dd0a09 --- /dev/null +++ b/Unstim_Seurat.html @@ -0,0 +1,896 @@ + + + + + + + + + + + + + + + +Parkinson Disease Stimulation Exp - Unstim Samples + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Rmarkdown to pre-process scRNA seq from baseline (aka unstimated) +PBMCs.

+
+
+

Create the Seurat Object.

+
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
+
+library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib")
+library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib")
+
## Loading required package: SeuratObject
+
## Loading required package: sp
+
## 'SeuratObject' was built under R 4.2.0 but the current version is
+## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
+## for R may have changed
+
## 'SeuratObject' was built with package 'Matrix' 1.6.4 but the current
+## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
+## the ABI for 'Matrix' may have changed
+
## 
+## Attaching package: 'SeuratObject'
+
## The following object is masked from 'package:base':
+## 
+##     intersect
+
library(patchwork)
+library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/Unstim/outs/per_sample_outs/Unstim/count/sample_filtered_feature_bc_matrix")
+pbmc.unstim <- CreateSeuratObject(counts = pbmc.data, project = "pbmc_sc", min.cells = 3, min.features = 200)
+pbmc.unstim
+
## An object of class Seurat 
+## 24998 features across 21114 samples within 1 assay 
+## Active assay: RNA (24998 features, 0 variable features)
+##  1 layer present: counts
+

Lets check how many cells and features we are starting with.

+
length(colnames(pbmc.unstim)) #number of cells
+
## [1] 21114
+
length(rownames(pbmc.unstim)) #number of features 
+
## [1] 24998
+
+
+

Quality Control (QC)

+
a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT. 
+b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data. 
+
pbmc.unstim[["percent.mt"]] <- PercentageFeatureSet(pbmc.unstim, pattern = "^MT-")
+
+# Show QC metrics for the first 5 cells
+head(pbmc.unstim@meta.data, 5)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
+## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
+## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
+## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
+## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
+
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
+VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
+

+
plot1 <- FeatureScatter(pbmc.unstim, feature1 = "nCount_RNA", feature2 = "percent.mt")
+plot2 <- FeatureScatter(pbmc.unstim, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
+plot1 
+

+
plot2
+

+
# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells. 
+pbmc.unstim <- subset(pbmc.unstim, subset = nFeature_RNA > 200 & percent.mt < 10)
+head(pbmc.unstim@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
+## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
+## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
+## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
+## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
+## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
+
+
+

Normalize the data.

+

The data is normalized based on the feature (number of genes in a +cell) by the total expression. This number is multiplied by 10,000 and +then log transformed. The function to do this is “NormalizeData.” The +values specied below are the default values of this function.

+
pbmc.unstim <- NormalizeData(pbmc.unstim, normalization.method = "LogNormalize", scale.factor = 10000)
+
## Normalizing layer: counts
+
+
+

Identification of highly variable features

+
pbmc.unstim <- FindVariableFeatures(pbmc.unstim, selection.method = "vst", nfeatures = 2000)
+
## Finding variable features for layer counts
+
# Identify the 10 most highly variable genes
+top10 <- head(VariableFeatures(pbmc.unstim), 10)
+
+# plot variable features with and without labels
+plot1 <- VariableFeaturePlot(pbmc.unstim)
+plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
+plot1 
+

+
plot2
+

+## Scaling the Data The data is scaled by doing a linear transformation. +The ScaleData function does this by shifting the expression of genes so +that the mean expression becomes 0 and the variance is 1. By default, +only variable genes are scaled. This is changed by features = +all.genes

+
##Scaling RNA data, we only scale the variable features here for efficiency
+all.genes <- rownames(pbmc.unstim)
+pbmc.unstim <- ScaleData(pbmc.unstim, vars.to.regress = c("percent.mt"))
+
## Regressing out percent.mt
+
## Centering and scaling data matrix
+
+
+

Perform linear dimensional reduction

+

For the first principal components, RunPCA shows the most positive +(correlated) and most negative (anticorrelated) genes

+
pbmc.unstim <- RunPCA(pbmc.unstim, features = VariableFeatures(object = pbmc.unstim))
+
## PC_ 1 
+## Positive:  IL7R, FP236383.3, SQSTM1, LTB, TCF7, AQP3, JAML, FP671120.4, HERPUD1, PIM2 
+##     CD27, TIMP1, CCR7, TNFSF8, NCDN, CD40LG, BEX3, C1orf162, LINC00861, SNED1 
+##     CCR4, IGFLR1, TRBV6-2, RCBTB2, MYC, TRAV8-3, FAAH2, WDR86-AS1, HSF4, LRRN3 
+## Negative:  RRM2, MKI67, TYMS, TUBA1B, TOP2A, STMN1, UBE2C, PCLAF, CDK1, KIFC1 
+##     ASF1B, NUSAP1, ZWINT, CCNA2, CENPF, ASPM, KNL1, HIST1H1B, GTSE1, TPX2 
+##     SPC25, PKMYT1, TUBB, AURKB, CDCA8, CDCA2, BIRC5, CKAP2L, HJURP, GGH 
+## PC_ 2 
+## Positive:  KLRD1, TYROBP, CTSW, FCER1G, PRF1, GZMB, NKG7, CCL3, CCL4, KLRC1 
+##     IL2RB, SH2D1B, FCGR3A, PLEK, KLRC2, NCAM1, CST7, EOMES, KIR2DL4, AOAH 
+##     CCL5, TRDC, GNLY, GZMA, NCR1, FASLG, TOX, SLAMF7, CD38, KLRK1 
+## Negative:  IL7R, TIMP1, S100A6, AQP3, IL32, COTL1, S100A4, SNED1, WDR86-AS1, ANP32B 
+##     STAT1, IL9R, TUBA4A, ITGB1, MYC, PLK1, CCNB2, LMNA, DLGAP5, CENPA 
+##     SOS1, TUBA1C, HMMR, TCF7, PLP2, CRIP1, CEP55, CENPF, CDCA3, PRDX1 
+## PC_ 3 
+## Positive:  GINS2, MCM2, CDC6, MCM6, MCM5, MCM7, MCM4, CDC45, E2F1, MCM3 
+##     DTL, PAICS, UNG, FAM111B, FEN1, UHRF1, SLBP, HSP90AB1, CCNE2, PCNA 
+##     HELLS, CHAF1B, CDCA7, SLC29A1, MCM10, CLSPN, NME1, MIF, DUT, CTNNAL1 
+## Negative:  PLK1, CCNB1, KIF20A, CDC20, CENPA, NEK2, HMMR, CCNB2, DLGAP5, PSRC1 
+##     ASPM, KIF14, TROAP, CENPF, CENPE, PIF1, AURKA, PIMREG, ARL6IP1, KIF2C 
+##     UBALD2, PRR11, KNSTRN, GZMK, DEPDC1, CDCA8, UBE2C, KPNA2, TPX2, BIRC5 
+## PC_ 4 
+## Positive:  CD79A, JCHAIN, POU2AF1, IGHM, BASP1, BLNK, MEF2C, SYK, TSPAN33, TCF4 
+##     RALGPS2, IGLC2, SPI1, RHEX, CD40, IGHA1, BANK1, NCF1, CDK14, CD19 
+##     LINC00926, FCRL5, LINC02362, BTK, TNFRSF17, AC012236.1, IGKC, SLC43A2, RASGRP3, IGLC3 
+## Negative:  S100A4, CCL5, LTB, S100A6, IL32, GZMA, PFN1, CST7, HOPX, NKG7 
+##     CD8A, CD8B, ALOX5AP, GZMM, FGFBP2, THEMIS, LYAR, GZMK, ACTB, PRF1 
+##     FLNA, TMSB10, GZMB, LAG3, CFL1, SAMD3, AHNAK, CXCR3, ANXA2, NCR3 
+## PC_ 5 
+## Positive:  LGALS1, S100A4, S100A6, IL32, ANXA2, TXN, HLA-DPA1, GAPDH, TMSB10, HLA-DRB1 
+##     PFN1, CD74, ACTB, HLA-DQA1, HLA-DRA, CFL1, HOPX, PRDX1, HLA-DPB1, ACTG1 
+##     ENO1, FTL, AHNAK, LINC02694, LGALS3, VIM, COTL1, JPT1, LAG3, FLNA 
+## Negative:  TCF7, CCR7, TXK, PLAC8, FP236383.3, FCER1G, CXCR4, PTMA, ID3, NCAM1 
+##     TYROBP, HIST1H1D, PTGDR, SELL, BACH2, C1orf162, SH2D1B, SESN3, KCNQ5, FHIT 
+##     CTBP2, FAM111B, KLRF1, E2F7, RNF130, AC139720.1, EXO1, MYBL2, E2F8, ESCO2
+
print(pbmc.unstim[["pca"]], dims = 1:5, nfeatures = 5)
+
## PC_ 1 
+## Positive:  IL7R, FP236383.3, SQSTM1, LTB, TCF7 
+## Negative:  RRM2, MKI67, TYMS, TUBA1B, TOP2A 
+## PC_ 2 
+## Positive:  KLRD1, TYROBP, CTSW, FCER1G, PRF1 
+## Negative:  IL7R, TIMP1, S100A6, AQP3, IL32 
+## PC_ 3 
+## Positive:  GINS2, MCM2, CDC6, MCM6, MCM5 
+## Negative:  PLK1, CCNB1, KIF20A, CDC20, CENPA 
+## PC_ 4 
+## Positive:  CD79A, JCHAIN, POU2AF1, IGHM, BASP1 
+## Negative:  S100A4, CCL5, LTB, S100A6, IL32 
+## PC_ 5 
+## Positive:  LGALS1, S100A4, S100A6, IL32, ANXA2 
+## Negative:  TCF7, CCR7, TXK, PLAC8, FP236383.3
+

The PCA results can be visualized in different ways.

+
VizDimLoadings(pbmc.unstim, dims = 1:2, reduction = "pca")
+

+
DimPlot(pbmc.unstim, reduction = "pca")
+

+
DimHeatmap(pbmc.unstim, dims = 1, cells = 500, balanced = TRUE)
+

+
DimHeatmap(pbmc.unstim, dims = 1:15, cells = 500, balanced = TRUE)
+

+
+
+

Determine the ‘dimensionality’ of the dataset

+

Cells will be clustered based on PCA. How many PC to use is dependent +on many factors. For example, if trying to analyze a rare cell subset, +you might want to add more PCs. Usually, the first 10 is good to see +dimensionality of the data.

+
ElbowPlot(pbmc.unstim)
+

+
+
+

Cluster the cells.

+
##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
+pbmc.unstim <- FindNeighbors(pbmc.unstim, reduction = "pca", dims = 1:15)
+
## Computing nearest neighbor graph
+
## Computing SNN
+
pbmc.unstim <- FindClusters(pbmc.unstim, resolution = 0.5, verbose = FALSE)
+
head(Idents(pbmc.unstim), 5)
+
## AAACCTGAGAAACGCC-1 AAACCTGAGAAGGGTA-1 AAACCTGAGACCTAGG-1 AAACCTGAGATAGTCA-1 
+##                 10                  2                  6                  0 
+## AAACCTGAGCAAATCA-1 
+##                  1 
+## Levels: 0 1 2 3 4 5 6 7 8 9 10 11
+
+
+

Run non-linear dimensional reduction (UMAP/tSNE)

+
pbmc.unstim <- RunUMAP(pbmc.unstim, dims = 1:15)
+
## 10:15:59 UMAP embedding parameters a = 0.9922 b = 1.112
+
## 10:15:59 Read 20924 rows and found 15 numeric columns
+
## 10:15:59 Using Annoy for neighbor search, n_neighbors = 30
+
## 10:15:59 Building Annoy index with metric = cosine, n_trees = 50
+
## 0%   10   20   30   40   50   60   70   80   90   100%
+
## [----|----|----|----|----|----|----|----|----|----|
+
## **************************************************|
+## 10:16:01 Writing NN index file to temp file /tmp/RtmpA4NXVY/filebc682171bd0c
+## 10:16:01 Searching Annoy index using 1 thread, search_k = 3000
+## 10:16:08 Annoy recall = 100%
+## 10:16:09 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 10:16:10 Initializing from normalized Laplacian + noise (using RSpectra)
+## 10:16:10 Commencing optimization for 200 epochs, with 892462 positive edges
+## 10:16:36 Optimization finished
+
# note that you can set `label = TRUE` or use the LabelClusters function to help label
+# individual clusters
+DimPlot(pbmc.unstim, reduction = "umap", label = TRUE)
+

+
pbmc.unstim <- RunTSNE(pbmc.unstim, reduction = "pca", dims = 1:15)
+DimPlot(pbmc.unstim, reduction = "tsne", label = TRUE)
+

+

Lets check our metadata now of the seurat object to see what has been +added.

+
head(pbmc.unstim@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
+## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
+## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
+## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
+## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
+## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
+##                    RNA_snn_res.0.5 seurat_clusters
+## AAACCTGAGAAACGCC-1              10              10
+## AAACCTGAGAAGGGTA-1               2               2
+## AAACCTGAGACCTAGG-1               6               6
+## AAACCTGAGATAGTCA-1               0               0
+## AAACCTGAGCAAATCA-1               1               1
+## AAACCTGAGCAGGCTA-1               3               3
+
#We now have seurat clusters and RNA_snn_res.0.5 columns added. 
+
+
+

Finding differentially expressed features (cluster biomarkers)

+
# find markers for every cluster compared to all remaining cells, report only the positive
+# ones
+markers_unstim <- FindAllMarkers(pbmc.unstim, only.pos = TRUE)
+
## Calculating cluster 0
+
## Calculating cluster 1
+
## Calculating cluster 2
+
## Calculating cluster 3
+
## Calculating cluster 4
+
## Calculating cluster 5
+
## Calculating cluster 6
+
## Calculating cluster 7
+
## Calculating cluster 8
+
## Calculating cluster 9
+
## Calculating cluster 10
+
## Calculating cluster 11
+

VlnPlot will display the differential expression across the clusters. +For example, I am looking here at CD8A and CD4 expression in the +clusters.

+
VlnPlot(pbmc.unstim, features = c("CD8A", "CD4"))
+

+

The raw counts can also be shown instead by adding some +parameters.

+
VlnPlot(pbmc.unstim, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)
+

+
FeaturePlot(pbmc.unstim, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
+    "CD8A"))
+

+
+

Combine Seurat object with demuxlet output.

+
+
+
+

First, load and edit the .best demuxlet output file to make it more +compatible.

+
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
+
+library(tidyr)
+u_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/Unstim.best", header = T, stringsAsFactors = F, check.names = F)
+head(u_demuxlet)
+
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP
+## 1 AAACCTGAGAAACGCC-1   42286   22364   20626  1854
+## 2 AAACCTGAGAAGGGTA-1   35144    4231    3579  1888
+## 3 AAACCTGAGACCTAGG-1    8284    1119     928   608
+## 4 AAACCTGAGATAGTCA-1   31640    3692    3025  1681
+## 5 AAACCTGAGCAAATCA-1   34136    4239    3322  1982
+## 6 AAACCTGAGCAGGCTA-1   93780   12646   10291  4636
+##                                                BEST             SNG.1ST
+## 1                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
+## 2 DBL-GSA8_0_NYUMD0315-01-GSA8_0_NYUMD0327-01-0.500 GSA8_0_NYUMD0315-01
+## 3                           SNG-GSA8_0_NYUMD0334-01 GSA8_0_NYUMD0334-01
+## 4                           SNG-GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0289-01
+## 5                           SNG-GSA8_0_NYUMD0334-01 GSA8_0_NYUMD0334-01
+## 6 DBL-GSA8_0_NYUMD0327-01-GSA8_0_NYUMD0334-01-0.500 GSA8_0_NYUMD0327-01
+##     SNG.LLK1             SNG.2ND   SNG.LLK2   SNG.LLK0             DBL.1ST
+## 1  -632.7918 GSA8_0_NYUMD0289-01 -1794.0976 -1099.8520 GSA8_0_NYUMD0289-01
+## 2 -1267.1772 GSA8_0_NYUMD0327-01 -1376.1579 -1201.1205 GSA8_0_NYUMD0315-01
+## 3  -220.6593 GSA8_0_NYUMD0327-01  -499.5366  -330.9340 GSA8_0_NYUMD0327-01
+## 4  -601.2027 GSA8_0_NYUMD0327-01 -1580.6534  -975.7141 GSA8_0_NYUMD0289-01
+## 5  -695.9992 GSA8_0_NYUMD0315-01 -1791.4493 -1106.4335 GSA8_0_NYUMD0315-01
+## 6 -2584.2428 GSA8_0_NYUMD0334-01 -3897.3665 -2888.3866 GSA8_0_NYUMD0327-01
+##               DBL.2ND ALPHA      LLK12       LLK1       LLK2      LLK10
+## 1 GSA8_0_NYUMD0315-01   0.5 -1024.5246 -1794.0976  -632.7918 -1840.7324
+## 2 GSA8_0_NYUMD0327-01   0.5  -885.9914 -1267.1772 -1376.1579 -1211.0138
+## 3 GSA8_0_NYUMD0334-01   0.5  -301.3090  -499.5366  -220.6593  -458.4461
+## 4 GSA8_0_NYUMD0315-01   0.5  -900.3046  -601.2027 -1603.4487  -610.6523
+## 5 GSA8_0_NYUMD0334-01   0.5  -992.4663 -1791.4493  -695.9992 -1561.3775
+## 6 GSA8_0_NYUMD0334-01   0.5 -2274.3541 -2584.2428 -3897.3665 -2729.2507
+##        LLK20      LLK00   PRB.DBL PRB.SNG1
+## 1 -1024.5246 -1155.0510 2.49e-171        1
+## 2 -1279.7057 -1088.3092  1.00e+00        1
+## 3  -313.0930  -338.7091  3.14e-36        1
+## 4  -900.3046  -996.4076 4.23e-131        1
+## 5 -1019.5818 -1123.0042 5.87e-130        1
+## 6 -3366.3864 -2648.3299  1.00e+00        1
+

To edit: I will split the Best column into multiple columns.

+
u_demuxlet_edit = u_demuxlet %>% 
+  mutate(BEST = gsub("-01","", BEST)) %>%
+  separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
+  separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
+  separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
+  select(-contains("garbage"))
+
## Warning: Expected 3 pieces. Additional pieces discarded in 3082 rows [2, 6, 8,
+## 11, 15, 23, 24, 29, 30, 31, 34, 42, 54, 58, 59, 60, 68, 75, 76, 83, ...].
+
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 18152 rows [1,
+## 3, 4, 5, 7, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, ...].
+
head(u_demuxlet_edit)
+
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
+## 1 AAACCTGAGAAACGCC-1   42286   22364   20626  1854                       SNG
+## 2 AAACCTGAGAAGGGTA-1   35144    4231    3579  1888                       DBL
+## 3 AAACCTGAGACCTAGG-1    8284    1119     928   608                       SNG
+## 4 AAACCTGAGATAGTCA-1   31640    3692    3025  1681                       SNG
+## 5 AAACCTGAGCAAATCA-1   34136    4239    3322  1982                       SNG
+## 6 AAACCTGAGCAGGCTA-1   93780   12646   10291  4636                       DBL
+##   DMX_maxID DMX_secondID             SNG.1ST   SNG.LLK1             SNG.2ND
+## 1 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -632.7918 GSA8_0_NYUMD0289-01
+## 2 NYUMD0315    NYUMD0327 GSA8_0_NYUMD0315-01 -1267.1772 GSA8_0_NYUMD0327-01
+## 3 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -220.6593 GSA8_0_NYUMD0327-01
+## 4 NYUMD0289         <NA> GSA8_0_NYUMD0289-01  -601.2027 GSA8_0_NYUMD0327-01
+## 5 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -695.9992 GSA8_0_NYUMD0315-01
+## 6 NYUMD0327    NYUMD0334 GSA8_0_NYUMD0327-01 -2584.2428 GSA8_0_NYUMD0334-01
+##     SNG.LLK2   SNG.LLK0             DBL.1ST             DBL.2ND ALPHA
+## 1 -1794.0976 -1099.8520 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01   0.5
+## 2 -1376.1579 -1201.1205 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
+## 3  -499.5366  -330.9340 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5
+## 4 -1580.6534  -975.7141 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01   0.5
+## 5 -1791.4493 -1106.4335 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5
+## 6 -3897.3665 -2888.3866 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5
+##        LLK12       LLK1       LLK2      LLK10      LLK20      LLK00   PRB.DBL
+## 1 -1024.5246 -1794.0976  -632.7918 -1840.7324 -1024.5246 -1155.0510 2.49e-171
+## 2  -885.9914 -1267.1772 -1376.1579 -1211.0138 -1279.7057 -1088.3092  1.00e+00
+## 3  -301.3090  -499.5366  -220.6593  -458.4461  -313.0930  -338.7091  3.14e-36
+## 4  -900.3046  -601.2027 -1603.4487  -610.6523  -900.3046  -996.4076 4.23e-131
+## 5  -992.4663 -1791.4493  -695.9992 -1561.3775 -1019.5818 -1123.0042 5.87e-130
+## 6 -2274.3541 -2584.2428 -3897.3665 -2729.2507 -3366.3864 -2648.3299  1.00e+00
+##   PRB.SNG1
+## 1        1
+## 2        1
+## 3        1
+## 4        1
+## 5        1
+## 6        1
+
table(u_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
+
## 
+##   AMB   DBL   SNG 
+##    25  3057 18152
+
table(u_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
+
## 
+## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334 
+##      4904      5661      5475      5194
+
table(u_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
+
##                          DMX_maxID
+## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
+##                       AMB         4         5         5        11
+##                       DBL      1301      1131       608        17
+##                       SNG      3599      4525      4862      5166
+
+
+

Next, I will add this edited demuxlet to the Seurat object.

+
u_demuxlet_edit.subset <- u_demuxlet_edit[u_demuxlet_edit$BARCODE %in% colnames(pbmc.unstim),]
+
+pbmc.unstim@meta.data <- cbind(pbmc.unstim@meta.data,u_demuxlet_edit.subset$DMX_maxID,u_demuxlet_edit.subset$DMX_classification.global, u_demuxlet_edit.subset$BARCODE)
+
+head(pbmc.unstim@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
+## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
+## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
+## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
+## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
+## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
+##                    RNA_snn_res.0.5 seurat_clusters
+## AAACCTGAGAAACGCC-1              10              10
+## AAACCTGAGAAGGGTA-1               2               2
+## AAACCTGAGACCTAGG-1               6               6
+## AAACCTGAGATAGTCA-1               0               0
+## AAACCTGAGCAAATCA-1               1               1
+## AAACCTGAGCAGGCTA-1               3               3
+##                    u_demuxlet_edit.subset$DMX_maxID
+## AAACCTGAGAAACGCC-1                        NYUMD0315
+## AAACCTGAGAAGGGTA-1                        NYUMD0315
+## AAACCTGAGACCTAGG-1                        NYUMD0334
+## AAACCTGAGATAGTCA-1                        NYUMD0289
+## AAACCTGAGCAAATCA-1                        NYUMD0334
+## AAACCTGAGCAGGCTA-1                        NYUMD0327
+##                    u_demuxlet_edit.subset$DMX_classification.global
+## AAACCTGAGAAACGCC-1                                              SNG
+## AAACCTGAGAAGGGTA-1                                              DBL
+## AAACCTGAGACCTAGG-1                                              SNG
+## AAACCTGAGATAGTCA-1                                              SNG
+## AAACCTGAGCAAATCA-1                                              SNG
+## AAACCTGAGCAGGCTA-1                                              DBL
+##                    u_demuxlet_edit.subset$BARCODE
+## AAACCTGAGAAACGCC-1             AAACCTGAGAAACGCC-1
+## AAACCTGAGAAGGGTA-1             AAACCTGAGAAGGGTA-1
+## AAACCTGAGACCTAGG-1             AAACCTGAGACCTAGG-1
+## AAACCTGAGATAGTCA-1             AAACCTGAGATAGTCA-1
+## AAACCTGAGCAAATCA-1             AAACCTGAGCAAATCA-1
+## AAACCTGAGCAGGCTA-1             AAACCTGAGCAGGCTA-1
+
+
+

Pre-Processing the new object.

+

The Seurat Object has already been preprocessed in my case, so this +should be clean)

+
pbmc.unstim[["percent.mt"]] <- PercentageFeatureSet(pbmc.unstim, pattern = "^MT-")
+
+library(cowplot)
+
## 
+## Attaching package: 'cowplot'
+
## The following object is masked from 'package:patchwork':
+## 
+##     align_plots
+
# look at distribution of metrics by classification
+plot_grid(VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "u_demuxlet_edit.subset$DMX_maxID"))
+

+
VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "u_demuxlet_edit.subset$DMX_classification.global")
+

+

Now, select for MT content under 10% and for nfeatureRNA > 200 to +ensure getting alive cells. I previously did this, so should see no +change in cell numnbers.

+
pbmc.unstim <- subset(pbmc.unstim, subset = nFeature_RNA > 200 & percent.mt < 10)
+length(colnames(pbmc.unstim))
+
## [1] 20924
+
length(rownames(pbmc.unstim))
+
## [1] 24998
+

Add column to meta data to identify seurat object as Basline +condition. Also rename some columns for clarity purposes.

+
pbmc.unstim@meta.data$condition <- 'Baseline'
+names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
+names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
+names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$BARCODE"] <- "Barcode"
+head(pbmc.unstim@meta.data)
+
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
+## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
+## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
+## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
+## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
+## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
+## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
+##                    RNA_snn_res.0.5 seurat_clusters DMX_maxID
+## AAACCTGAGAAACGCC-1              10              10 NYUMD0315
+## AAACCTGAGAAGGGTA-1               2               2 NYUMD0315
+## AAACCTGAGACCTAGG-1               6               6 NYUMD0334
+## AAACCTGAGATAGTCA-1               0               0 NYUMD0289
+## AAACCTGAGCAAATCA-1               1               1 NYUMD0334
+## AAACCTGAGCAGGCTA-1               3               3 NYUMD0327
+##                    DMX_classification.global            Barcode condition
+## AAACCTGAGAAACGCC-1                       SNG AAACCTGAGAAACGCC-1  Baseline
+## AAACCTGAGAAGGGTA-1                       DBL AAACCTGAGAAGGGTA-1  Baseline
+## AAACCTGAGACCTAGG-1                       SNG AAACCTGAGACCTAGG-1  Baseline
+## AAACCTGAGATAGTCA-1                       SNG AAACCTGAGATAGTCA-1  Baseline
+## AAACCTGAGCAAATCA-1                       SNG AAACCTGAGCAAATCA-1  Baseline
+## AAACCTGAGCAGGCTA-1                       DBL AAACCTGAGCAGGCTA-1  Baseline
+
saveRDS(pbmc.unstim, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.unstim.final.RDS")
+
+ + + + +
+ + + + + + + + + + + + + + +