Merge pull request #159 from zhanghao-njmu/develop

Develop
zhanghao-njmu · Sep 8, 2023 · 8996664 · 8996664
2 parents c2f6a55 + c267ef9
commit 8996664
Show file tree

Hide file tree

Showing 24 changed files with 243 additions and 200 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -4,7 +4,7 @@ Title: Single Cell Pipeline
 Version: 0.5.1
 Author: Hao Zhang
 Maintainer: Hao Zhang <[email protected]>
-Description: SCP provides a comprehensive set of tools for single cell data processing and downstream analysis.
+Description: An end-to-end Single-Cell Pipeline designed to facilitate comprehensive analysis and exploration of single-cell data.
 License: GPL (>= 3)
 Encoding: UTF-8
 LazyData: True

diff --git a/R/SCP-analysis.R b/R/SCP-analysis.R
diff --git a/R/SCP-app.R b/R/SCP-app.R
@@ -589,6 +589,9 @@ RunSCExplorer <- function(base_dir = "SCExplorer",
   }
 
   main_code <- '
+  if (!file.exists("Rplots.pdf")) {
+      file.create("Rplots.pdf")
+  }
   data_group <- rhdf5::h5ls(DataFile)$group
   meta_group <- rhdf5::h5ls(MetaFile)$group
   group <- intersect(data_group, meta_group)
@@ -1403,11 +1406,7 @@ RunSCExplorer <- function(base_dir = "SCExplorer",
       meta_features_name <- rhdf5::h5read(MetaFile, name = paste0("/", dataset2, "/metadata.stat/asfeatures"))
 
       if (is.null(features2)) {
-        if (is.null(initial_feature)) {
-          features2 <- initial_feature
-        } else {
-          features2 <- meta_features_name[1]
-        }
+        features2 <- initial_feature
       }
       feature_area2 <- gsub(x = unlist(strsplit(feature_area2, "(\\r)|(\\n)", perl = TRUE)), pattern = " ", replacement = "")
       features2 <- c(as.character(features2), as.character(feature_area2))
@@ -1627,11 +1626,7 @@ RunSCExplorer <- function(base_dir = "SCExplorer",
       meta_features_name <- rhdf5::h5read(MetaFile, name = paste0("/", dataset4, "/metadata.stat/asfeatures"))
 
       if (is.null(features4)) {
-        if (is.null(initial_feature)) {
-          features4 <- initial_feature
-        } else {
-          features4 <- meta_features_name[1]
-        }
+        features4 <- initial_feature
       }
       feature_area4 <- gsub(x = unlist(strsplit(feature_area4, "(\\r)|(\\n)", perl = TRUE)), pattern = " ", replacement = "")
       features4 <- c(as.character(features4), as.character(feature_area4))

diff --git a/R/SCP-feature_annotation.R b/R/SCP-feature_annotation.R
@@ -46,7 +46,11 @@ AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol",
       species = species, db = db, db_update = db_update, db_version = db_version, convert_species = convert_species,
       db_IDtypes = IDtype, Ensembl_version = Ensembl_version, mirror = mirror
     )
-    for (single_db in db) {
+    db_notfound <- setdiff(db, names(db_list[[species]]))
+    if (length(db_notfound) > 0) {
+      warning(paste0("The following databases are not found:", paste0(db_notfound, collapse = ",")))
+    }
+    for (single_db in names(db_list[[species]])) {
       TERM2GENE <- unique(db_list[[species]][[single_db]][["TERM2GENE"]])
       TERM2NAME <- unique(db_list[[species]][[single_db]][["TERM2NAME"]])
       rownames(TERM2NAME) <- TERM2NAME[, 1]
@@ -66,7 +70,7 @@ AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol",
         }
         db_sub <- db_df[rownames(db_df) %in% rownames(meta.features), , drop = FALSE]
         if (nrow(db_sub) == 0) {
-          stop(paste0("No db data found in the seurat object. Please check if the species name is correct. The expected feature names are ", paste(head(rownames(db_df), 10), collapse = ","), "."))
+          stop(paste0("No data to append was found in the Seurat object. Please check if the species name is correct. The expected feature names are ", paste(head(rownames(db_df), 10), collapse = ","), "."))
         }
         meta.features <- cbind(meta.features, db_sub[rownames(meta.features), setdiff(colnames(db_sub), colnames(meta.features)), drop = FALSE])
         srt[[assay]]@meta.features <- meta.features

diff --git a/R/utils.R b/R/utils.R
@@ -97,7 +97,8 @@ PrepareEnv <- function(conda = "auto", miniconda_repo = "https://repo.anaconda.c
   }
 
   packages <- c(
-    "numpy==1.21.6", "numba==0.55.2", "scikit-learn==1.1.2", "pandas==1.3.5", "python-igraph==0.10.2", "matplotlib==3.6.3", "palantir==1.0.1",
+    "numpy==1.21.6", "numba==0.55.2", "scikit-learn==1.1.2", "pandas==1.3.5", "python-igraph==0.10.2", "matplotlib==3.6.3",
+    "palantir==1.0.1", "wot==1.0.8.post2",
     "scipy", "versioned-hdf5", "leidenalg", "scanpy", "scvelo"
   )
   check_Python(packages = packages, envname = envname, conda = conda, force = force, ...)

diff --git a/README.Rmd b/README.Rmd
@@ -388,6 +388,8 @@ PAGAPlot(srt = pancreas_sub, reduction = "UMAP", label = TRUE, label_insitu = TR
 
 ### Velocity analysis
 
+> To estimate cell velocity, you need to have both "spliced" and "unspliced" assays in your Seurat object. You can generate these matrices using [velocyto](http://velocyto.org/velocyto.py/index.html), [bustools](https://bustools.github.io/BUS_notebooks_R/velocity.html), or [alevin](https://combine-lab.github.io/alevin-fry-tutorials/2021/alevin-fry-velocity/).
+
 ```{r RunSCVELO}
 pancreas_sub <- RunSCVELO(
   srt = pancreas_sub, group_by = "SubCellType",
@@ -408,11 +410,11 @@ VolcanoPlot(srt = pancreas_sub, group_by = "CellType")
 DEGs <- pancreas_sub@tools$DEtest_CellType$AllMarkers_wilcox
 DEGs <- DEGs[with(DEGs, avg_log2FC > 1 & p_val_adj < 0.05), ]
 # Annotate features with transcription factors and surface proteins
-pancreas_sub <- AnnotateFeatures(pancreas_sub, species = "Mus_musculus", db = c("TF", "SP"))
+pancreas_sub <- AnnotateFeatures(pancreas_sub, species = "Mus_musculus", db = c("TF", "CSPA"))
 ht <- FeatureHeatmap(
   srt = pancreas_sub, group.by = "CellType", features = DEGs$gene, feature_split = DEGs$group1,
   species = "Mus_musculus", db = c("GO_BP", "KEGG", "WikiPathway"), anno_terms = TRUE,
-  feature_annotation = c("TF", "SP"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
+  feature_annotation = c("TF", "CSPA"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
   height = 5, width = 4
 )
 print(ht$plot)
@@ -443,14 +445,15 @@ EnrichmentPlot(
 )
 ```
 
+> To ensure that labels are visible, you can adjust the size of the viewer panel on Rstudio IDE.
+
 ```{r Enrichment_enrichmap, fig.height=9.5,fig.width=15}
 EnrichmentPlot(
   srt = pancreas_sub, group_by = "CellType", group_use = "Ductal",
   plot_type = "enrichmap"
 )
 ```
 
-
 ```{r Enrichment_comparison, fig.height=6}
 EnrichmentPlot(srt = pancreas_sub, group_by = "CellType", plot_type = "comparison")
 ```
@@ -462,7 +465,7 @@ pancreas_sub <- RunGSEA(
   srt = pancreas_sub, group_by = "CellType", db = "GO_BP", species = "Mus_musculus",
   DE_threshold = "p_val_adj < 0.05"
 )
-GSEAPlot(srt = pancreas_sub, group_by = "CellType", group_use = "Endocrine", geneSetID = "GO:0007186")
+GSEAPlot(srt = pancreas_sub, group_by = "CellType", group_use = "Endocrine", id_use = "GO:0007186")
 ```
 
 ```{r GSEA_comparison, fig.height=6}
@@ -487,7 +490,7 @@ ht <- DynamicHeatmap(
   species = "Mus_musculus", db = "GO_BP", anno_terms = TRUE, anno_keys = TRUE, anno_features = TRUE,
   heatmap_palette = "viridis", cell_annotation = "SubCellType",
   separate_annotation = list("SubCellType", c("Nnat", "Irx1")), separate_annotation_palette = c("Paired", "Set1"),
-  feature_annotation = c("TF", "SP"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
+  feature_annotation = c("TF", "CSPA"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
   pseudotime_label = 25, pseudotime_label_color = "red",
   height = 5, width = 2
 )

diff --git a/README.md b/README.md
@@ -451,6 +451,13 @@ PAGAPlot(srt = pancreas_sub, reduction = "UMAP", label = TRUE, label_insitu = TR
 
 ### Velocity analysis
 
+> To estimate cell velocity, you need to have both “spliced” and
+> “unspliced” assays in your Seurat object. You can generate these
+> matrices using [velocyto](http://velocyto.org/velocyto.py/index.html),
+> [bustools](https://bustools.github.io/BUS_notebooks_R/velocity.html),
+> or
+> [alevin](https://combine-lab.github.io/alevin-fry-tutorials/2021/alevin-fry-velocity/).
+
 ``` r
 pancreas_sub <- RunSCVELO(
   srt = pancreas_sub, group_by = "SubCellType",
@@ -480,11 +487,11 @@ VolcanoPlot(srt = pancreas_sub, group_by = "CellType")
 DEGs <- pancreas_sub@tools$DEtest_CellType$AllMarkers_wilcox
 DEGs <- DEGs[with(DEGs, avg_log2FC > 1 & p_val_adj < 0.05), ]
 # Annotate features with transcription factors and surface proteins
-pancreas_sub <- AnnotateFeatures(pancreas_sub, species = "Mus_musculus", db = c("TF", "SP"))
+pancreas_sub <- AnnotateFeatures(pancreas_sub, species = "Mus_musculus", db = c("TF", "CSPA"))
 ht <- FeatureHeatmap(
   srt = pancreas_sub, group.by = "CellType", features = DEGs$gene, feature_split = DEGs$group1,
   species = "Mus_musculus", db = c("GO_BP", "KEGG", "WikiPathway"), anno_terms = TRUE,
-  feature_annotation = c("TF", "SP"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
+  feature_annotation = c("TF", "CSPA"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
   height = 5, width = 4
 )
 print(ht$plot)
@@ -534,6 +541,9 @@ EnrichmentPlot(
 
 <img src="README/README-RunEnrichment-4.png" width="100%" style="display: block; margin: auto;" />
 
+> To ensure that labels are visible, you can adjust the size of the
+> viewer panel on Rstudio IDE.
+
 ``` r
 EnrichmentPlot(
   srt = pancreas_sub, group_by = "CellType", group_use = "Ductal",
@@ -556,7 +566,7 @@ pancreas_sub <- RunGSEA(
   srt = pancreas_sub, group_by = "CellType", db = "GO_BP", species = "Mus_musculus",
   DE_threshold = "p_val_adj < 0.05"
 )
-GSEAPlot(srt = pancreas_sub, group_by = "CellType", group_use = "Endocrine", geneSetID = "GO:0007186")
+GSEAPlot(srt = pancreas_sub, group_by = "CellType", group_use = "Endocrine", id_use = "GO:0007186")
 ```
 
 <img src="README/README-RunGSEA-1.png" width="100%" style="display: block; margin: auto;" />
@@ -597,7 +607,7 @@ ht <- DynamicHeatmap(
   species = "Mus_musculus", db = "GO_BP", anno_terms = TRUE, anno_keys = TRUE, anno_features = TRUE,
   heatmap_palette = "viridis", cell_annotation = "SubCellType",
   separate_annotation = list("SubCellType", c("Nnat", "Irx1")), separate_annotation_palette = c("Paired", "Set1"),
-  feature_annotation = c("TF", "SP"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
+  feature_annotation = c("TF", "CSPA"), feature_annotation_palcolor = list(c("gold", "steelblue"), c("forestgreen")),
   pseudotime_label = 25, pseudotime_label_color = "red",
   height = 5, width = 2
 )

diff --git a/README/README-DynamicHeatmap-1.png b/README/README-DynamicHeatmap-1.png
diff --git a/README/README-Enrichment_enrichmap-1.png b/README/README-Enrichment_enrichmap-1.png
diff --git a/README/README-FeatureHeatmap-1.png b/README/README-FeatureHeatmap-1.png
diff --git a/README/README-RunCellQC-1.png b/README/README-RunCellQC-1.png
diff --git a/README/README-RunCellQC-2.png b/README/README-RunCellQC-2.png
diff --git a/README/README-RunCellQC-3.png b/README/README-RunCellQC-3.png
diff --git a/README/README-RunEnrichment-4.png b/README/README-RunEnrichment-4.png
diff --git a/README/README-RunKNNPredict-scrna-1.png b/README/README-RunKNNPredict-scrna-1.png
diff --git a/SCP.Rproj b/SCP.Rproj
@@ -1,8 +1,8 @@
 Version: 1.0
 
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Yes
 
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
@@ -16,3 +16,5 @@ BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
 PackageRoxygenize: rd,collate,namespace,vignette
+
+QuitChildProcessesOnExit: Yes
diff --git a/inst/python/SCP_analysis.py b/inst/python/SCP_analysis.py
@@ -768,15 +768,15 @@ def Palantir(adata=None, h5ad=None,group_by=None,palette=None,
 
   return adata
 
-def WOT(adata=None, h5ad=None, group_by=None,palette=None, linear_reduction=None, nonlinear_reduction=None,basis=None,
-            n_pcs=30,n_neighbors=30, use_rna_velocity=False,vkey="stochastic",
-            embedded_with_PAGA=False,paga_layout="fr", threshold=0.1, point_size=20, 
-            infer_pseudotime = False,root_cell = None,root_group=None,n_dcs = 10, n_branchings = 0, min_group_size = 0.01,
-            show_plot=True, dpi=300, save=False, dirpath="./", fileprefix=""):
+def WOT(adata=None, h5ad=None, group_by=None,palette=None, 
+        time_field = "Time", growth_iters = 3, tmap_out = "tmaps/tmap_out",
+        time_from = 1, time_to = None, get_coupling = False,recalculate=False,
+        show_plot=True, dpi=300, save=False, dirpath="./", fileprefix=""):
   import matplotlib.pyplot as plt
   import scanpy as sc
   import numpy as np
   import statistics
+  import pandas as pd
   from math import hypot
 
   import warnings
@@ -810,101 +810,58 @@ def WOT(adata=None, h5ad=None, group_by=None,palette=None, linear_reduction=None
     if group_by is None:
       print("group_by must be provided.")
       exit()
-
-    if linear_reduction is None and nonlinear_reduction is None:
-      print("linear_reduction or nonlinear_reduction must be provided at least one.")
-      exit()
-
-    if linear_reduction is None:
-      sc.pp.pca(adata, n_comps = n_pcs)
-      linear_reduction="X_pca"
-
-    if basis is None:
-      if nonlinear_reduction is not None:
-        basis=nonlinear_reduction
-      else:
-        basis="basis"
-        adata.obsm["basis"]=adata.obsm[linear_reduction][:,0:2]
-
-    if point_size is None:
-      point_size = min(100000 / adata.shape[0],20)  
-
-    if infer_pseudotime is True and root_cell is None and root_group is None:
-      print("root_cell or root_group should be provided.")
-      exit()
 
-    if use_rna_velocity is True:
-      adata.uns["velocity_graph"]=adata.uns[vkey+"_graph"]
-    # del adata.uns
-
     adata.obs[group_by] = adata.obs[group_by].astype(dtype = "category")
-
-    if "X_diffmap" in adata.obsm_keys():
-      X_diffmap = adata.obsm['X_diffmap']
-      del adata.obsm['X_diffmap']
-      sc.pp.neighbors(adata, n_pcs = n_pcs, use_rep = linear_reduction, n_neighbors = n_neighbors)
-      adata.obsm['X_diffmap'] = X_diffmap
+    if pd.api.types.is_categorical_dtype(adata.obs[time_field]):
+      adata.obs["time_field"] = adata.obs[time_field].cat.codes
+    elif not pd.api.types.is_numeric_dtype(adata.obs[time_field]):
+      try:
+        adata.obs['time_field'] = adata.obs[time_field].astype("float")
+      except ValueError:
+        print("Unable to convert column '" + time_field + "' to float type.")
     else:
-      sc.pp.neighbors(adata, n_pcs = n_pcs, use_rep = linear_reduction, n_neighbors = n_neighbors)
-
-    sc.tl.paga(adata, groups = group_by, use_rna_velocity = use_rna_velocity)
+      adata.obs["time_field"] = adata.obs[time_field]
+
+    time_dict = dict(zip(adata.obs[time_field],adata.obs["time_field"]))
+    ot_model <- wot.ot.OTModel(adata, growth_iters = growth_iters, day_field = "time_field")
 
-    if use_rna_velocity is True:
-      sc.pl.paga_compare(adata, basis=basis,palette=palette, threshold = threshold, size = point_size, min_edge_width = 1, node_size_scale = 1,
-      dashed_edges='connectivities', solid_edges='transitions_confidence', transitions='transitions_confidence',
-      title = basis,frameon = False, edges = True, save = False, show = False)
+    if recalculate is True:
+      ot_model.compute_all_transport_maps(tmap_out = tmap_out)
+      tmap_model = wot.tmap.TransportMapModel.from_directory(tmap_out)
     else:
-      sc.pl.paga_compare(adata, basis=basis,palette=palette, threshold = threshold, size = point_size, title = basis,frameon = False, edges = True, save = False, show = False)
-    if show_plot is True:
-      plt.show()
-    if save:
-      plt.savefig('.'.join(filter(None, [fileprefix, "paga_compare.png"])), dpi=dpi)
+      try:
+        tmap_model = wot.tmap.TransportMapModel.from_directory(tmap_out)
+      except ValueError:
+        ot_model.compute_all_transport_maps(tmap_out = tmap_out)
+        tmap_model = wot.tmap.TransportMapModel.from_directory(tmap_out)
+
+    cell_sets = {}
+    for k, v in zip(adata.obs[group_by], adata.obs_names):
+        if k not in cell_sets:
+          cell_sets[k] = []
+        cell_sets[k].append(v)
 
-    sc.pl.paga(adata, threshold = threshold, layout = paga_layout, title = "PAGA layout: " + paga_layout, frameon = False, save = False, show = False)
-    if show_plot is True:
-      plt.show()
-    if save:
-      plt.savefig('.'.join(filter(None, [fileprefix, "paga_layout.png"])), dpi=dpi)
-
-    sc.tl.draw_graph(adata, init_pos = "paga", layout = paga_layout)
-    sc.pl.draw_graph(adata, color=group_by,palette=palette, title = "PAGA layout: " + paga_layout, layout = paga_layout,  frameon = False, legend_loc="on data",show = False)
-    if show_plot is True:
-      plt.show()
-    if save:
-      plt.savefig('.'.join(filter(None, [fileprefix, "paga_graph.png"])), dpi=dpi)
-
-    if embedded_with_PAGA is True:
-      umap2d = sc.tl.umap(adata, init_pos = "paga", n_components=2, copy = True)
-      adata.obsm["PAGAUMAP2D"] = umap2d.obsm["X_umap"]
-      sc.pl.paga_compare(adata, basis = "PAGAUMAP2D",palette=palette, threshold = threshold, size = point_size, title = "PAGA-initialized UMAP", edges = True, save = False, show = False)
-      if show_plot is True:
-        plt.show()
-      if save:
-        plt.savefig('.'.join(filter(None, [fileprefix, "paga_umap.png"])), dpi=dpi)
-
-    if infer_pseudotime is True:
-      if root_group is not None and root_cell is None:
-        cell = adata.obs[group_by].index.values[adata.obs[group_by]==root_group]
-        root_group_cell=adata.obsm[basis][adata.obs[group_by]==root_group,][:, [0, 1]]
-        x=statistics.median(root_group_cell[:,0])
-        y=statistics.median(root_group_cell[:,1])
-        diff=np.array((x - root_group_cell[:,0], y - root_group_cell[:,1]))
-        dist=[]
-        for i in range(diff.shape[1]):
-          dist.append(hypot(diff[0,i],diff[1,i]))
+    from_populations = tmap_model.population_from_cell_sets(cell_sets, at_time = time_dict[time_from])
+
+    trajectory_ds = tmap_model.trajectories(from_populations)
+    trajectory_df = pd.DataFrame(trajectory_ds.X, index=trajectory_ds.obs_names, columns=trajectory_ds.var_names)
+    adata.uns["trajectory_"+str(time_from)]= trajectory_df
 
-        root_cell=cell[dist.index(min(dist))]
-
-      sc.tl.diffmap(adata, n_comps = n_dcs)
-      adata.uns['iroot'] = np.flatnonzero(adata.obs_names == root_cell)[0]
-      sc.tl.dpt(adata, n_dcs = n_dcs, n_branchings = n_branchings, min_group_size = min_group_size)
-
-      sc.pl.embedding(adata, basis = basis, color = "dpt_pseudotime", save = False, show = False)
-      if show_plot is True:
-          plt.show()
-      if save:
-          plt.savefig('.'.join(filter(None, [fileprefix, "dpt_pseudotime.png"])), dpi=dpi)
-
+    fates_ds = tmap_model.fates(from_populations)
+    fates_df = pd.DataFrame(fates_ds.X, index=fates_ds.obs_names, columns=fates_ds.var_names)
+    adata.uns["fates_"+str(time_from)]= fates_df
+
+    # obs_list = wot.tmap.trajectory_trends_from_trajectory(trajectory_ds = trajectory_ds, expression_ds = adata)
+
+    if time_to is not None:
+      to_populations = tmap_model.population_from_cell_sets(cell_sets, at_time = time_dict[time_to])
+      transition_table = tmap_model.transition_table(from_populations, to_populations)
+      transition_df = pd.DataFrame(transition_table.X, index=transition_table.obs_names, columns=transition_table.var_names)
+      adata.uns["transition_"+str(time_from)+"_to_"+str(time_to)]= fates_df
+      if get_coupling is True:
+        coupling = tmap_model.get_coupling(time_dict[time_from], time_dict[time_to])
+        coupling_df = pd.DataFrame(coupling.X, index=coupling.obs_names, columns=coupling.var_names)
+        adata.uns["coupling_"+str(time_from)+"_to_"+str(time_to)]= coupling_df
 
   finally:
       os.chdir(prevdir)

diff --git a/man/ListDB.Rd b/man/ListDB.Rd