Skip to content

Commit

Permalink
[FEAT/WIP] Add new cell typing script celltyping_scROSHI.R
Browse files Browse the repository at this point in the history
  • Loading branch information
Anne Bertolini committed Jun 19, 2024
1 parent b9ac197 commit d560c00
Showing 1 changed file with 246 additions and 0 deletions.
246 changes: 246 additions & 0 deletions workflow/scripts/celltyping_scROSHI.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
################################################################################
## cell type classification - perform cell typing by comparing gene expression
## profiles with pre-defined lists
################################################################################

################################################################################
## Load packages and parse command line options

# convenience function for string concatenation
"%&%" <- function(a, b) paste(a, b, sep = "")
# convenience function for loading packages
f_pck_load <- function(x) {
suppressWarnings(
suppressMessages(
require(x, character.only = T, warn.conflicts = F, quietly = T))
)
}
# Load packages
lby <- c("optparse", "reshape2", "scran", "limma", "uwot", "igraph", "Hmisc",
"pheatmap", "RColorBrewer", "cowplot", "scROSHI")
resp <- lapply(lby, f_pck_load)
if (!all(unlist(resp))) {
print(resp)
stop("Could not load one or more packages")
} else {
rm(resp, lby)
}

# give out session Info
cat("\n\n\nPrint sessionInfo:\n\n")
print(sessionInfo())
cat("\n\n\n\n")

# command line arguments are parsed
option_list <- list(
make_option("--SCE", type = "character", help = "Path to sce onject file with input data (sce_basic.RDS)."),
make_option("--celltype_lists", type = "character", help = "Path to the file containing marker genes all cell types."),
make_option("--celltype_config", type = "character", help = "Path to the file containing a config file that specifies major and sub cell types."),
make_option("--min_genes", type = "character", help = "Minimum number of celltype specific genes expressed. If less: classify unknown."),
make_option("--outputDirec", type = "character", help = "Path to the directory where output files will be written."),
make_option("--sampleName", type = "character", help = "Sample identifier. Attached to each output name.")
)
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

path <- opt$outputDirec %&% opt$sampleName
min_genes <- as.numeric(opt$min_genes)
if (!is.finite(min_genes)) stop("min_genes was not provided correctly.")
print(opt$SCE)

# Set global options
options(stringsAsFactors = FALSE)
# options(error=function()traceback(2))

f.round.preserve.sum <- function(x, digits = 0) {
up <- 10^digits
x <- x * up
y <- floor(x)
indices <- tail(order(x - y), round(sum(x)) - sum(y))
y[indices] <- y[indices] + 1
y / up
}

################################################################################
## main code starts here
################################################################################
## load input data
sce_data <- readRDS(opt$SCE)

# celltype config file, sub types are individual for each major type
type_config <- as.data.frame(read.table(opt$celltype_config, sep = "\t", head = T, stringsAsFactors = F))
major_types <- type_config$Major
print("str(major_types):")
print(str(major_types))
minor_types <- lapply(type_config$Subtype, function(x) strsplit(x, ",")[[1]])
names(minor_types) <- type_config$Major
# remove "none"
minor_types <- lapply(minor_types, function(x) x[x != "none"])
# remove NA
idx <- sapply(minor_types, function(x) length(which(is.na(x))))
minor_types <- minor_types[which(idx == 0)]
# remove empty list items
idx <- sapply(minor_types, length)
minor_types <- minor_types[idx > 0]
# final list
print("str(minor_types):")
print(str(minor_types))

#####################################################scROSHI

sce_data <- scROSHI(sce_data = sce_data,
celltype_lists = opt$celltype_lists,
type_config = type_config,
count_data = "normcounts",
gene_symbol = "SYMBOL",
cell_scores = TRUE,
min_genes = opt$min_genes,
min_var = 1.5,
n_top_genes = 2000,
n_nn = 5,
thresh_unknown = 0.05,
thresh_uncert = 0.1,
thresh_uncert_second = 0.8)

#####################################################scROSHI

# keep sce object including cluster 0
sce_data_incl_cluster0 <- sce_data

# remove cells that are in cluster 0
mask_keep_cells <- colData(sce_data)$phenograph_clusters != 0
cat("\n\nNumber of cells that remain after filtering out cluster 0:\n\n")
print(sum(mask_keep_cells))
cat("\n\nsce object before filtering out cells that are in cluster 0:\n\n")
print(sce_data)

sce_data <- sce_data[, mask_keep_cells]
reducedDims(sce_data)$phenodist <- reducedDims(sce_data)$phenodist[, mask_keep_cells]
cat("\n\nsce object after filtering out cells that are in cluster 0:\n\n")
print(sce_data)

## write final celltype to disk
res <- colData(sce_data)[, c("barcodes", "celltype_final")]
write.table(res, file = path %&% ".cts_final.txt", sep = "\t", row.names = F)


################################################################################
## compare cell type with phenograph_clusters
res <- colData(sce_data)
tab <- table(res$phenograph_clusters, res$celltype_final)
top <- ncol(tab)
left <- nrow(tab)
tab <- addmargins(tab)
tab <- rbind(tab, f.round.preserve.sum(tab[nrow(tab), ] / tab[nrow(tab), ncol(tab)] * 100))
rownames(tab)[nrow(tab)] <- "Percent"


#########################################
## begin new cluster purity calculations
# if at least one minor_type, make major-minor dictionary:

all.major.types <- sort(c(metadata(sce_data)$all_major_types, "uncertain", "unknown"))

if (length(minor_types) > 0) {
minor_types <- lapply(minor_types, function(x) gsub("([^_]+)_.*", "\\1", x))
names(minor_types) <- gsub("([^_]+)_.*", "\\1", names(minor_types))
# add major type also to minor type column (as it can be final cell type as well)
minor_types <- sapply(names(minor_types), simplify = FALSE, function(x) {
minor_types[x] <- c(minor_types[[x]], x)
})
major_minor_dict <- melt(minor_types)
names(major_minor_dict) <- c("minor", "major")
# also have all major types in the dictionary
for (m_type in all.major.types) {
if (!m_type %in% major_minor_dict$minor) {
major_minor_dict[nrow(major_minor_dict) + 1, ] <- c(m_type, m_type)
}
}
}
# loop over all phenograph clusters. Need not be continuous or numeric but unique
clu.pu <- data.frame("Cluster" = rownames(tab[seq(left), ]), "Dominant.celltype" = NA,
"Celltype composition" = NA, stringsAsFactors = F, check.names = F)
for (ii in unique(res$phenograph_clusters)) {
# ii = unique(res$phenograph_clusters)[1]
rowid <- which(rownames(tab) == ii)
# calculate percentage of each cell type in cluster ii
purity <- tab[rowid, seq(top)] / sum(tab[rowid, seq(top)])
purity <- f.round.preserve.sum(purity * 100)
# a "dominant.type" is a cell type that contributes more than 20% to the cluster
dominant.type <- which(purity > 20)
if (length(dominant.type) == 1) {
# if only one dominant cell type is found in cluster ii insert information into
# the column "Dominant.celltype"
clu.pu$Dominant.celltype[rowid] <- names(dominant.type)
ct.comp <- names(dominant.type) %&% " (" %&% purity[dominant.type] %&% "%)"
clu.pu$`Celltype composition`[rowid] <- ct.comp
} else {
if (length(dominant.type) > 1) {
clu.pu$Dominant.celltype[rowid] <- "mixed"
# order dominant celltype labels by decreasing proportion
ct.order <- order(purity[dominant.type], decreasing = T)
ct.comp.1 <- names(dominant.type)[ct.order]
ct.comp.2 <- "(" %&% purity[dominant.type[ct.order]] %&% "%)"
clu.pu$`Celltype composition`[rowid] <- paste(ct.comp.1, ct.comp.2, collapse = ", ")
# if all dominant.types have the same major_celltype, change to "mayor_type_mixed"
if (exists("major_minor_dict")) {
idx <- match(names(dominant.type), major_minor_dict$minor)
idx <- idx[!is.na(idx)]
id.major <- major_minor_dict$major[idx]
# only if all dominant types have the same major_type
if (length(id.major) > 0 & length(unique(id.major)) == 1) {
dominant_major <- unique(id.major)
clu.pu$Dominant.celltype[rowid] <- dominant_major %&% "_mixed"
}
}
} else {
clu.pu$Dominant.celltype[rowid] <- "mixed"
clu.pu$`Celltype composition`[rowid] <- "all<20"
}
}
}
clu.pu <- rbind(clu.pu, c("Sum", "", ""), c("Percent (95% ci)", "", ""))
tmp <- as.data.frame(as.matrix(tab))
tmp$Cluster <- c(sort(unique(res$phenograph_clusters)), "Sum", "Percent (95% ci)")
clu.pu <- merge(tmp, clu.pu, sort = F)
# # confidence intervals for proportions (Wilson)
# round(binconf(0, 4446, method="wilson")*100,1)
tab.ci <- binconf(tab[nrow(tab) - 1, seq(ncol(tab) - 1)],
rep(tab[nrow(tab) - 1, ncol(tab)], ncol(tab) - 1),
method = "wilson") * 100
tab.ci <- apply(tab.ci, 2, function(x) as.character(signif(x, 2)))
tab.ci <- apply(tab.ci, 1, function(x) sprintf("%s (%s;%s)", x[1], x[2], x[3]))
clu.pu[nrow(clu.pu), 1 + seq(length(tab.ci))] <- tab.ci
# to remove confidence intervals, comment out the previous 4 lines.
write.table(clu.pu, file = path %&% ".celltyping.phenograph_celltype_association.txt", sep = "\t", row.names = F)
## end new cluster purity calculations


# print out percentages of cell types, major and final
perc_major <- as.data.frame(round(prop.table(table(res$celltype_major)), 3) * 100)
perc_major <- perc_major[order(perc_major$Freq, decreasing = TRUE), ]
cat("\n\n Frequencies major cell types:\n")
print(perc_major)


perc_final <- as.data.frame(round(prop.table(table(res$celltype_final)), 3) * 100)
perc_final <- perc_final[order(perc_final$Freq, decreasing = TRUE), ]
cat("\n\n Frequencies final cell types:\n")
print(perc_final)

cat("\n\n Number of cells: ", length(res$barcodes))

cat("\n\n Number of clusters: ", length(unique(res$phenograph_clusters)), "\n\n")

new_dom_types <- clu.pu$Dominant.celltype[!clu.pu$Dominant.celltype == ""]
dom_types <- as.data.frame(table(new_dom_types))
dom_types <- dom_types[order(dom_types$Freq, decreasing = TRUE), ]
cat("\n\n Frequencies dominant cell types of clusters:\n")
print(dom_types)

################################################################################
## save sce object for later use
saveRDS(sce_data, path %&% ".celltyping.RDS")
# also save sce_data object that including the cells that are in cluster 0
saveRDS(sce_data_incl_cluster0, path %&% ".celltyping.all_cells.RDS")

0 comments on commit d560c00

Please sign in to comment.