From 6d50c0a67654d1aa64ab225110cece5628bdf0bd Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 3 Nov 2022 01:12:42 -0700 Subject: [PATCH 1/8] add collate_ighv_results for mixcr mut status --- R/utilities.R | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/R/utilities.R b/R/utilities.R index 1b2ded8b..ce62f4dd 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1323,6 +1323,8 @@ collate_results = function(sample_table, these_samples_metadata, case_set, sbs_manipulation = "", + mixcr_results_directory = "", + missing_threshold = 2, seq_type_filter = "genome", from_cache = TRUE, ssh_session){ @@ -1367,6 +1369,7 @@ collate_results = function(sample_table, sample_table = collate_ancestry(sample_table = sample_table, seq_type_filter = seq_type_filter) sample_table = collate_sbs_results(sample_table = sample_table, sbs_manipulation = sbs_manipulation, seq_type_filter = seq_type_filter) sample_table = collate_qc_results(sample_table = sample_table, seq_type_filter = seq_type_filter) + sample_table = collate_ighv_results(sample_table = sample_table, seq_type_filter = seq_type_filter, results_directory = mixcr_results_directory, missing_threshold = missing_threshold) } if(write_to_file){ #write results from "slow option" to new cached results file @@ -4101,3 +4104,74 @@ supplement_maf <- function(incoming_maf, missing_sample_maf) return(full_maf) } + +#' INTERNAL FUNCTION called by collate_results, not meant for out-of-package usage. +#' Identify samples with mutated IGHV from MiXCR + IgBLAST results +#' +#' @param sample_table A data frame with sample_id as the first column. +#' @param results_directory Directory containging MiXCR/IgBLASTN results +#' @param missing_threshold Limit for missing IGH regions during MiXCR assembly. Default is 2. +#' +#' @return Samples table. +#' @import tidyverse +#' @import stats +#' @examples +#' sample_table = collate_nfkbiz_results(sample_table=sample_table) +#' + +collate_ighv_results = function(sample_table, + seq_type_filter = "genome", + results_directory = "", + missing_threshold = 2){ + if (missing(sample_table)) { + sample_table = get_gambl_metadata(seq_type_filter = seq_type_filter) %>% + dplyr::select(sample_id, patient_id, biopsy_id) + } + + # Check seq_type_filter is genome or mrna + if (seq_type_filter=="capture") { + message("'capture' seq_type_filter doesn't make sense for this tool. Use 'genome' or 'mrna'.") + return (sample_table) + } + # Look in mixcr-1.2/results if no results directory specified + if (results_directory == "") { + base = config::get("project_base") + results_directory = glue(paste0(base, "gambl/mixcr-1.2/01-mixcr/{seq_type_filter}/")) + } + print(paste0("Searching for MiXCR results in directory:", results_directory)) + + # Make table with necessary columns from result files + mixcr_table = data.frame(biopsy_id=character(), + mixcr_mutated=character(), + missing=double()) + for (sample_id in sample_table$sample_id) { + mixcr_results = glue(paste0(results_directory,"{sample_id}/mixcr.{sample_id}.clonotypes.IGH.igblast.txt")) + # Check if sample_id results file exists + if (file.exists(mixcr_results)) { + mixcr = suppressMessages(read_tsv(mixcr_results)) + # Some results may be empty due to not having (productive) clones + if (nrow(mixcr)==0) { + next + } + # Get results from first clonotype (highest fraction) + entry = c(sample_table[sample_table$sample_id==sample_id,]$biopsy_id, + mixcr$mutatedStatus[1], + mixcr$numMissingRegions[1]) + mixcr_table[nrow(mixcr_table) + 1,] <- entry + } + } + # Remove entries that are empty + print("Removing empty entries") + mixcr_results = mixcr_table %>% dplyr::filter(rowSums(is.na(mixcr_table)) != ncol(mixcr_table)) %>% as.data.frame() + + # Remove entries where number of missing V regions is 3 or more then remove missing column + print("Removing rows that exceed missing threshold") + mixcr_results = mixcr_table %>% dplyr::filter(missing <= missing_threshold) %>% select(-missing) + + # Join rows to sample_table + print("Joining tables") + sample_table = dplyr::left_join(sample_table, mixcr_results) + return(sample_table) +} + + From 7f44844daa899b4450860bcbb1efb330f55b49e4 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 3 Nov 2022 15:22:50 -0700 Subject: [PATCH 2/8] Fix typo --- R/utilities.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utilities.R b/R/utilities.R index ce62f4dd..07de9eec 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -4110,7 +4110,7 @@ supplement_maf <- function(incoming_maf, #' #' @param sample_table A data frame with sample_id as the first column. #' @param results_directory Directory containging MiXCR/IgBLASTN results -#' @param missing_threshold Limit for missing IGH regions during MiXCR assembly. Default is 2. +#' @param missing_threshold Limit for missing IGH regions during MiXCR assembly. Default is 2. #' #' @return Samples table. #' @import tidyverse From 841e73d6bc8e4dd135464910df1f431f5ba527fa Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 3 Nov 2022 15:27:37 -0700 Subject: [PATCH 3/8] Change select() to dplyr::select() --- R/utilities.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utilities.R b/R/utilities.R index 07de9eec..06320fce 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -4166,7 +4166,7 @@ collate_ighv_results = function(sample_table, # Remove entries where number of missing V regions is 3 or more then remove missing column print("Removing rows that exceed missing threshold") - mixcr_results = mixcr_table %>% dplyr::filter(missing <= missing_threshold) %>% select(-missing) + mixcr_results = mixcr_table %>% dplyr::filter(missing <= missing_threshold) %>% dplyr::select(-missing) # Join rows to sample_table print("Joining tables") From 36d73529113e80a219d17a5a2e51b31f52d4bd00 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 3 Nov 2022 16:10:03 -0700 Subject: [PATCH 4/8] add documentation --- R/utilities.R | 14 +++++++------- man/collate_igblast_results.Rd | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 man/collate_igblast_results.Rd diff --git a/R/utilities.R b/R/utilities.R index 06320fce..99ba6f56 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1369,7 +1369,7 @@ collate_results = function(sample_table, sample_table = collate_ancestry(sample_table = sample_table, seq_type_filter = seq_type_filter) sample_table = collate_sbs_results(sample_table = sample_table, sbs_manipulation = sbs_manipulation, seq_type_filter = seq_type_filter) sample_table = collate_qc_results(sample_table = sample_table, seq_type_filter = seq_type_filter) - sample_table = collate_ighv_results(sample_table = sample_table, seq_type_filter = seq_type_filter, results_directory = mixcr_results_directory, missing_threshold = missing_threshold) + sample_table = collate_igblast_results(sample_table = sample_table, seq_type_filter = seq_type_filter, results_directory = mixcr_results_directory, missing_threshold = missing_threshold) } if(write_to_file){ #write results from "slow option" to new cached results file @@ -4109,17 +4109,17 @@ supplement_maf <- function(incoming_maf, #' Identify samples with mutated IGHV from MiXCR + IgBLAST results #' #' @param sample_table A data frame with sample_id as the first column. -#' @param results_directory Directory containging MiXCR/IgBLASTN results -#' @param missing_threshold Limit for missing IGH regions during MiXCR assembly. Default is 2. +#' @param results_directory Directory containging MiXCR/IgBLASTN results. Depends on files having the columns "mutatedStatus" and "numMissingRegions" +#' @param missing_threshold Limit for missing IGH regions during MiXCR assembly. Default is 2. Higher thresholds may return less accurate mutation status. #' #' @return Samples table. #' @import tidyverse -#' @import stats -#' @examples -#' sample_table = collate_nfkbiz_results(sample_table=sample_table) +#' +#' @examples +#' sample_table = collate_ighv_results(sample_table = sample_table, results_directory=".../01-mixcr/{seq_type}/", missing_threshold=3) #' -collate_ighv_results = function(sample_table, +collate_igblast_results = function(sample_table, seq_type_filter = "genome", results_directory = "", missing_threshold = 2){ diff --git a/man/collate_igblast_results.Rd b/man/collate_igblast_results.Rd new file mode 100644 index 00000000..791d0f2c --- /dev/null +++ b/man/collate_igblast_results.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{collate_igblast_results} +\alias{collate_igblast_results} +\title{INTERNAL FUNCTION called by collate_results, not meant for out-of-package usage. +Identify samples with mutated IGHV from MiXCR + IgBLAST results} +\usage{ +collate_igblast_results( + sample_table, + seq_type_filter = "genome", + results_directory = "", + missing_threshold = 2 +) +} +\arguments{ +\item{sample_table}{A data frame with sample_id as the first column.} + +\item{results_directory}{Directory containging MiXCR/IgBLASTN results. Depends on files having the columns "mutatedStatus" and "numMissingRegions"} + +\item{missing_threshold}{Limit for missing IGH regions during MiXCR assembly. Default is 2. Higher thresholds may return less accurate mutation status.} +} +\value{ +Samples table. +} +\description{ +INTERNAL FUNCTION called by collate_results, not meant for out-of-package usage. +Identify samples with mutated IGHV from MiXCR + IgBLAST results +} +\examples{ +sample_table = collate_ighv_results(sample_table = sample_table, results_directory=".../01-mixcr/{seq_type}/", missing_threshold=3) + +} From 1faed60f9dfd837908318365e10d46cffc9daf3b Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 23 Nov 2022 15:21:56 -0800 Subject: [PATCH 5/8] Minimize for loops, add MiXCR dir to config --- R/utilities.R | 67 +++++++++++++++++++++++---------------------------- config.yml | 1 + 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/R/utilities.R b/R/utilities.R index 99ba6f56..d11343cc 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1291,6 +1291,7 @@ test_glue = function(placeholder="INSERTED"){ #' @param these_samples_metadata Optional argument to use a user specified metadata df, overwrites get_gambl_metadata in join_with_full_metadata. #' @param case_set Optional short name for a pre-defined set of cases. #' @param sbs_manipulation Optional variable for transforming sbs values (e.g log, scale). +#' @param mixcr_results_directory Optional path to MiXCR results. #' @param seq_type_filter Filtering criteria, default is genomes. #' @param from_cache Boolean variable for using cached results (/projects/nhl_meta_analysis_scratch/gambl/results_local/shared/gambl_{seq_type_filter}_results.tsv), default is TRUE. If write_to_file is TRUE, this parameter auto-defaults to FALSE. #' @@ -1323,8 +1324,6 @@ collate_results = function(sample_table, these_samples_metadata, case_set, sbs_manipulation = "", - mixcr_results_directory = "", - missing_threshold = 2, seq_type_filter = "genome", from_cache = TRUE, ssh_session){ @@ -1333,7 +1332,7 @@ collate_results = function(sample_table, # the sample_id should probably not even be in this file if we want this to be biopsy-centric if(missing(sample_table)){ sample_table = get_gambl_metadata(seq_type_filter = seq_type_filter) %>% - dplyr::select(sample_id, patient_id, biopsy_id) + dplyr::select(sample_id, patient_id, biopsy_id, unix_group) } if(write_to_file){ from_cache = FALSE #override default automatically for nonsense combination of options @@ -1369,7 +1368,7 @@ collate_results = function(sample_table, sample_table = collate_ancestry(sample_table = sample_table, seq_type_filter = seq_type_filter) sample_table = collate_sbs_results(sample_table = sample_table, sbs_manipulation = sbs_manipulation, seq_type_filter = seq_type_filter) sample_table = collate_qc_results(sample_table = sample_table, seq_type_filter = seq_type_filter) - sample_table = collate_igblast_results(sample_table = sample_table, seq_type_filter = seq_type_filter, results_directory = mixcr_results_directory, missing_threshold = missing_threshold) + sample_table = collate_igblast_results(sample_table = sample_table, seq_type_filter = seq_type_filter) } if(write_to_file){ #write results from "slow option" to new cached results file @@ -4125,48 +4124,42 @@ collate_igblast_results = function(sample_table, missing_threshold = 2){ if (missing(sample_table)) { sample_table = get_gambl_metadata(seq_type_filter = seq_type_filter) %>% - dplyr::select(sample_id, patient_id, biopsy_id) + dplyr::select(sample_id, biopsy_id, unix_group) } - # Check seq_type_filter is genome or mrna - if (seq_type_filter=="capture") { - message("'capture' seq_type_filter doesn't make sense for this tool. Use 'genome' or 'mrna'.") - return (sample_table) - } - # Look in mixcr-1.2/results if no results directory specified + # Look in mixcr-1.2/99-outputs if no results directory specified if (results_directory == "") { - base = config::get("project_base") - results_directory = glue(paste0(base, "gambl/mixcr-1.2/01-mixcr/{seq_type_filter}/")) - } - print(paste0("Searching for MiXCR results in directory:", results_directory)) - - # Make table with necessary columns from result files - mixcr_table = data.frame(biopsy_id=character(), - mixcr_mutated=character(), - missing=double()) - for (sample_id in sample_table$sample_id) { - mixcr_results = glue(paste0(results_directory,"{sample_id}/mixcr.{sample_id}.clonotypes.IGH.igblast.txt")) - # Check if sample_id results file exists - if (file.exists(mixcr_results)) { - mixcr = suppressMessages(read_tsv(mixcr_results)) - # Some results may be empty due to not having (productive) clones - if (nrow(mixcr)==0) { - next - } - # Get results from first clonotype (highest fraction) - entry = c(sample_table[sample_table$sample_id==sample_id,]$biopsy_id, - mixcr$mutatedStatus[1], - mixcr$numMissingRegions[1]) - mixcr_table[nrow(mixcr_table) + 1,] <- entry - } + results_directory = paste0(config::get("project_base"),config::get("results_directories")$mixcr, "txt/{seq_type_filter}/mixcr.{sample_id}.clonotypes.IGH.igblast.txt") + } + print(paste("Searching for MiXCR results in directory:", results_directory)) + + # Create dataframe from data in sample_table + mixcr_table = data.frame(sample_id = sample_table$sample_id, + biopsy_id = sample_table$biopsy_id, + unix_group = sample_table$unix_group, + mixcr_mutated = "placeholder", + missing = "placeholder") + + # Create column with potential filename for sample_id unix_group combination + mixcr_table = mixcr_table %>% dplyr::mutate(file_path = glue::glue(results_directory)) + + # Filter down to only sample_ids with existing files then read file contents into own column (this takes some time) + mixcr_table = mixcr_table[file.exists(mixcr_table$file_path),] %>% dplyr::mutate(results = suppressMessages(lapply(file_path, read_tsv))) + message(paste(nrow(mixcr_table), "MiXCR results files found.")) + + # Extract required columns (only extract top column, this is clonotype with highest fraction) + for (i in seq_len(nrow(mixcr_table))) { + mixcr_table$mixcr_mutated[i] <- mixcr_table$results[[i]]$mutatedStatus[1] + mixcr_table$missing[i] <- mixcr_table$results[[i]]$numMissingRegions[1] } # Remove entries that are empty print("Removing empty entries") mixcr_results = mixcr_table %>% dplyr::filter(rowSums(is.na(mixcr_table)) != ncol(mixcr_table)) %>% as.data.frame() # Remove entries where number of missing V regions is 3 or more then remove missing column - print("Removing rows that exceed missing threshold") - mixcr_results = mixcr_table %>% dplyr::filter(missing <= missing_threshold) %>% dplyr::select(-missing) + print(paste("Removing rows that exceed missing threshold of", missing_threshold)) + mixcr_results = mixcr_results %>% dplyr::filter(missing <= missing_threshold) %>% dplyr::select(-missing, -results, -file_path, -unix_group, -sample_id) + message(paste(nrow(mixcr_results), glue::glue("MiXCR results contained top clones with <= {missing_threshold} missing regions."))) # Join rows to sample_table print("Joining tables") diff --git a/config.yml b/config.yml index e2f7ec3b..ef1b1c82 100644 --- a/config.yml +++ b/config.yml @@ -116,6 +116,7 @@ default: results_directories: manta: "manta_current/99-outputs/bedpe/" controlfreec: "controlfreec_liftover_current/99-outputs/" + mixcr: "{unix_group}/mixcr-1.2/99-outputs/" tables: biopsies: "biopsy_metadata" From f7edf628008104c851001ea5486fb586507c5540 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 23 Nov 2022 15:24:46 -0800 Subject: [PATCH 6/8] Remove mixcr dir param from collate_results --- R/utilities.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/utilities.R b/R/utilities.R index d11343cc..a04e6b8c 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1291,7 +1291,6 @@ test_glue = function(placeholder="INSERTED"){ #' @param these_samples_metadata Optional argument to use a user specified metadata df, overwrites get_gambl_metadata in join_with_full_metadata. #' @param case_set Optional short name for a pre-defined set of cases. #' @param sbs_manipulation Optional variable for transforming sbs values (e.g log, scale). -#' @param mixcr_results_directory Optional path to MiXCR results. #' @param seq_type_filter Filtering criteria, default is genomes. #' @param from_cache Boolean variable for using cached results (/projects/nhl_meta_analysis_scratch/gambl/results_local/shared/gambl_{seq_type_filter}_results.tsv), default is TRUE. If write_to_file is TRUE, this parameter auto-defaults to FALSE. #' From 1fd6ae6a4f1549f150bd2118642a36e01cd47588 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 23 Nov 2022 15:41:29 -0800 Subject: [PATCH 7/8] documentation update --- R/utilities.R | 2 +- man/collate_igblast_results.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/utilities.R b/R/utilities.R index a04e6b8c..ba1c5c9c 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -4114,7 +4114,7 @@ supplement_maf <- function(incoming_maf, #' @import tidyverse #' #' @examples -#' sample_table = collate_ighv_results(sample_table = sample_table, results_directory=".../01-mixcr/{seq_type}/", missing_threshold=3) +#' sample_table = collate_igblast_results(sample_table = sample_table, results_directory="*/01-mixcr/{seq_type}/", missing_threshold=3) #' collate_igblast_results = function(sample_table, diff --git a/man/collate_igblast_results.Rd b/man/collate_igblast_results.Rd index 791d0f2c..4c9b942e 100644 --- a/man/collate_igblast_results.Rd +++ b/man/collate_igblast_results.Rd @@ -27,6 +27,6 @@ INTERNAL FUNCTION called by collate_results, not meant for out-of-package usage. Identify samples with mutated IGHV from MiXCR + IgBLAST results } \examples{ -sample_table = collate_ighv_results(sample_table = sample_table, results_directory=".../01-mixcr/{seq_type}/", missing_threshold=3) +sample_table = collate_igblast_results(sample_table = sample_table, results_directory="*/01-mixcr/{seq_type}/", missing_threshold=3) } From 6766f8214808ead27adbce7a951707311e58a92f Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 23 Nov 2022 15:52:32 -0800 Subject: [PATCH 8/8] More detailed desc of results_directory param --- R/utilities.R | 4 ++-- man/collate_igblast_results.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/utilities.R b/R/utilities.R index ba1c5c9c..6833ea48 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -4107,14 +4107,14 @@ supplement_maf <- function(incoming_maf, #' Identify samples with mutated IGHV from MiXCR + IgBLAST results #' #' @param sample_table A data frame with sample_id as the first column. -#' @param results_directory Directory containging MiXCR/IgBLASTN results. Depends on files having the columns "mutatedStatus" and "numMissingRegions" +#' @param results_directory Optional parameter to override default directory specified in config. Path to MiXCR/IgBLAST results files, path requires `unix_group`, `seq_type_filter` and `sample_id` wildcards. Depends on files having the columns "mutatedStatus" and "numMissingRegions". #' @param missing_threshold Limit for missing IGH regions during MiXCR assembly. Default is 2. Higher thresholds may return less accurate mutation status. #' #' @return Samples table. #' @import tidyverse #' #' @examples -#' sample_table = collate_igblast_results(sample_table = sample_table, results_directory="*/01-mixcr/{seq_type}/", missing_threshold=3) +#' sample_table = collate_igblast_results(sample_table = sample_table, results_directory="/projects/rmorin/projects/gambl-repos/gambl-*/results/{unix_group}/mixcr-1.2/99-outputs/txt/{seq_type_filter}/mixcr.{sample_id}.clonotypes.IGH.igblast.txt", missing_threshold=3) #' collate_igblast_results = function(sample_table, diff --git a/man/collate_igblast_results.Rd b/man/collate_igblast_results.Rd index 4c9b942e..210a1b78 100644 --- a/man/collate_igblast_results.Rd +++ b/man/collate_igblast_results.Rd @@ -15,7 +15,7 @@ collate_igblast_results( \arguments{ \item{sample_table}{A data frame with sample_id as the first column.} -\item{results_directory}{Directory containging MiXCR/IgBLASTN results. Depends on files having the columns "mutatedStatus" and "numMissingRegions"} +\item{results_directory}{Optional parameter to override default directory specified in config. Path to MiXCR/IgBLAST results files, path requires \code{unix_group}, \code{seq_type_filter} and \code{sample_id} wildcards. Depends on files having the columns "mutatedStatus" and "numMissingRegions".} \item{missing_threshold}{Limit for missing IGH regions during MiXCR assembly. Default is 2. Higher thresholds may return less accurate mutation status.} } @@ -27,6 +27,6 @@ INTERNAL FUNCTION called by collate_results, not meant for out-of-package usage. Identify samples with mutated IGHV from MiXCR + IgBLAST results } \examples{ -sample_table = collate_igblast_results(sample_table = sample_table, results_directory="*/01-mixcr/{seq_type}/", missing_threshold=3) +sample_table = collate_igblast_results(sample_table = sample_table, results_directory="/projects/rmorin/projects/gambl-repos/gambl-*/results/{unix_group}/mixcr-1.2/99-outputs/txt/{seq_type_filter}/mixcr.{sample_id}.clonotypes.IGH.igblast.txt", missing_threshold=3) }