From 92063e18b0999dc3bce0575ad0666bf5cbb0a299 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 17:33:35 +1100 Subject: [PATCH 01/32] pcgr: use central parser --- NAMESPACE | 3 +- R/pcgr.R | 159 ++++++------------------------------------ R/sash.R | 18 +---- R/umccrise.R | 18 +---- man/PcgrJsonFile.Rd | 93 ------------------------ man/PcgrTiersFile.Rd | 93 ------------------------ man/pcgr_json_read.Rd | 22 ++++++ 7 files changed, 49 insertions(+), 357 deletions(-) delete mode 100644 man/PcgrJsonFile.Rd delete mode 100644 man/PcgrTiersFile.Rd create mode 100644 man/pcgr_json_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 6239a8d..5270441 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,8 +6,6 @@ export(BclconvertReports) export(BclconvertReports375) export(File) export(MultiqcFile) -export(PcgrJsonFile) -export(PcgrTiersFile) export(PloidyEstimationMetricsFile) export(Wf) export(Wf_dragen) @@ -59,6 +57,7 @@ export(multiqc_parse_raw_interop) export(multiqc_parse_xyline_plot) export(multiqc_parse_xyline_plot_contig_cvg) export(multiqc_tidy_json) +export(pcgr_json_read) export(rdf2tab) export(read) export(s3_file_presignedurl) diff --git a/R/pcgr.R b/R/pcgr.R index 193fb77..c69be63 100644 --- a/R/pcgr.R +++ b/R/pcgr.R @@ -1,144 +1,29 @@ -#' PcgrJson R6 Class +#' PCGR JSON Read #' -#' @description -#' Contains methods for reading and displaying contents of the -#' `pcgr.json.gz` file output from PCGR. +#' @param x Path to file. #' -#' @examples -#' \dontrun{ -#' x <- "/path/to/pcgr.json.gz" -#' d <- PcgrJsonFile$new(x) -#' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") -#' } -#' @export -PcgrJsonFile <- R6::R6Class( - "PcgrJsonFile", - inherit = File, - public = list( - #' @description - #' Reads the `pcgr.json.gz` file output from PCGR. - #' - #' @return List of tibbles. 
- read = function() { - x <- self$path - j <- read_jsongz_jsonlite(x) - # l2tib <- function(el) { - # purrr::flatten(el) |> - # dplyr::bind_rows() |> - # dplyr::mutate(dplyr::across(dplyr::everything(), ~ as.character(.))) - # } - # dbrel <- j[["metadata"]][["pcgr_db_release"]] |> - # purrr::map(l2tib) |> - # dplyr::bind_rows(.id = "name_tidy") |> - # dplyr::select("name", "name_tidy", "version", "url", "resource_type") - # handle nulls and rename - see umccr/dracarys#99 - tmb <- - j[["content"]][["tmb"]][["variant_statistic"]] %||% - j[["content"]][["tmb"]][["v_stat"]] %||% - list(tmb_estimate = NA, n_tmb = NA) - tmb <- purrr::flatten(tmb) |> - tibble::as_tibble_row() |> - dplyr::select("tmb_estimate", "n_tmb") - msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] - # handle nulls - msi <- msi %||% list(fracIndels = NA, predicted_class = NA) - msi <- purrr::flatten(msi) |> - tibble::as_tibble_row() |> - dplyr::select("fracIndels", "predicted_class") - metrics <- dplyr::bind_cols(msi, tmb) - list( - # using list in case we want other data as well - metrics = metrics - ) - }, - - #' @description - #' Writes a tidy version of the `pcgr.json.gz` file output from PCGR. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - p <- glue("{prefix}_pcgr") - l <- list( - meta = list( - obj = d[["metrics"]], - pref = glue("{p}_metrics") - ) - ) - purrr::map(l, function(k) { - write_dracarys(obj = k[["obj"]], prefix = k[["pref"]], out_format = out_format) - }) - } - ) -) - -#' PcgrTiersFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of the -#' `pcgr.snvs_indels.tiers.tsv` file output from PCGR. 
+#' @return A tibble with: `fracIndels`, `predicted_class`, `tmb_estimate`, `n_tmb`. #' #' @examples #' \dontrun{ -#' x <- "/path/to/pcgr.snvs_indels.tiers.tsv" -#' d <- PcgrTiersFile$new(x) -#' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +#' pcgr_json_read(x) #' } #' @export -PcgrTiersFile <- R6::R6Class( - "PcgrTiersFile", - inherit = File, - public = list( - #' @description - #' Reads the `pcgr.snvs_indels.tiers.tsv` file output from PCGR. - #' - #' @return List of tibbles. - read = function() { - x <- self$path - ct <- readr::cols( - CHROM = "c", POS = "i", REF = "c", ALT = "c", GENOMIC_CHANGE = "c", - GENOME_VERSION = "c", VCF_SAMPLE_ID = "c", VARIANT_CLASS = "c", - SYMBOL = "c", GENE_NAME = "c", CCDS = "c", CANONICAL = "c", - ENTREZ_ID = "d", UNIPROT_ID = "c", ENSEMBL_TRANSCRIPT_ID = "c", - ENSEMBL_GENE_ID = "c", REFSEQ_MRNA = "c", ONCOSCORE = "d", - ONCOGENE = "l", TUMOR_SUPPRESSOR = "l", ONCOGENE_EVIDENCE = "c", - TUMOR_SUPPRESSOR_EVIDENCE = "c", DISGENET_CUI = "c", - DISGENET_TERMS = "c", CONSEQUENCE = "c", PROTEIN_CHANGE = "c", - PROTEIN_DOMAIN = "c", CODING_STATUS = "c", EXONIC_STATUS = "c", - CDS_CHANGE = "c", HGVSp = "c", HGVSc = "c", EFFECT_PREDICTIONS = "c", - MUTATION_HOTSPOT = "c", MUTATION_HOTSPOT_TRANSCRIPT = "c", - MUTATION_HOTSPOT_CANCERTYPE = "c", PUTATIVE_DRIVER_MUTATION = "l", - CHASMPLUS_DRIVER = "c", CHASMPLUS_TTYPE = "c", VEP_ALL_CSQ = "c", - DBSNPRSID = "c", COSMIC_MUTATION_ID = "c", TCGA_PANCANCER_COUNT = "d", - TCGA_FREQUENCY = "c", ICGC_PCAWG_OCCURRENCE = "c", - CHEMBL_COMPOUND_ID = "c", CHEMBL_COMPOUND_TERMS = "c", - SIMPLEREPEATS_HIT = "l", WINMASKER_HIT = "l", OPENTARGETS_RANK = "d", - CLINVAR = "c", CLINVAR_CLNSIG = "c", GLOBAL_AF_GNOMAD = "d", - GLOBAL_AF_1KG = "d", CALL_CONFIDENCE = "l", DP_TUMOR = "d", - AF_TUMOR = "d", DP_CONTROL = "l", AF_CONTROL = "l", TIER = "c", - TIER_DESCRIPTION = "c" - ) - readr::read_tsv(x, col_types = ct) - }, - - #' 
@description - #' Writes a tidy version of the `pcgr.snvs_indels.tiers.tsv` file output from PCGR. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - prefix2 <- glue("{prefix}_tiers") - write_dracarys(obj = d, prefix = prefix2, out_format = out_format) - } - ) -) +pcgr_json_read <- function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) +} diff --git a/R/sash.R b/R/sash.R index 345fc2b..9fc2166 100644 --- a/R/sash.R +++ b/R/sash.R @@ -99,22 +99,8 @@ Wf_sash <- R6::R6Class( #' @description Read `pcgr.json.gz` file. #' @param x Path to file. 
read_pcgr_json = function(x) { - j <- read_jsongz_jsonlite(x) - tmb <- - j[["content"]][["tmb"]][["variant_statistic"]] %||% - j[["content"]][["tmb"]][["v_stat"]] %||% - list(tmb_estimate = NA, n_tmb = NA) - tmb <- purrr::flatten(tmb) |> - tibble::as_tibble_row() |> - dplyr::select("tmb_estimate", "n_tmb") - msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] - # handle nulls - msi <- msi %||% list(fracIndels = NA, predicted_class = NA) - msi <- purrr::flatten(msi) |> - tibble::as_tibble_row() |> - dplyr::select("fracIndels", "predicted_class") - metrics <- dplyr::bind_cols(msi, tmb) - return(metrics) + dat <- pcgr_json_read(x) + tibble::tibble(name = "pcgrjson", data = list(dat)) }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. diff --git a/R/umccrise.R b/R/umccrise.R index efcd5fe..c097a8f 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -101,22 +101,8 @@ Wf_umccrise <- R6::R6Class( #' @description Read `pcgr.json.gz` file. #' @param x Path to file. read_pcgr_json = function(x) { - j <- read_jsongz_jsonlite(x) - tmb <- - j[["content"]][["tmb"]][["variant_statistic"]] %||% - j[["content"]][["tmb"]][["v_stat"]] %||% - list(tmb_estimate = NA, n_tmb = NA) - tmb <- purrr::flatten(tmb) |> - tibble::as_tibble_row() |> - dplyr::select("tmb_estimate", "n_tmb") - msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] - # handle nulls - msi <- msi %||% list(fracIndels = NA, predicted_class = NA) - msi <- purrr::flatten(msi) |> - tibble::as_tibble_row() |> - dplyr::select("fracIndels", "predicted_class") - metrics <- dplyr::bind_cols(msi, tmb) - return(metrics) + dat <- pcgr_json_read(x) + tibble::tibble(name = "pcgrjson", data = list(dat)) }, #' @description Read `chord.tsv.gz` cancer report file. #' @param x Path to file. 
diff --git a/man/PcgrJsonFile.Rd b/man/PcgrJsonFile.Rd deleted file mode 100644 index a554687..0000000 --- a/man/PcgrJsonFile.Rd +++ /dev/null @@ -1,93 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pcgr.R -\name{PcgrJsonFile} -\alias{PcgrJsonFile} -\title{PcgrJson R6 Class} -\description{ -Contains methods for reading and displaying contents of the -\code{pcgr.json.gz} file output from PCGR. -} -\examples{ -\dontrun{ -x <- "/path/to/pcgr.json.gz" -d <- PcgrJsonFile$new(x) -d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") -} -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{PcgrJsonFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-PcgrJsonFile-read}{\code{PcgrJsonFile$read()}} -\item \href{#method-PcgrJsonFile-write}{\code{PcgrJsonFile$write()}} -\item \href{#method-PcgrJsonFile-clone}{\code{PcgrJsonFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrJsonFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{pcgr.json.gz} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrJsonFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -List of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrJsonFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{pcgr.json.gz} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrJsonFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrJsonFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrJsonFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/PcgrTiersFile.Rd b/man/PcgrTiersFile.Rd deleted file mode 100644 index a0dc272..0000000 --- a/man/PcgrTiersFile.Rd +++ /dev/null @@ -1,93 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pcgr.R -\name{PcgrTiersFile} -\alias{PcgrTiersFile} -\title{PcgrTiersFile R6 Class} -\description{ -Contains methods for reading and displaying contents of the -\code{pcgr.snvs_indels.tiers.tsv} file output from PCGR. -} -\examples{ -\dontrun{ -x <- "/path/to/pcgr.snvs_indels.tiers.tsv" -d <- PcgrTiersFile$new(x) -d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") -} -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{PcgrTiersFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-PcgrTiersFile-read}{\code{PcgrTiersFile$read()}} -\item \href{#method-PcgrTiersFile-write}{\code{PcgrTiersFile$write()}} -\item \href{#method-PcgrTiersFile-clone}{\code{PcgrTiersFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrTiersFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{pcgr.snvs_indels.tiers.tsv} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrTiersFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -List of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrTiersFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{pcgr.snvs_indels.tiers.tsv} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrTiersFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrTiersFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrTiersFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/pcgr_json_read.Rd b/man/pcgr_json_read.Rd new file mode 100644 index 0000000..edd1436 --- /dev/null +++ b/man/pcgr_json_read.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pcgr.R +\name{pcgr_json_read} +\alias{pcgr_json_read} +\title{PCGR JSON Read} +\usage{ +pcgr_json_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +A tibble with: \code{fracIndels}, \code{predicted_class}, \code{tmb_estimate}, \code{n_tmb}. +} +\description{ +PCGR JSON Read +} +\examples{ +\dontrun{ +pcgr_json_read(x) +} +} From cae0e6e5933e7087bffa55ec181b9620b66d64a0 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 18:47:26 +1100 Subject: [PATCH 02/32] dragen_subprefix tests --- R/dragen.R | 26 +++++++++++++++--- man/dragen_subprefix.Rd | 27 +++++++++++++++++++ .../test-roxytest-testexamples-dragen.R | 17 ++++++++++++ 3 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 man/dragen_subprefix.Rd create mode 100644 tests/testthat/test-roxytest-testexamples-dragen.R diff --git a/R/dragen.R b/R/dragen.R index 2e6cbbd..d481e82 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -853,13 +853,33 @@ PloidyEstimationMetricsFile <- R6::R6Class( ) ) +#' DRAGEN File Subprefix +#' +#' Extracts a file subprefix for better table naming. +#' +#' @param x File name. +#' @param suffix Suffix to remove. +#' +#' @return Clean string. 
+#' +#' @examples +#' x1 <- "L2401290.exon_contig_mean_cov.csv" +#' x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" +#' x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" +#' (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) +#' (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) +#' (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) +#' @testexamples +#' expect_equal(s1, "exon") +#' expect_equal(s2, "tmb") +#' expect_equal(s3, "bar") dragen_subprefix <- function(x, suffix) { # L2401290.exon_contig_mean_cov.csv -> exon # L2401290.target_bed_contig_mean_cov.csv -> target_bed # L2401290.tmb_contig_mean_cov.csv -> tmb # L2401290.wgs_contig_mean_cov.csv -> wgs + # capture the substring between the first dot and the next dot. bname <- basename(x) - s1 <- tools::file_path_sans_ext(bname) - s2 <- sub(".*\\.(.*)", "\\1", s1) - sub(suffix, "", s2) + s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov + sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" } diff --git a/man/dragen_subprefix.Rd b/man/dragen_subprefix.Rd new file mode 100644 index 0000000..6a25ce2 --- /dev/null +++ b/man/dragen_subprefix.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_subprefix} +\alias{dragen_subprefix} +\title{DRAGEN File Subprefix} +\usage{ +dragen_subprefix(x, suffix) +} +\arguments{ +\item{x}{File name.} + +\item{suffix}{Suffix to remove.} +} +\value{ +Clean string. +} +\description{ +Extracts a file subprefix for better table naming. 
+} +\examples{ +x1 <- "L2401290.exon_contig_mean_cov.csv" +x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" +x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" +(s1 <- dragen_subprefix(x1, "_contig_mean_cov")) +(s2 <- dragen_subprefix(x2, "_contig_mean_cov")) +(s3 <- dragen_subprefix(x3, "_contig_mean_cov")) +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R new file mode 100644 index 0000000..7305361 --- /dev/null +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -0,0 +1,17 @@ +# Generated by roxytest: do not edit by hand! + +# File R/dragen.R: @testexamples + +test_that("Function dragen_subprefix() @ L876", { + + x1 <- "L2401290.exon_contig_mean_cov.csv" + x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" + x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" + (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) + (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) + (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) + expect_equal(s1, "exon") + expect_equal(s2, "tmb") + expect_equal(s3, "bar") +}) + From ab4680c1261f6aaaa0c98ba780477ff3530d362e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 19:21:51 +1100 Subject: [PATCH 03/32] sigs: use single parser --- R/sash.R | 68 +++++++++++++++++++++--------------------- man/Wf_sash.Rd | 80 +++----------------------------------------------- 2 files changed, 37 insertions(+), 111 deletions(-) diff --git a/R/sash.R b/R/sash.R index 9fc2166..19f677e 100644 --- a/R/sash.R +++ b/R/sash.R @@ -7,11 +7,11 @@ #' \dontrun{ #' #' #---- Local ----# -#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" +#' p2 <- "202408309698c304/L2300777_L2300776" #' p <- normalizePath(file.path(p1, p2)) -#' SubjectID <- "SBJ05571" -#' SampleID_tumor <- "MDX240307" +#' SubjectID <- "SBJ03324" +#' SampleID_tumor <- "PRJ230432" #' prefix <- 
glue("{SubjectID}__{SampleID_tumor}") #' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) #' s1$list_files(max_files = 20) @@ -69,10 +69,10 @@ Wf_sash <- R6::R6Class( glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json" ) |> @@ -100,13 +100,14 @@ Wf_sash <- R6::R6Class( #' @param x Path to file. read_pcgr_json = function(x) { dat <- pcgr_json_read(x) - tibble::tibble(name = "pcgrjson", data = list(dat)) + tibble::tibble(name = "pcgrjson", data = list(dat[])) }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. read_hrd_dragen = function(x) { ct <- readr::cols(.default = "d", Sample = "c") - read_tsvgz(x, col_types = ct) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = "hrddragen", data = list(dat[])) }, #' @description Read `chord.tsv.gz` cancer report hrd file. #' @param x Path to file. 
@@ -118,7 +119,8 @@ Wf_sash <- R6::R6Class( p_BRCA1 = "d", p_BRCA2 = "d" ) - read_tsvgz(x, col_types = ct) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = "hrdchord", data = list(dat[])) }, #' @description Read `hrdetect.tsv.gz` cancer report hrd file. #' @param x Path to file. @@ -127,43 +129,26 @@ Wf_sash <- R6::R6Class( .default = "d", sample = "c" ) - read_tsvgz(x, col_types = ct) |> + dat <- read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) + tibble::tibble(name = "hrdhrdetect", data = list(dat[])) }, #' @description Read signature cancer report file. #' @param x Path to file. read_sigstsv = function(x) { + suffix <- private$sigs_suffix(x) ct <- readr::cols( .default = "d", Signature = "c" ) - read_tsvgz(x, col_types = ct) - }, - #' @description Read `snv_2015.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2015 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `snv_2020.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2020 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `dbs.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_dbs = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `indel.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_indel = function(x) { - self$read_sigstsv(x) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = glue("sigs_{suffix}"), data = list(dat[])) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. 
read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) - d |> + dat <- d |> dplyr::select("variable", "value") |> tidyr::pivot_wider(names_from = "variable", values_from = "value") |> dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> @@ -187,8 +172,21 @@ Wf_sash <- R6::R6Class( wgd_hmf = "WGD", "hypermutated" ) + tibble::tibble(name = glue("qcsum"), data = list(dat[])) } - ) # end public + ), # end public + private = list( + sigs_suffix = function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + ) ) #' sash Download Tidy and Write diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 1caa51a..f236de8 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -10,11 +10,11 @@ Reads and writes tidy versions of files from the \code{sash} workflow \dontrun{ #---- Local ----# -p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -p2 <- "202408270b93455e/L2401308_L2401307" +p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" +p2 <- "202408309698c304/L2300777_L2300776" p <- normalizePath(file.path(p1, p2)) -SubjectID <- "SBJ05571" -SampleID_tumor <- "MDX240307" +SubjectID <- "SBJ03324" +SampleID_tumor <- "PRJ230432" prefix <- glue("{SubjectID}__{SampleID_tumor}") s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) s1$list_files(max_files = 20) @@ -72,10 +72,6 @@ d_write <- s1$write( \item \href{#method-Wf_sash-read_hrd_chord}{\code{Wf_sash$read_hrd_chord()}} \item \href{#method-Wf_sash-read_hrd_hrdetect}{\code{Wf_sash$read_hrd_hrdetect()}} \item \href{#method-Wf_sash-read_sigstsv}{\code{Wf_sash$read_sigstsv()}} -\item \href{#method-Wf_sash-read_sigs_snv2015}{\code{Wf_sash$read_sigs_snv2015()}} -\item \href{#method-Wf_sash-read_sigs_snv2020}{\code{Wf_sash$read_sigs_snv2020()}} -\item 
\href{#method-Wf_sash-read_sigs_dbs}{\code{Wf_sash$read_sigs_dbs()}} -\item \href{#method-Wf_sash-read_sigs_indel}{\code{Wf_sash$read_sigs_indel()}} \item \href{#method-Wf_sash-read_qcsum}{\code{Wf_sash$read_qcsum()}} \item \href{#method-Wf_sash-clone}{\code{Wf_sash$clone()}} } @@ -207,74 +203,6 @@ Read signature cancer report file. \if{html}{\out{
}}\preformatted{Wf_sash$read_sigstsv(x)}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_snv2015}{}}} -\subsection{Method \code{read_sigs_snv2015()}}{ -Read \code{snv_2015.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_snv2015(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_snv2020}{}}} -\subsection{Method \code{read_sigs_snv2020()}}{ -Read \code{snv_2020.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_snv2020(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_dbs}{}}} -\subsection{Method \code{read_sigs_dbs()}}{ -Read \code{dbs.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_dbs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_indel}{}}} -\subsection{Method \code{read_sigs_indel()}}{ -Read \code{indel.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_indel(x)}\if{html}{\out{
}} -} - \subsection{Arguments}{ \if{html}{\out{
}} \describe{ From 1ea69787ba13e6dfa25b53c12eff06a226c1c4b8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 23:55:59 +1100 Subject: [PATCH 04/32] sigs: use single parser --- R/umccrise.R | 58 ++++++++++++++++++------------------- man/Wf_umccrise.Rd | 72 ---------------------------------------------- 2 files changed, 28 insertions(+), 102 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index c097a8f..e372b75 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -70,10 +70,10 @@ Wf_umccrise <- R6::R6Class( ~regex, ~fun, glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpairmultiqc", glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgr_json" @@ -114,7 +114,8 @@ Wf_umccrise <- R6::R6Class( p_BRCA1 = "d", p_BRCA2 = "d" ) - read_tsvgz(x, col_types = ct) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = "hrdchord", data = list(dat[])) }, #' @description Read `hrdetect.tsv.gz` cancer report file. #' @param x Path to file. 
@@ -123,43 +124,26 @@ Wf_umccrise <- R6::R6Class( .default = "d", sample = "c" ) - read_tsvgz(x, col_types = ct) |> + dat <- read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) + tibble::tibble(name = "hrdhrdetect", data = list(dat[])) }, #' @description Read signature cancer report file. #' @param x Path to file. read_sigstsv = function(x) { + suffix <- private$sigs_suffix(x) ct <- readr::cols( .default = "d", Signature = "c" ) - read_tsvgz(x, col_types = ct) - }, - #' @description Read `snv_2015.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2015 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `snv_2020.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2020 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `dbs.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_dbs = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `indel.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_indel = function(x) { - self$read_sigstsv(x) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = glue("sigs_{suffix}"), data = list(dat[])) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) - d |> + dat <- d |> dplyr::select("variable", "value") |> tidyr::pivot_wider(names_from = "variable", values_from = "value") |> dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> @@ -188,6 +172,7 @@ Wf_umccrise <- R6::R6Class( wgd_hmf = "WGD", "hypermutated", "bpi_enabled" ) + tibble::tibble(name = glue("qcsum"), data = list(dat[])) }, #' @description Read multiqc_conpair.txt file. #' @param x Path to file. 
@@ -216,12 +201,25 @@ Wf_umccrise <- R6::R6Class( } d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) assertthat::assert_that(all(colnames(d1) == cnames$old)) - d1 |> + dat <- d1 |> dplyr::filter(!.data$Sample %in% um_ref_samples) |> dplyr::relocate("contamination", .after = "Sample") |> rlang::set_names(cnames$new) + tibble::tibble(name = glue("conpair"), data = list(dat[])) } - ) # end public + ), # end public + private = list( + sigs_suffix = function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + ) ) #' umccrise Download Tidy and Write diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 131a62e..76a6b1e 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -73,10 +73,6 @@ d_write <- um2$write( \item \href{#method-Wf_umccrise-read_hrd_chord}{\code{Wf_umccrise$read_hrd_chord()}} \item \href{#method-Wf_umccrise-read_hrd_hrdetect}{\code{Wf_umccrise$read_hrd_hrdetect()}} \item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} -\item \href{#method-Wf_umccrise-read_sigs_snv2015}{\code{Wf_umccrise$read_sigs_snv2015()}} -\item \href{#method-Wf_umccrise-read_sigs_snv2020}{\code{Wf_umccrise$read_sigs_snv2020()}} -\item \href{#method-Wf_umccrise-read_sigs_dbs}{\code{Wf_umccrise$read_sigs_dbs()}} -\item \href{#method-Wf_umccrise-read_sigs_indel}{\code{Wf_umccrise$read_sigs_indel()}} \item \href{#method-Wf_umccrise-read_qcsum}{\code{Wf_umccrise$read_qcsum()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} @@ -192,74 +188,6 @@ Read signature cancer report file. \if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_snv2015}{}}} -\subsection{Method \code{read_sigs_snv2015()}}{ -Read \code{snv_2015.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_snv2015(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_snv2020}{}}} -\subsection{Method \code{read_sigs_snv2020()}}{ -Read \code{snv_2020.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_snv2020(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_dbs}{}}} -\subsection{Method \code{read_sigs_dbs()}}{ -Read \code{dbs.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_dbs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_indel}{}}} -\subsection{Method \code{read_sigs_indel()}}{ -Read \code{indel.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_indel(x)}\if{html}{\out{
}} -} - \subsection{Arguments}{ \if{html}{\out{
}} \describe{ From 4297519d3d1fe8a85d52ea680a2f02d239f21818 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 20 Oct 2024 00:00:53 +1100 Subject: [PATCH 05/32] make dragen_subprefix private --- R/dragen.R | 31 ------------------- R/tso_dragen.R | 23 +++++++++----- man/dragen_subprefix.Rd | 27 ---------------- .../test-roxytest-testexamples-dragen.R | 17 ---------- 4 files changed, 16 insertions(+), 82 deletions(-) delete mode 100644 man/dragen_subprefix.Rd delete mode 100644 tests/testthat/test-roxytest-testexamples-dragen.R diff --git a/R/dragen.R b/R/dragen.R index d481e82..b198cb8 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -852,34 +852,3 @@ PloidyEstimationMetricsFile <- R6::R6Class( } ) ) - -#' DRAGEN File Subprefix -#' -#' Extracts a file subprefix for better table naming. -#' -#' @param x File name. -#' @param suffix Suffix to remove. -#' -#' @return Clean string. -#' -#' @examples -#' x1 <- "L2401290.exon_contig_mean_cov.csv" -#' x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" -#' x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" -#' (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) -#' (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) -#' (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) -#' @testexamples -#' expect_equal(s1, "exon") -#' expect_equal(s2, "tmb") -#' expect_equal(s3, "bar") -dragen_subprefix <- function(x, suffix) { - # L2401290.exon_contig_mean_cov.csv -> exon - # L2401290.target_bed_contig_mean_cov.csv -> target_bed - # L2401290.tmb_contig_mean_cov.csv -> tmb - # L2401290.wgs_contig_mean_cov.csv -> wgs - # capture the substring between the first dot and the next dot. - bname <- basename(x) - s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov - sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" -} diff --git a/R/tso_dragen.R b/R/tso_dragen.R index dc35091..6df67d9 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -117,7 +117,7 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. #' @param keep_alt Keep ALT contigs. 
read_contigMeanCov = function(x, keep_alt = FALSE) { - subprefix <- dragen_subprefix(x, "_contig_mean_cov") + subprefix <- private$dragen_subprefix(x, "_contig_mean_cov") dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> dplyr::filter( if (!keep_alt) { @@ -131,14 +131,14 @@ Wf_dragen <- R6::R6Class( #' @description Read `coverage_metrics.csv` file. #' @param x Path to file. read_coverageMetrics = function(x) { - subprefix <- dragen_subprefix(x, "_coverage_metrics") + subprefix <- private$dragen_subprefix(x, "_coverage_metrics") dat <- dragen_coverage_metrics_read(x) tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) }, #' @description Read `fine_hist.csv` file. #' @param x Path to file. read_fineHist = function(x) { - subprefix <- dragen_subprefix(x, "_fine_hist") + subprefix <- private$dragen_subprefix(x, "_fine_hist") d <- readr::read_csv(x, col_types = "cd") assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) # there's a max Depth of 2000+, so convert to numeric for easier plotting @@ -175,7 +175,7 @@ Wf_dragen <- R6::R6Class( #' @description Read `hist.csv` (not `fine_hist.csv`!) file. #' @param x Path to file. read_hist = function(x) { - subprefix <- dragen_subprefix(x, "_hist") + subprefix <- private$dragen_subprefix(x, "_hist") d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") dat <- d |> dplyr::mutate( @@ -197,7 +197,9 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. 
read_timeMetrics = function(x) { cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") - ct <- readr::cols(.default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d") + ct <- readr::cols( + .default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d" + ) d <- readr::read_csv(x, col_names = cn, col_types = ct) assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) assertthat::assert_that(inherits(d$time_hrs, "hms")) @@ -215,7 +217,7 @@ Wf_dragen <- R6::R6Class( #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. #' @param x Path to file. read_vcMetrics = function(x) { - subprefix <- dragen_subprefix(x, "_metrics") + subprefix <- private$dragen_subprefix(x, "_metrics") dat <- dragen_vc_metrics_read(x) tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) }, @@ -268,5 +270,12 @@ Wf_dragen <- R6::R6Class( dplyr::rename(Chromosome = "#Chromosome") tibble::tibble(name = "msidiffs", data = list(dat[])) } - ) # end public + ), # end public + private = list( + dragen_subprefix = function(x, suffix) { + bname <- basename(x) + s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov + sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" + } + ) ) # end Wf_dragen diff --git a/man/dragen_subprefix.Rd b/man/dragen_subprefix.Rd deleted file mode 100644 index 6a25ce2..0000000 --- a/man/dragen_subprefix.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{dragen_subprefix} -\alias{dragen_subprefix} -\title{DRAGEN File Subprefix} -\usage{ -dragen_subprefix(x, suffix) -} -\arguments{ -\item{x}{File name.} - -\item{suffix}{Suffix to remove.} -} -\value{ -Clean string. -} -\description{ -Extracts a file subprefix for better table naming. 
-} -\examples{ -x1 <- "L2401290.exon_contig_mean_cov.csv" -x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" -x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" -(s1 <- dragen_subprefix(x1, "_contig_mean_cov")) -(s2 <- dragen_subprefix(x2, "_contig_mean_cov")) -(s3 <- dragen_subprefix(x3, "_contig_mean_cov")) -} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R deleted file mode 100644 index 7305361..0000000 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ /dev/null @@ -1,17 +0,0 @@ -# Generated by roxytest: do not edit by hand! - -# File R/dragen.R: @testexamples - -test_that("Function dragen_subprefix() @ L876", { - - x1 <- "L2401290.exon_contig_mean_cov.csv" - x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" - x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" - (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) - (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) - (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) - expect_equal(s1, "exon") - expect_equal(s2, "tmb") - expect_equal(s3, "bar") -}) - From cb5c41436450b4bacdc46e82740437e4a512fcec Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 20 Oct 2024 00:12:03 +1100 Subject: [PATCH 06/32] sigs: refactor --- R/Wf.R | 3 ++- R/sash.R | 38 ++++++++++++++--------------- R/umccrise.R | 57 ++++++++++++++++++++++--------------------- man/Wf_sash.Rd | 60 +++++++++++++++++++++++----------------------- man/Wf_umccrise.Rd | 60 +++++++++++++++++++++++----------------------- 5 files changed, 108 insertions(+), 110 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 9d35956..089d575 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -199,7 +199,8 @@ Wf <- R6::R6Class( #' @param x Tibble with `localpath` to file and the function `type` to parse it. tidy_files = function(x) { # awesomeness - tidy_files(x, envir = self) + tidy_files(x, envir = self) |> + dplyr::arrange(.data$name) }, #' @description Write tidy data. #' @param x Tibble with tidy `data` list-column. 
diff --git a/R/sash.R b/R/sash.R index 19f677e..725d36c 100644 --- a/R/sash.R +++ b/R/sash.R @@ -98,20 +98,20 @@ Wf_sash <- R6::R6Class( }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgr_json = function(x) { + read_pcgrJson = function(x) { dat <- pcgr_json_read(x) tibble::tibble(name = "pcgrjson", data = list(dat[])) }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrd_dragen = function(x) { + read_hrdDragen = function(x) { ct <- readr::cols(.default = "d", Sample = "c") dat <- read_tsvgz(x, col_types = ct) tibble::tibble(name = "hrddragen", data = list(dat[])) }, #' @description Read `chord.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrd_chord = function(x) { + read_hrdChord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -124,7 +124,7 @@ Wf_sash <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrd_hrdetect = function(x) { + read_hrdHrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -135,8 +135,18 @@ Wf_sash <- R6::R6Class( }, #' @description Read signature cancer report file. #' @param x Path to file. - read_sigstsv = function(x) { - suffix <- private$sigs_suffix(x) + read_sigsTsv = function(x) { + .sigsSuffix <- function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + suffix <- .sigsSuffix(x) ct <- readr::cols( .default = "d", Signature = "c" @@ -146,7 +156,7 @@ Wf_sash <- R6::R6Class( }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. 
- read_qcsum = function(x) { + read_qcSum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) dat <- d |> dplyr::select("variable", "value") |> @@ -174,19 +184,7 @@ Wf_sash <- R6::R6Class( ) tibble::tibble(name = glue("qcsum"), data = list(dat[])) } - ), # end public - private = list( - sigs_suffix = function(x) { - x <- basename(x) - dplyr::case_when( - grepl("-dbs", x) ~ "dbs", - grepl("-indel", x) ~ "ind", - grepl("-snv_2015", x) ~ "snv2015", - grepl("-snv_2020", x) ~ "snv2020", - .default = "" - ) - } - ) + ) # end public ) #' sash Download Tidy and Write diff --git a/R/umccrise.R b/R/umccrise.R index e372b75..0c0c296 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -68,15 +68,15 @@ Wf_umccrise <- R6::R6Class( crep <- "cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", - glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", - glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpairmultiqc", - glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgr_json" + glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", + glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", + glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), 
"conpair", + glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -100,13 +100,13 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgr_json = function(x) { + read_pcgrJson = function(x) { dat <- pcgr_json_read(x) tibble::tibble(name = "pcgrjson", data = list(dat)) }, #' @description Read `chord.tsv.gz` cancer report file. #' @param x Path to file. - read_hrd_chord = function(x) { + read_hrdChord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -119,7 +119,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report file. #' @param x Path to file. - read_hrd_hrdetect = function(x) { + read_hrdHrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -130,8 +130,19 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read signature cancer report file. #' @param x Path to file. - read_sigstsv = function(x) { - suffix <- private$sigs_suffix(x) + read_sigsTsv = function(x) { + .sigsSuffix <- function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + + suffix <- .sigsSuffix(x) ct <- readr::cols( .default = "d", Signature = "c" @@ -141,7 +152,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. - read_qcsum = function(x) { + read_qcSum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) dat <- d |> dplyr::select("variable", "value") |> @@ -176,7 +187,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read multiqc_conpair.txt file. #' @param x Path to file. 
- read_conpairmultiqc = function(x) { + read_conpair = function(x) { um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) cnames <- list( @@ -207,19 +218,7 @@ Wf_umccrise <- R6::R6Class( rlang::set_names(cnames$new) tibble::tibble(name = glue("conpair"), data = list(dat[])) } - ), # end public - private = list( - sigs_suffix = function(x) { - x <- basename(x) - dplyr::case_when( - grepl("-dbs", x) ~ "dbs", - grepl("-indel", x) ~ "ind", - grepl("-snv_2015", x) ~ "snv2015", - grepl("-snv_2020", x) ~ "snv2020", - .default = "" - ) - } - ) + ) # end public ) #' umccrise Download Tidy and Write diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index f236de8..1b7c81f 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -67,12 +67,12 @@ d_write <- s1$write( \itemize{ \item \href{#method-Wf_sash-new}{\code{Wf_sash$new()}} \item \href{#method-Wf_sash-print}{\code{Wf_sash$print()}} -\item \href{#method-Wf_sash-read_pcgr_json}{\code{Wf_sash$read_pcgr_json()}} -\item \href{#method-Wf_sash-read_hrd_dragen}{\code{Wf_sash$read_hrd_dragen()}} -\item \href{#method-Wf_sash-read_hrd_chord}{\code{Wf_sash$read_hrd_chord()}} -\item \href{#method-Wf_sash-read_hrd_hrdetect}{\code{Wf_sash$read_hrd_hrdetect()}} -\item \href{#method-Wf_sash-read_sigstsv}{\code{Wf_sash$read_sigstsv()}} -\item \href{#method-Wf_sash-read_qcsum}{\code{Wf_sash$read_qcsum()}} +\item \href{#method-Wf_sash-read_pcgrJson}{\code{Wf_sash$read_pcgrJson()}} +\item \href{#method-Wf_sash-read_hrdDragen}{\code{Wf_sash$read_hrdDragen()}} +\item \href{#method-Wf_sash-read_hrdChord}{\code{Wf_sash$read_hrdChord()}} +\item \href{#method-Wf_sash-read_hrdHrdetect}{\code{Wf_sash$read_hrdHrdetect()}} +\item \href{#method-Wf_sash-read_sigsTsv}{\code{Wf_sash$read_sigsTsv()}} +\item \href{#method-Wf_sash-read_qcSum}{\code{Wf_sash$read_qcSum()}} \item \href{#method-Wf_sash-clone}{\code{Wf_sash$clone()}} } } @@ -127,12 +127,12 @@ Print 
details about the Workflow. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgr_json}{}}} -\subsection{Method \code{read_pcgr_json()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgrJson}{}}} +\subsection{Method \code{read_pcgrJson()}}{ Read \code{pcgr.json.gz} file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgr_json(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgrJson(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -144,12 +144,12 @@ Read \code{pcgr.json.gz} file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_dragen}{}}} -\subsection{Method \code{read_hrd_dragen()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdDragen}{}}} +\subsection{Method \code{read_hrdDragen()}}{ Read \code{dragen.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_dragen(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdDragen(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -161,12 +161,12 @@ Read \code{dragen.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_chord}{}}} -\subsection{Method \code{read_hrd_chord()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdChord}{}}} +\subsection{Method \code{read_hrdChord()}}{ Read \code{chord.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_chord(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdChord(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -178,12 +178,12 @@ Read \code{chord.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_hrdetect}{}}} -\subsection{Method \code{read_hrd_hrdetect()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdHrdetect}{}}} +\subsection{Method \code{read_hrdHrdetect()}}{ Read \code{hrdetect.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_hrdetect(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdHrdetect(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -195,12 +195,12 @@ Read \code{hrdetect.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigstsv}{}}} -\subsection{Method \code{read_sigstsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigsTsv}{}}} +\subsection{Method \code{read_sigsTsv()}}{ Read signature cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigstsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigsTsv(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -212,12 +212,12 @@ Read signature cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcsum}{}}} -\subsection{Method \code{read_qcsum()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcSum}{}}} +\subsection{Method \code{read_qcSum()}}{ Read \code{qc_summary.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_qcsum(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_qcSum(x)}\if{html}{\out{
}} } \subsection{Arguments}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 76a6b1e..f9e92e2 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -69,12 +69,12 @@ d_write <- um2$write( \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} \item \href{#method-Wf_umccrise-print}{\code{Wf_umccrise$print()}} -\item \href{#method-Wf_umccrise-read_pcgr_json}{\code{Wf_umccrise$read_pcgr_json()}} -\item \href{#method-Wf_umccrise-read_hrd_chord}{\code{Wf_umccrise$read_hrd_chord()}} -\item \href{#method-Wf_umccrise-read_hrd_hrdetect}{\code{Wf_umccrise$read_hrd_hrdetect()}} -\item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} -\item \href{#method-Wf_umccrise-read_qcsum}{\code{Wf_umccrise$read_qcsum()}} -\item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} +\item \href{#method-Wf_umccrise-read_pcgrJson}{\code{Wf_umccrise$read_pcgrJson()}} +\item \href{#method-Wf_umccrise-read_hrdChord}{\code{Wf_umccrise$read_hrdChord()}} +\item \href{#method-Wf_umccrise-read_hrdHrdetect}{\code{Wf_umccrise$read_hrdHrdetect()}} +\item \href{#method-Wf_umccrise-read_sigsTsv}{\code{Wf_umccrise$read_sigsTsv()}} +\item \href{#method-Wf_umccrise-read_qcSum}{\code{Wf_umccrise$read_qcSum()}} +\item \href{#method-Wf_umccrise-read_conpair}{\code{Wf_umccrise$read_conpair()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } } @@ -129,12 +129,12 @@ Print details about the Workflow. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgr_json}{}}} -\subsection{Method \code{read_pcgr_json()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgrJson}{}}} +\subsection{Method \code{read_pcgrJson()}}{ Read \code{pcgr.json.gz} file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgr_json(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgrJson(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -146,12 +146,12 @@ Read \code{pcgr.json.gz} file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrd_chord}{}}} -\subsection{Method \code{read_hrd_chord()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdChord}{}}} +\subsection{Method \code{read_hrdChord()}}{ Read \code{chord.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrd_chord(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdChord(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -163,12 +163,12 @@ Read \code{chord.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrd_hrdetect}{}}} -\subsection{Method \code{read_hrd_hrdetect()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdHrdetect}{}}} +\subsection{Method \code{read_hrdHrdetect()}}{ Read \code{hrdetect.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrd_hrdetect(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdHrdetect(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -180,12 +180,12 @@ Read \code{hrdetect.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} -\subsection{Method \code{read_sigstsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsTsv}{}}} +\subsection{Method \code{read_sigsTsv()}}{ Read signature cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsTsv(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -197,12 +197,12 @@ Read signature cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsum}{}}} -\subsection{Method \code{read_qcsum()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcSum}{}}} +\subsection{Method \code{read_qcSum()}}{ Read \code{qc_summary.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsum(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcSum(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -214,12 +214,12 @@ Read \code{qc_summary.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpairmultiqc}{}}} -\subsection{Method \code{read_conpairmultiqc()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpair}{}}} +\subsection{Method \code{read_conpair()}}{ Read multiqc_conpair.txt file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_conpairmultiqc(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_conpair(x)}\if{html}{\out{
}} } \subsection{Arguments}{ From eafd6cbf7b726e60ad669cdc6160e3454147b605 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 20 Oct 2024 23:57:48 +1100 Subject: [PATCH 07/32] umccrise: use full path for regexes --- R/umccrise.R | 63 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index 0c0c296..dc4b5d9 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -25,14 +25,18 @@ #' ) #' #' #---- GDS ----# -#' SubjectID <- "SBJ03043" -#' SampleID_tumor <- "PRJ230004" +#' SubjectID <- "SBJ03606" +#' SampleID_tumor <- "PRJ230726" +#' SampleID_normal <- "PRJ230725" #' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' p1_gds <- "gds://production/analysis_data" -#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um2 <- Wf_umccrise$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) #' um2$list_files(max_files = 8) #' um2$list_files_filter_relevant(ica_token = token, max_files = 500) #' d <- um2$download_files( @@ -53,36 +57,51 @@ Wf_umccrise <- R6::R6Class( "Wf_umccrise", inherit = Wf, public = list( - #' @field SubjectID The SubjectID of the sample (needed for path lookup). - #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + #' @field SubjectID The SubjectID of the sample. + #' @field SampleID_tumor The SampleID of the tumor sample. + #' @field SampleID_normal The SampleID of the normal sample. SubjectID = NULL, SampleID_tumor = NULL, + SampleID_normal = NULL, #' @description Create a new Wf_umccrise object. 
#' @param path Path to directory with raw workflow results (from GDS, S3, or
#' local filesystem).
-    #' @param SubjectID The SubjectID of the sample (needed for path lookup).
-    #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup).
-    initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) {
+    #' @param SubjectID The SubjectID of the sample.
+    #' @param SampleID_tumor The SampleID of the tumor sample.
+    #' @param SampleID_normal The SampleID of the normal sample.
+    initialize = function(path = NULL, SubjectID = NULL,
+                          SampleID_tumor = NULL, SampleID_normal = NULL) {
      wname <- "umccrise"
      pref <- glue("{SubjectID}__{SampleID_tumor}")
+      pref_norm <- glue("{SubjectID}__{SampleID_normal}")
      crep <- "cancer_report_tables"
+      smallv <- "small_variants"
      regexes <- tibble::tribble(
        ~regex, ~fun,
-        glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord",
-        glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect",
-        glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum",
-        glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpair",
-        glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson"
+        glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord",
+        glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum",
+        
glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpair", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson", + glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.somatic\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY" ) |> - dplyr::mutate(fun = paste0("read_", .data$fun)) - + dplyr::mutate( + fun = paste0("read_", .data$fun), + fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor + self$SampleID_normal <- SampleID_normal }, #' @description Print details about the Workflow. #' @param ... (ignored). 
@@ -93,7 +112,8 @@ Wf_umccrise <- R6::R6Class( "wname", self$wname, "filesystem", self$filesystem, "SubjectID", self$SubjectID, - "SampleID_tumor", self$SampleID_tumor + "SampleID_tumor", self$SampleID_tumor, + "SampleID_normal", self$SampleID_normal ) print(res) invisible(self) @@ -141,7 +161,6 @@ Wf_umccrise <- R6::R6Class( .default = "" ) } - suffix <- .sigsSuffix(x) ct <- readr::cols( .default = "d", From d08942dcbe114aace92e02a50c0d8cc027752800 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 22 Oct 2024 18:13:38 +1100 Subject: [PATCH 08/32] Wf: add active regex bindings + private fields --- R/Wf.R | 78 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 089d575..6fe882d 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -61,16 +61,26 @@ #' @export Wf <- R6::R6Class( "Wf", + private = list( + .path = NULL, + .wname = NULL, + .regexes = NULL, + .filesystem = NULL + ), + active = list( + regexes = function(value) { + if (missing(value)) { + private$.regexes + } else { + assertthat::assert_that( + tibble::is_tibble(value), + all(c("regex", "fun") %in% colnames(value)) + ) + private$.regexes <- value + } + } + ), public = list( - #' @field path Path to directory with raw workflow results (from GDS, S3, or - #' local filesystem). - #' @field wname Name of workflow (e.g. umccrise, sash). - #' @field filesystem Filesystem of `path` (gds/s3/local). - #' @field regexes Tibble with file `regex` and `fun`ction to parse it. - path = NULL, - wname = NULL, - filesystem = NULL, - regexes = NULL, #' @description Create a new Workflow object. #' @param path Path to directory with raw workflow results. #' @param wname Name of workflow. 
@@ -94,23 +104,28 @@ Wf <- R6::R6Class( ) subwnames <- c("dragen") assertthat::assert_that(wname %in% c(wnames, subwnames)) - self$path <- sub("/$", "", path) # remove potential trailing slash - self$wname <- wname - self$filesystem <- dplyr::case_when( + private$.path <- sub("/$", "", path) # remove potential trailing slash + private$.wname <- wname + private$.filesystem <- dplyr::case_when( grepl("^gds://", path) ~ "gds", grepl("^s3://", path) ~ "s3", .default = "local" ) - self$regexes <- regexes + assertthat::assert_that( + tibble::is_tibble(regexes), + all(c("regex", "fun") %in% colnames(regexes)) + ) + private$.regexes <- regexes }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "nregexes", as.character(nrow(private$.regexes)) ) print(res) invisible(self) @@ -120,13 +135,13 @@ Wf <- R6::R6Class( #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to `gds_list_files_dir` function. - list_files = function(path = self$path, max_files = 1000, + list_files = function(path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - if (self$filesystem == "gds") { + if (private$.filesystem == "gds") { d <- gds_list_files_dir( gdsdir = path, token = ica_token, page_size = max_files, ... ) - } else if (self$filesystem == "s3") { + } else if (private$.filesystem == "s3") { d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { d <- local_list_files_dir(localdir = path, max_files = max_files) @@ -139,15 +154,15 @@ Wf <- R6::R6Class( #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... 
Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(path = self$path, max_files = 1000, + list_files_filter_relevant = function(path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - regexes <- self$regexes + regexes <- private$.regexes assertthat::assert_that(!is.null(regexes)) - if (self$filesystem == "gds") { + if (private$.filesystem == "gds") { d <- gds_list_files_filter_relevant( gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... ) - } else if (self$filesystem == "s3") { + } else if (private$.filesystem == "s3") { d <- s3_list_files_filter_relevant( s3dir = path, regexes = regexes, max_objects = max_files, ... ) @@ -167,28 +182,27 @@ Wf <- R6::R6Class( #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). 
- download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + download_files = function(path = private$.path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, recursive = NULL) { - # TODO: add envvar checker - regexes <- self$regexes + regexes <- private$.regexes assertthat::assert_that(!is.null(regexes)) - if (self$filesystem == "gds") { + if (private$.filesystem == "gds") { d <- dr_gds_download( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, page_size = max_files, dryrun = dryrun, recursive = recursive ) if (!dryrun) { - self$filesystem <- "local" - self$path <- outdir + private$.filesystem <- "local" + private$.path <- outdir } - } else if (self$filesystem == "s3") { + } else if (private$.filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, max_objects = max_files, dryrun = dryrun ) if (!dryrun) { - self$filesystem <- "local" - self$path <- outdir + private$.filesystem <- "local" + private$.path <- outdir } } else { d <- self$list_files_filter_relevant(regexes = regexes, max_files = max_files) From 483313ed0b0ec8d54045b87519a45a7b695ebfe4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 22 Oct 2024 18:44:45 +1100 Subject: [PATCH 09/32] update man/ --- man/Wf.Rd | 20 +++----- man/Wf_sash.Rd | 77 +++++++++++++++++++++++------- man/Wf_sash_download_tidy_write.Rd | 10 +++- man/Wf_umccrise.Rd | 31 ++++++++---- 4 files changed, 98 insertions(+), 40 deletions(-) diff --git a/man/Wf.Rd b/man/Wf.Rd index b2c99f3..7e98ebf 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -68,17 +68,11 @@ d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryr } } -\section{Public fields}{ -\if{html}{\out{
}} +\section{Active bindings}{ +\if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or -local filesystem).} - -\item{\code{wname}}{Name of workflow (e.g. umccrise, sash).} - -\item{\code{filesystem}}{Filesystem of \code{path} (gds/s3/local).} - -\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} +\item{\code{regexes}}{Get/Set regexes. Tibble with file \code{regex} and \code{fun}ction +to parse it.} } \if{html}{\out{
}} } @@ -140,7 +134,7 @@ Print details about the Workflow. List all files under given path. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files( - path = self$path, + path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -168,7 +162,7 @@ List all files under given path. List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( - path = self$path, + path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -197,7 +191,7 @@ the \code{s3_list_files_filter_relevant} function.} Download files from GDS/S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$download_files( - path = self$path, + path = private$.path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 1b7c81f..1ae526f 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -10,13 +10,27 @@ Reads and writes tidy versions of files from the \code{sash} workflow \dontrun{ #---- Local ----# -p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" -p2 <- "202408309698c304/L2300777_L2300776" +p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +p2 <- "20240830ece6b0b7/L2201449_L2201450" p <- normalizePath(file.path(p1, p2)) -SubjectID <- "SBJ03324" -SampleID_tumor <- "PRJ230432" -prefix <- glue("{SubjectID}__{SampleID_tumor}") -s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +SubjectID <- "SBJ02862" +SampleID_tumor <- "PRJ222112" +SampleID_normal <- "PRJ222114" +prefix <- glue("{SubjectID}_{SampleID_tumor}") +s1 <- Wf_sash$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) +#-- test regexes active binding +counts1 <- glue( + "{p}/{prefix}/smlv_somatic/report/", + "{SampleID_tumor}\\\\.somatic\\\\.variant_counts_process\\\\.json$" +) +regexes1 <- tibble::tribble( + ~regex, ~fun, + counts1, "read_smlvSomCounts" +) +s1$regexes <- regexes1 s1$list_files(max_files = 20) s1$list_files_filter_relevant(max_files = 300) d <- s1$download_files(max_files = 1000, dryrun = F) @@ -29,13 +43,17 @@ d_write <- s1$write( ) #---- S3 ----# -p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -p2 <- "202408270b93455e/L2401308_L2401307" +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +p2 <- "20240830ece6b0b7/L2201449_L2201450" p <- file.path(p1, p2) -SubjectID <- "SBJ05571" -SampleID_tumor <- "MDX240307" +SubjectID <- "SBJ02862" +SampleID_tumor <- "PRJ222112" +SampleID_normal <- "PRJ222114" prefix <- 
glue("{SubjectID}__{SampleID_tumor}") -s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +s1 <- Wf_sash$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) s1$list_files(max_files = 20) s1$list_files_filter_relevant() outdir <- sub("s3:/", "~/s3", p) @@ -56,9 +74,11 @@ d_write <- s1$write( \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} +\item{\code{SubjectID}}{The SubjectID of the sample.} -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} + +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
}} } @@ -67,6 +87,7 @@ d_write <- s1$write( \itemize{ \item \href{#method-Wf_sash-new}{\code{Wf_sash$new()}} \item \href{#method-Wf_sash-print}{\code{Wf_sash$print()}} +\item \href{#method-Wf_sash-read_smlvSomCounts}{\code{Wf_sash$read_smlvSomCounts()}} \item \href{#method-Wf_sash-read_pcgrJson}{\code{Wf_sash$read_pcgrJson()}} \item \href{#method-Wf_sash-read_hrdDragen}{\code{Wf_sash$read_hrdDragen()}} \item \href{#method-Wf_sash-read_hrdChord}{\code{Wf_sash$read_hrdChord()}} @@ -93,7 +114,12 @@ d_write <- s1$write( \subsection{Method \code{new()}}{ Create a new Wf_sash object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$new( + path = NULL, + SubjectID = NULL, + SampleID_tumor = NULL, + SampleID_normal = NULL +)}\if{html}{\out{
}}
}

\subsection{Arguments}{
@@ -102,9 +128,11 @@ Create a new Wf_sash object.
\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or
local filesystem).}

-\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).}
+\item{\code{SubjectID}}{The SubjectID of the sample.}
+
+\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.}

-\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).}
+\item{\code{SampleID_normal}}{The SampleID of the normal sample.}
}
\if{html}{\out{
}} } @@ -127,6 +155,23 @@ Print details about the Workflow. } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_smlvSomCounts}{}}} +\subsection{Method \code{read_smlvSomCounts()}}{ +Read \code{somatic.variant_counts_process.json} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_smlvSomCounts(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgrJson}{}}} \subsection{Method \code{read_pcgrJson()}}{ diff --git a/man/Wf_sash_download_tidy_write.Rd b/man/Wf_sash_download_tidy_write.Rd index a806fdd..ca0c538 100644 --- a/man/Wf_sash_download_tidy_write.Rd +++ b/man/Wf_sash_download_tidy_write.Rd @@ -8,10 +8,12 @@ Wf_sash_download_tidy_write( path, SubjectID, SampleID_tumor, + SampleID_normal, outdir, format = "rds", max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + regexes = NULL, dryrun = FALSE ) } @@ -19,9 +21,11 @@ Wf_sash_download_tidy_write( \item{path}{Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} +\item{SubjectID}{The SubjectID of the sample.} -\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} +\item{SampleID_tumor}{The SampleID of the tumor sample.} + +\item{SampleID_normal}{The SampleID of the normal sample.} \item{outdir}{Path to output directory.} @@ -31,6 +35,8 @@ local filesystem).} \item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} +\item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it.} + \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} } diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index f9e92e2..d19c3f6 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -28,14 +28,18 @@ d_write <- um1$write( ) #---- GDS ----# -SubjectID <- "SBJ03043" -SampleID_tumor <- "PRJ230004" +SubjectID <- "SBJ03606" +SampleID_tumor <- "PRJ230726" +SampleID_normal <- "PRJ230725" prefix <- glue("{SubjectID}__{SampleID_tumor}") p1_gds <- "gds://production/analysis_data" -p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- 
Sys.getenv("ICA_ACCESS_TOKEN") -um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um2 <- Wf_umccrise$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) um2$list_files(max_files = 8) um2$list_files_filter_relevant(ica_token = token, max_files = 500) d <- um2$download_files( @@ -58,9 +62,11 @@ d_write <- um2$write( \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} +\item{\code{SubjectID}}{The SubjectID of the sample.} -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} + +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
}} } @@ -95,7 +101,12 @@ d_write <- um2$write( \subsection{Method \code{new()}}{ Create a new Wf_umccrise object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$new( + path = NULL, + SubjectID = NULL, + SampleID_tumor = NULL, + SampleID_normal = NULL +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -104,9 +115,11 @@ Create a new Wf_umccrise object. \item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} +\item{\code{SubjectID}}{The SubjectID of the sample.} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
}} } From a30b4ff86ef259546a38e9dc819e9ed9f23a543e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 22 Oct 2024 18:45:58 +1100 Subject: [PATCH 10/32] sash: update regexes --- R/Wf.R | 2 + R/sash.R | 118 ++++++++++++++++++++++++++++++++++++--------------- R/umccrise.R | 4 +- 3 files changed, 87 insertions(+), 37 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 6fe882d..d6539c2 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -68,6 +68,8 @@ Wf <- R6::R6Class( .filesystem = NULL ), active = list( + #' @field regexes Get/Set regexes. Tibble with file `regex` and `fun`ction + #' to parse it. regexes = function(value) { if (missing(value)) { private$.regexes diff --git a/R/sash.R b/R/sash.R index 725d36c..c000da6 100644 --- a/R/sash.R +++ b/R/sash.R @@ -7,13 +7,27 @@ #' \dontrun{ #' #' #---- Local ----# -#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" -#' p2 <- "202408309698c304/L2300777_L2300776" +#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +#' p2 <- "20240830ece6b0b7/L2201449_L2201450" #' p <- normalizePath(file.path(p1, p2)) -#' SubjectID <- "SBJ03324" -#' SampleID_tumor <- "PRJ230432" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") -#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' SubjectID <- "SBJ02862" +#' SampleID_tumor <- "PRJ222112" +#' SampleID_normal <- "PRJ222114" +#' prefix <- glue("{SubjectID}_{SampleID_tumor}") +#' s1 <- Wf_sash$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) +#' #-- test regexes active binding +#' counts1 <- glue( +#' "{p}/{prefix}/smlv_somatic/report/", +#' "{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$" +#' ) +#' regexes1 <- tibble::tribble( +#' ~regex, ~fun, +#' counts1, "read_smlvSomCounts" +#' ) +#' s1$regexes <- regexes1 #' s1$list_files(max_files = 20) #' s1$list_files_filter_relevant(max_files = 300) #' d <- s1$download_files(max_files = 1000, dryrun = F) @@ 
-26,13 +40,17 @@ #' ) #' #' #---- S3 ----# -#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +#' p2 <- "20240830ece6b0b7/L2201449_L2201450" #' p <- file.path(p1, p2) -#' SubjectID <- "SBJ05571" -#' SampleID_tumor <- "MDX240307" +#' SubjectID <- "SBJ02862" +#' SampleID_tumor <- "PRJ222112" +#' SampleID_normal <- "PRJ222114" #' prefix <- glue("{SubjectID}__{SampleID_tumor}") -#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' s1 <- Wf_sash$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) #' s1$list_files(max_files = 20) #' s1$list_files_filter_relevant() #' outdir <- sub("s3:/", "~/s3", p) @@ -51,51 +69,74 @@ Wf_sash <- R6::R6Class( "Wf_sash", inherit = Wf, public = list( - #' @field SubjectID The SubjectID of the sample (needed for path lookup). - #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + #' @field SubjectID The SubjectID of the sample. + #' @field SampleID_tumor The SampleID of the tumor sample. + #' @field SampleID_normal The SampleID of the normal sample. SubjectID = NULL, SampleID_tumor = NULL, + SampleID_normal = NULL, #' @description Create a new Wf_sash object. #' @param path Path to directory with raw workflow results (from GDS, S3, or #' local filesystem). - #' @param SubjectID The SubjectID of the sample (needed for path lookup). - #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). - initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { + #' @param SubjectID The SubjectID of the sample. + #' @param SampleID_tumor The SampleID of the tumor sample. + #' @param SampleID_normal The SampleID of the tumor sample. 
+ initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL, + SampleID_normal = NULL) { wname <- "sash" pref <- glue("{SubjectID}_{SampleID_tumor}") crep <- "cancer_report/cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", - glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", - glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", - glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json" + glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", + glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", + glue("{path}/{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrdDragen", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", + glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgrJson", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + 
glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi"), "DOWNLOAD_ONLY", ) |> - dplyr::mutate(fun = paste0("read_", .data$fun)) + dplyr::mutate( + fun = paste0("read_", .data$fun), + fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor + self$SampleID_normal <- SampleID_normal }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "nregexes", as.character(nrow(private$.regexes)), "SubjectID", self$SubjectID, - "SampleID_tumor", self$SampleID_tumor + "SampleID_tumor", self$SampleID_tumor, + "SampleID_normal", self$SampleID_normal ) print(res) invisible(self) }, + #' @description Read `somatic.variant_counts_process.json` file. + #' @param x Path to file. + read_smlvSomCounts = function(x) { + dat <- jsonlite::read_json(x) |> + tibble::as_tibble_row() + tibble::tibble(name = "smlvsomcounts", data = list(dat[])) + }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. read_pcgrJson = function(x) { @@ -193,14 +234,17 @@ Wf_sash <- R6::R6Class( #' #' @param path Path to directory with raw workflow results (from GDS, S3, or #' local filesystem). -#' @param SubjectID The SubjectID of the sample (needed for path lookup). -#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param SubjectID The SubjectID of the sample. 
+#' @param SampleID_tumor The SampleID of the tumor sample. +#' @param SampleID_normal The SampleID of the normal sample. #' @param outdir Path to output directory. #' @param format Format of output files. #' @param max_files Max number of files to list. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). +#' @param regexes Tibble with file `regex` and `fun`ction to parse it. +#' #' @return List where each element is a tidy tibble of a sash file. #' #' @examples @@ -218,13 +262,17 @@ Wf_sash <- R6::R6Class( #' ) #' } #' @export -Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, +Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, SampleID_normal, outdir, format = "rds", max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - dryrun = FALSE) { + regexes = NULL, dryrun = FALSE) { s <- Wf_sash$new( - path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + path = path, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) + if (!is.null(regexes)) { + s$regexes <- regexes + } d_dl <- s$download_files( outdir = outdir, ica_token = ica_token, max_files = max_files, dryrun = dryrun diff --git a/R/umccrise.R b/R/umccrise.R index dc4b5d9..79aeb12 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -68,7 +68,7 @@ Wf_umccrise <- R6::R6Class( #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. - #' @field SampleID_normal The SampleID of the normal sample. + #' @param SampleID_normal The SampleID of the normal sample. 
initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL, SampleID_normal = NULL) { wname <- "umccrise" @@ -90,7 +90,7 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.somatic\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY" ) |> From ace1028461d205dd1dfae3ae0ce058e5790082cf Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 24 Oct 2024 09:20:05 +1100 Subject: [PATCH 11/32] fix regexes --- R/sash.R | 11 ++++++----- R/umccrise.R | 23 +++++++++++------------ man/Wf_sash.Rd | 5 ++--- man/Wf_sash_download_tidy_write.Rd | 3 ++- man/Wf_umccrise.Rd | 11 +++++------ 5 files changed, 26 insertions(+), 27 deletions(-) diff --git a/R/sash.R b/R/sash.R index c000da6..c73fb06 100644 --- a/R/sash.R +++ b/R/sash.R @@ -46,7 +46,6 @@ #' SubjectID <- "SBJ02862" #' SampleID_tumor <- "PRJ222112" #' SampleID_normal <- "PRJ222114" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' s1 <- Wf_sash$new( #' path = p, SubjectID = SubjectID, #' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal @@ -100,9 +99,9 @@ Wf_sash <- R6::R6Class( glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgrJson", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - 
glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi"), "DOWNLOAD_ONLY", + # glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", ) |> dplyr::mutate( fun = paste0("read_", .data$fun), @@ -243,7 +242,9 @@ Wf_sash <- R6::R6Class( #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param regexes Tibble with file `regex` and `fun`ction to parse it. +#' @param regexes Tibble with file `regex` and `fun`ction to parse it. Use only +#' if you want to override the default regexes used for this workflow. +#' #' #' @return List where each element is a tidy tibble of a sash file. 
#' diff --git a/R/umccrise.R b/R/umccrise.R index 79aeb12..1ffb223 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -25,12 +25,11 @@ #' ) #' #' #---- GDS ----# -#' SubjectID <- "SBJ03606" -#' SampleID_tumor <- "PRJ230726" -#' SampleID_normal <- "PRJ230725" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' SubjectID <- "SBJ04662" +#' SampleID_tumor <- "PRJ240647" +#' SampleID_normal <- "PRJ240646" #' p1_gds <- "gds://production/analysis_data" -#' p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") +#' p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' um2 <- Wf_umccrise$new( @@ -38,7 +37,7 @@ #' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal #' ) #' um2$list_files(max_files = 8) -#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 1000) #' d <- um2$download_files( #' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = F @@ -73,7 +72,6 @@ Wf_umccrise <- R6::R6Class( SampleID_tumor = NULL, SampleID_normal = NULL) { wname <- "umccrise" pref <- glue("{SubjectID}__{SampleID_tumor}") - pref_norm <- glue("{SubjectID}__{SampleID_normal}") crep <- "cancer_report_tables" smallv <- "small_variants" regexes <- tibble::tribble( @@ -91,8 +89,8 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY" + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), 
"DOWNLOAD_ONLY", + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", ) |> dplyr::mutate( fun = paste0("read_", .data$fun), @@ -108,9 +106,10 @@ Wf_umccrise <- R6::R6Class( print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "nregexes", as.character(nrow(private$.regexes)), "SubjectID", self$SubjectID, "SampleID_tumor", self$SampleID_tumor, "SampleID_normal", self$SampleID_normal diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 1ae526f..638c69e 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -27,8 +27,8 @@ counts1 <- glue( "{SampleID_tumor}\\\\.somatic\\\\.variant_counts_process\\\\.json$" ) regexes1 <- tibble::tribble( - ~regex, ~fun, - counts1, "read_smlvSomCounts" + ~regex, ~fun, + counts1, "read_smlvSomCounts" ) s1$regexes <- regexes1 s1$list_files(max_files = 20) @@ -49,7 +49,6 @@ p <- file.path(p1, p2) SubjectID <- "SBJ02862" SampleID_tumor <- "PRJ222112" SampleID_normal <- "PRJ222114" -prefix <- glue("{SubjectID}__{SampleID_tumor}") s1 <- Wf_sash$new( path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal diff --git a/man/Wf_sash_download_tidy_write.Rd b/man/Wf_sash_download_tidy_write.Rd index ca0c538..2a3260e 100644 --- a/man/Wf_sash_download_tidy_write.Rd +++ b/man/Wf_sash_download_tidy_write.Rd @@ -35,7 +35,8 @@ local filesystem).} \item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it.} +\item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it. 
Use only +if you want to override the default regexes used for this workflow.} \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index d19c3f6..a43d7ad 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -28,12 +28,11 @@ d_write <- um1$write( ) #---- GDS ----# -SubjectID <- "SBJ03606" -SampleID_tumor <- "PRJ230726" -SampleID_normal <- "PRJ230725" -prefix <- glue("{SubjectID}__{SampleID_tumor}") +SubjectID <- "SBJ04662" +SampleID_tumor <- "PRJ240647" +SampleID_normal <- "PRJ240646" p1_gds <- "gds://production/analysis_data" -p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") +p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- Sys.getenv("ICA_ACCESS_TOKEN") um2 <- Wf_umccrise$new( @@ -41,7 +40,7 @@ um2 <- Wf_umccrise$new( SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) um2$list_files(max_files = 8) -um2$list_files_filter_relevant(ica_token = token, max_files = 500) +um2$list_files_filter_relevant(ica_token = token, max_files = 1000) d <- um2$download_files( outdir = outdir, ica_token = token, max_files = 1000, dryrun = F From d9e0672ffd4b3b0e1d0bdece66e532436f907ac5 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 24 Oct 2024 15:07:37 +1100 Subject: [PATCH 12/32] DOWNLOAD_ONLY tidy: just return input path --- R/Wf.R | 5 +++++ R/tidy.R | 10 ++++++++-- man/Wf.Rd | 18 ++++++++++++++++++ man/Wf_dragen.Rd | 3 ++- man/Wf_sash.Rd | 3 ++- man/Wf_tso_ctdna_tumor_only.Rd | 3 ++- man/Wf_tso_ctdna_tumor_only_v2.Rd | 3 ++- man/Wf_umccrise.Rd | 3 ++- man/tidy_files.Rd | 6 +++++- 9 files changed, 46 insertions(+), 8 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index d6539c2..84d4644 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -175,6 +175,11 @@ Wf <- R6::R6Class( } d }, + #' @description For DOWNLOAD_ONLY files, just return the input path. 
+ #' @param x Path with raw results. + DOWNLOAD_ONLY = function(x) { + tibble::tibble(name = glue("DOWNLOAD_ONLY"), data = list(tibble::tibble(input_path = x))) + }, #' @description Download files from GDS/S3 to local filesystem. #' @param path Path with raw results. #' @param outdir Path to output directory. diff --git a/R/tidy.R b/R/tidy.R index a8c4780..1130066 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -14,11 +14,17 @@ #' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" #' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" #' p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz") +#' p_dl <- file.path( +#' p1, "L2101566__L2101565/SBJ01155__PRJ211091/small_variants", +#' "SBJ01155__PRJ211091-somatic-PASS.vcf.gz" +#' ) #' fun <- function(x) { #' d <- readr::read_tsv(x) #' tibble::tibble(name = "table1", data = list(d[])) #' } -#' x <- tibble::tibble(type = "fun", localpath = p) +#' x <- tibble::tibble( +#' type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) +#' ) #' tidy_files(x) #' } #' @@ -27,7 +33,7 @@ tidy_files <- function(x, envir = parent.frame()) { assertthat::assert_that(is.data.frame(x)) assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) x |> - dplyr::filter(.data$type != "DOWNLOAD_ONLY") |> + # dplyr::filter(.data$type != "DOWNLOAD_ONLY") |> dplyr::rowwise() |> dplyr::mutate( data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) diff --git a/man/Wf.Rd b/man/Wf.Rd index 7e98ebf..5dff941 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -83,6 +83,7 @@ to parse it.} \item \href{#method-Wf-print}{\code{Wf$print()}} \item \href{#method-Wf-list_files}{\code{Wf$list_files()}} \item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} +\item \href{#method-Wf-DOWNLOAD_ONLY}{\code{Wf$DOWNLOAD_ONLY()}} \item \href{#method-Wf-download_files}{\code{Wf$download_files()}} \item \href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}} 
\item \href{#method-Wf-write}{\code{Wf$write()}} @@ -185,6 +186,23 @@ the \code{s3_list_files_filter_relevant} function.} } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-DOWNLOAD_ONLY}{}}} +\subsection{Method \code{DOWNLOAD_ONLY()}}{ +For DOWNLOAD_ONLY files, just return the input path. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$DOWNLOAD_ONLY(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path with raw results.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} \subsection{Method \code{download_files()}}{ diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 141e855..66b91c6 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -65,8 +65,9 @@ d_write <- t1$write( } } \if{html}{\out{ -
Inherited methods +
Inherited methods
    +
  • dracarys::Wf$DOWNLOAD_ONLY()
  • dracarys::Wf$download_files()
  • dracarys::Wf$list_files()
  • dracarys::Wf$list_files_filter_relevant()
  • diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 638c69e..d3a8f66 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -97,8 +97,9 @@ d_write <- s1$write( } } \if{html}{\out{ -
    Inherited methods +
    Inherited methods
      +
    • dracarys::Wf$DOWNLOAD_ONLY()
    • dracarys::Wf$download_files()
    • dracarys::Wf$list_files()
    • dracarys::Wf$list_files_filter_relevant()
    • diff --git a/man/Wf_tso_ctdna_tumor_only.Rd b/man/Wf_tso_ctdna_tumor_only.Rd index aa37b47..a58245e 100644 --- a/man/Wf_tso_ctdna_tumor_only.Rd +++ b/man/Wf_tso_ctdna_tumor_only.Rd @@ -82,8 +82,9 @@ d_write <- t2$write( } } \if{html}{\out{ -
      Inherited methods +
      Inherited methods
        +
      • dracarys::Wf$DOWNLOAD_ONLY()
      • dracarys::Wf$download_files()
      • dracarys::Wf$list_files()
      • dracarys::Wf$list_files_filter_relevant()
      • diff --git a/man/Wf_tso_ctdna_tumor_only_v2.Rd b/man/Wf_tso_ctdna_tumor_only_v2.Rd index fb84a29..148aca0 100644 --- a/man/Wf_tso_ctdna_tumor_only_v2.Rd +++ b/man/Wf_tso_ctdna_tumor_only_v2.Rd @@ -91,8 +91,9 @@ d_write <- t2$write( } } \if{html}{\out{ -
        Inherited methods +
        Inherited methods
          +
        • dracarys::Wf$DOWNLOAD_ONLY()
        • dracarys::Wf$download_files()
        • dracarys::Wf$list_files()
        • dracarys::Wf$list_files_filter_relevant()
        • diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index a43d7ad..7b93d21 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -84,8 +84,9 @@ d_write <- um2$write( } } \if{html}{\out{ -
          Inherited methods +
          Inherited methods
            +
          • dracarys::Wf$DOWNLOAD_ONLY()
          • dracarys::Wf$download_files()
          • dracarys::Wf$list_files()
          • dracarys::Wf$list_files_filter_relevant()
          • diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index caff8d2..75f3625 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -25,11 +25,15 @@ Tidies files into a tibble with parsed data. p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz") +p_dl <- file.path( + p1, "L2101566__L2101565/SBJ01155__PRJ211091/small_variants", + "SBJ01155__PRJ211091-somatic-PASS.vcf.gz") fun <- function(x) { d <- readr::read_tsv(x) tibble::tibble(name = "table1", data = list(d[])) } -x <- tibble::tibble(type = "fun", localpath = p) +x <- tibble::tibble( + type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl)) tidy_files(x) } From ee0ec621e94db6411019cfc61a4cac4d7eca49ca Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 24 Oct 2024 16:12:46 +1100 Subject: [PATCH 13/32] better handling of DOWNLOAD_ONLY --- R/Wf.R | 12 ++++++-- R/tidy.R | 1 - R/umccrise.R | 38 ++++++++++++++++---------- man/Wf_umccrise.Rd | 13 +++++---- man/Wf_umccrise_download_tidy_write.Rd | 21 ++++++++------ man/tidy_files.Rd | 6 ++-- 6 files changed, 58 insertions(+), 33 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 84d4644..e18d908 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -238,8 +238,16 @@ Wf <- R6::R6Class( d_write <- x |> dplyr::rowwise() |> dplyr::mutate( - p = glue("{prefix}_{.data$name}"), - out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)) + p = ifelse( + .data$name != "DOWNLOAD_ONLY", + as.character(glue("{prefix}_{.data$name}")), + as.character(.data$data |> dplyr::pull("input_path")) + ), + out = ifelse( + .data$name != "DOWNLOAD_ONLY", + list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)), + list(.data$data) + ) ) |> dplyr::ungroup() |> dplyr::select("name", "data", prefix = "p") diff --git a/R/tidy.R b/R/tidy.R index 1130066..9ca0496 100644 --- 
a/R/tidy.R +++ b/R/tidy.R @@ -33,7 +33,6 @@ tidy_files <- function(x, envir = parent.frame()) { assertthat::assert_that(is.data.frame(x)) assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) x |> - # dplyr::filter(.data$type != "DOWNLOAD_ONLY") |> dplyr::rowwise() |> dplyr::mutate( data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) diff --git a/R/umccrise.R b/R/umccrise.R index 1ffb223..743c58d 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -7,12 +7,15 @@ #' \dontrun{ #' #' #---- LOCAL ----# -#' SubjectID <- "SBJ03043" -#' SampleID_tumor <- "PRJ230004" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' SubjectID <- "SBJ04662" +#' SampleID_tumor <- "PRJ240647" +#' SampleID_normal <- "PRJ240646" #' p1_local <- "~/icav1/g/production/analysis_data" -#' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") -#' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' p <- file.path(normalizePath(p1_local), "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") +#' um1 <- Wf_umccrise$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) #' um1$list_files(max_files = 10) #' um1$list_files_filter_relevant() #' d <- um1$download_files(max_files = 1000, dryrun = F) @@ -245,37 +248,42 @@ Wf_umccrise <- R6::R6Class( #' #' @param path Path to directory with raw workflow results (from GDS, S3, or #' local filesystem). -#' @param SubjectID The SubjectID of the sample (needed for path lookup). -#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param SubjectID The SubjectID of the sample. +#' @param SampleID_tumor The SampleID of the tumor sample. +#' @param SampleID_normal The SampleID of the normal sample. #' @param outdir Path to output directory. #' @param format Format of output files. #' @param max_files Max number of files to list. 
#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @return List where each element is a tidy tibble of a umccrise file. +#' @return Tibble of tidy data as list-cols. #' #' @examples #' \dontrun{ -#' SubjectID <- "SBJ03043" -#' SampleID_tumor <- "PRJ230004" -#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") -#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +#' SubjectID <- "SBJ04662" +#' SampleID_tumor <- "PRJ240647" +#' SampleID_normal <- "PRJ240646" +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' d <- Wf_umccrise_download_tidy_write( -#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal, #' outdir = outdir, #' dryrun = F #' ) #' } #' @export -Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor, +Wf_umccrise_download_tidy_write <- function(path, SubjectID, + SampleID_tumor, SampleID_normal, outdir, format = "rds", max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), dryrun = FALSE) { um <- Wf_umccrise$new( - path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + path = path, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) d_dl <- um$download_files( outdir = outdir, ica_token = ica_token, diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 7b93d21..de10726 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -10,12 +10,15 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow \dontrun{ #---- LOCAL ----# -SubjectID <- "SBJ03043" -SampleID_tumor <- "PRJ230004" -prefix <- 
glue("{SubjectID}__{SampleID_tumor}") +SubjectID <- "SBJ04662" +SampleID_tumor <- "PRJ240647" +SampleID_normal <- "PRJ240646" p1_local <- "~/icav1/g/production/analysis_data" -p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") -um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +p <- file.path(normalizePath(p1_local), "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") +um1 <- Wf_umccrise$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) um1$list_files(max_files = 10) um1$list_files_filter_relevant() d <- um1$download_files(max_files = 1000, dryrun = F) diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd index 05ca0bf..99eeacc 100644 --- a/man/Wf_umccrise_download_tidy_write.Rd +++ b/man/Wf_umccrise_download_tidy_write.Rd @@ -8,6 +8,7 @@ Wf_umccrise_download_tidy_write( path, SubjectID, SampleID_tumor, + SampleID_normal, outdir, format = "rds", max_files = 1000, @@ -19,9 +20,11 @@ Wf_umccrise_download_tidy_write( \item{path}{Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} +\item{SubjectID}{The SubjectID of the sample.} -\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} +\item{SampleID_tumor}{The SampleID of the tumor sample.} + +\item{SampleID_normal}{The SampleID of the normal sample.} \item{outdir}{Path to output directory.} @@ -35,21 +38,23 @@ local filesystem).} download them).} } \value{ -List where each element is a tidy tibble of a umccrise file. +Tibble of tidy data as list-cols. } \description{ Downloads files from the \code{umccrise} workflow and writes them in a tidy format. 
} \examples{ \dontrun{ -SubjectID <- "SBJ03043" -SampleID_tumor <- "PRJ230004" -p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") -p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +SubjectID <- "SBJ04662" +SampleID_tumor <- "PRJ240647" +SampleID_normal <- "PRJ240646" +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- Sys.getenv("ICA_ACCESS_TOKEN") d <- Wf_umccrise_download_tidy_write( - path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal, outdir = outdir, dryrun = F ) diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index 75f3625..c13cffd 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -27,13 +27,15 @@ p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz") p_dl <- file.path( p1, "L2101566__L2101565/SBJ01155__PRJ211091/small_variants", - "SBJ01155__PRJ211091-somatic-PASS.vcf.gz") + "SBJ01155__PRJ211091-somatic-PASS.vcf.gz" +) fun <- function(x) { d <- readr::read_tsv(x) tibble::tibble(name = "table1", data = list(d[])) } x <- tibble::tibble( - type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl)) + type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) +) tidy_files(x) } From a283e13fdfdd07507657712d1a185fa8ce4e4b80 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 26 Oct 2024 18:42:29 +1100 Subject: [PATCH 14/32] better handling of DOWNLOAD_ONLY --- R/Wf.R | 11 +++++++---- R/tidy.R | 21 +++++++++++++++++++-- R/umccrise.R | 36 ++++++++++++++++-------------------- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index e18d908..ed69d88 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -177,8 +177,11 @@ Wf <- R6::R6Class( }, #' @description For DOWNLOAD_ONLY files, just return 
the input path. #' @param x Path with raw results. - DOWNLOAD_ONLY = function(x) { - tibble::tibble(name = glue("DOWNLOAD_ONLY"), data = list(tibble::tibble(input_path = x))) + DOWNLOAD_ONLY = function(x, suffix = "") { + tibble::tibble( + name = glue("DOWNLOAD_ONLY{suffix}"), + data = list(tibble::tibble(input_path = x)) + ) }, #' @description Download files from GDS/S3 to local filesystem. #' @param path Path with raw results. @@ -239,12 +242,12 @@ Wf <- R6::R6Class( dplyr::rowwise() |> dplyr::mutate( p = ifelse( - .data$name != "DOWNLOAD_ONLY", + !grepl("DOWNLOAD_ONLY", .data$name), as.character(glue("{prefix}_{.data$name}")), as.character(.data$data |> dplyr::pull("input_path")) ), out = ifelse( - .data$name != "DOWNLOAD_ONLY", + !grepl("DOWNLOAD_ONLY", .data$name), list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)), list(.data$data) ) diff --git a/R/tidy.R b/R/tidy.R index 9ca0496..cb5e2f0 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -23,7 +23,7 @@ #' tibble::tibble(name = "table1", data = list(d[])) #' } #' x <- tibble::tibble( -#' type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) +#' type = c("fun", "DOWNLOAD_ONLY_foobar"), localpath = c(p, p_dl) #' ) #' tidy_files(x) #' } @@ -32,10 +32,27 @@ tidy_files <- function(x, envir = parent.frame()) { assertthat::assert_that(is.data.frame(x)) assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) + # if there's a DOWNLOAD_ONLY_suffix, extract that suffix and call + # the DOWNLOAD_ONLY function + extract_download_suffix <- function(s) { + sub("DOWNLOAD_ONLY(.*)", "\\1", s) + } x |> dplyr::rowwise() |> dplyr::mutate( - data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) + data = ifelse( + !grepl("DOWNLOAD_ONLY", .data$type), + list( + dr_func_eval( + f = .data$type, v = .data$type, envir = envir + )(.data$localpath) + ), + list( + dr_func_eval( + f = "DOWNLOAD_ONLY", v = "DOWNLOAD_ONLY", envir = envir + )(.data$localpath, 
extract_download_suffix(.data$type)) + ) + ) ) |> dplyr::ungroup() |> dplyr::select("data") |> diff --git a/R/umccrise.R b/R/umccrise.R index 743c58d..4b6ed45 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -79,26 +79,22 @@ Wf_umccrise <- R6::R6Class( smallv <- "small_variants" regexes <- tibble::tribble( ~regex, ~fun, - glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", - glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", - glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpair", - glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson", - glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - ) |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - ) + glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "read_hrdChord", + glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "read_hrdHrdetect", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), 
"read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", + glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "read_conpair", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "read_pcgrJson", + glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smallvpassvcf", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smallvpassvcfi", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi" + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor From e773307dc8e9142ed22e8ebbe1e0bd2c9a444c3e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 1 Nov 2024 23:57:20 +1100 Subject: [PATCH 15/32] fix regexes --- R/sash.R | 29 +++++++++++++++-------------- R/umccrise.R | 2 +- man/Wf.Rd | 2 +- man/tidy_files.Rd | 2 +- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/R/sash.R b/R/sash.R index c73fb06..f1d94b6 100644 --- a/R/sash.R +++ b/R/sash.R @@ -87,21 +87,22 @@ Wf_sash <- R6::R6Class( crep <- "cancer_report/cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", - glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", - glue("{path}/{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrdDragen", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), 
"sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", - glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgrJson", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "read_hrdChord", + glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "read_hrdHrdetect", + glue("{path}/{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "read_hrdDragen", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", + glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "read_pcgrJson", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-pcgrvcf", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), 
"DOWNLOAD_ONLY-pcgrvcfi", # glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi", ) |> dplyr::mutate( fun = paste0("read_", .data$fun), diff --git a/R/umccrise.R b/R/umccrise.R index 4b6ed45..ba7b745 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -87,11 +87,11 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "read_conpair", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "read_pcgrJson", glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smallvpassvcf", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smallvpassvcfi", - glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi" ) diff --git a/man/Wf.Rd b/man/Wf.Rd index 5dff941..f8e623d 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -191,7 +191,7 @@ the 
\code{s3_list_files_filter_relevant} function.} \subsection{Method \code{DOWNLOAD_ONLY()}}{ For DOWNLOAD_ONLY files, just return the input path. \subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{Wf$DOWNLOAD_ONLY(x)}\if{html}{\out{
            }} +\if{html}{\out{
            }}\preformatted{Wf$DOWNLOAD_ONLY(x, suffix = "")}\if{html}{\out{
            }} } \subsection{Arguments}{ diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index c13cffd..3770191 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -34,7 +34,7 @@ fun <- function(x) { tibble::tibble(name = "table1", data = list(d[])) } x <- tibble::tibble( - type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) + type = c("fun", "DOWNLOAD_ONLY_foobar"), localpath = c(p, p_dl) ) tidy_files(x) } From 1e569e60109b875387a577058f91d973a6bac6d3 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 5 Nov 2024 23:58:21 +1100 Subject: [PATCH 16/32] add smlv filt vcf --- R/fs_s3.R | 2 +- R/sash.R | 11 ++++------- R/umccrise.R | 9 +++++---- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/R/fs_s3.R b/R/fs_s3.R index f2e3226..4609eac 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -32,7 +32,7 @@ s3_list_files_dir <- function(s3dir, max_objects = 1000) { purrr::map(\(x) tibble::tibble( Key = x[["Key"]], Size = x[["Size"]], - lastmodified = x[["LastModified"]] + lastmodified = as.character(x[["LastModified"]]) )) |> dplyr::bind_rows() |> dplyr::mutate( diff --git a/R/sash.R b/R/sash.R index f1d94b6..91260b4 100644 --- a/R/sash.R +++ b/R/sash.R @@ -96,19 +96,16 @@ Wf_sash <- R6::R6Class( glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/{pref}/smlv_somatic/filter/{SampleID_tumor}\\.pass\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smlvfiltvcf", + glue("{path}/{pref}/smlv_somatic/filter/{SampleID_tumor}\\.pass\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smlvfiltvcfi", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "read_pcgrJson", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", 
glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-pcgrvcf", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-pcgrvcfi", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", # glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi", - ) |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - ) - + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor diff --git a/R/umccrise.R b/R/umccrise.R index ba7b745..21894d3 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -76,7 +76,6 @@ Wf_umccrise <- R6::R6Class( wname <- "umccrise" pref <- glue("{SubjectID}__{SampleID_tumor}") crep <- "cancer_report_tables" - smallv <- "small_variants" regexes <- tibble::tribble( ~regex, ~fun, glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "read_hrdChord", @@ -88,10 +87,12 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "read_conpair", glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/{pref}/small_variants/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smlvfiltvcf", + glue("{path}/{pref}/small_variants/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smlvfiltvcfi", 
glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "read_pcgrJson", - glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smallvpassvcf", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smallvpassvcfi", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.pass\\.vcf\\.gz$"), "DOWNLOAD_ONLY-pcgrvcf", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.pass\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-pcgrvcfi", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi" ) From 66b8e8243964e90d12483b61713f555d9d80729c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Nov 2024 19:20:35 +1100 Subject: [PATCH 17/32] dragen swf: remove trailing slash --- R/Wf.R | 4 +++- R/fs_icav1.R | 7 ++++++ R/fs_s3.R | 7 ++++++ R/tso_dragen.R | 6 ++--- R/tsov2.R | 40 ++++++++++++++----------------- man/Wf.Rd | 2 ++ man/Wf_tso_ctdna_tumor_only_v2.Rd | 2 +- 7 files changed, 41 insertions(+), 27 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index ed69d88..b6ff9c3 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -106,7 +106,8 @@ Wf <- R6::R6Class( ) subwnames <- c("dragen") assertthat::assert_that(wname %in% c(wnames, subwnames)) - private$.path <- sub("/$", "", path) # remove potential trailing slash + path <- sub("/$", "", path) # remove potential trailing slash + private$.path <- path private$.wname <- wname private$.filesystem <- dplyr::case_when( grepl("^gds://", path) ~ "gds", @@ -177,6 +178,7 @@ Wf <- R6::R6Class( }, #' @description For DOWNLOAD_ONLY files, just return the input path. #' @param x Path with raw results. + #' @param suffix Suffix. 
DOWNLOAD_ONLY = function(x, suffix = "") { tibble::tibble( name = glue("DOWNLOAD_ONLY{suffix}"), diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 56c6ad4..5aef0bf 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -179,6 +179,12 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN no_recurse = FALSE, page_token = NULL, recursive = recursive ) + msg <- glue( + "GDS input path is: {gdsdir}", + "\nNo relevant files found under there.", + "\nPlease check that path with `ica files list`, and try to adjust page size." + ) + assertthat::assert_that(nrow(d) > 0, msg = msg) d <- d |> dplyr::mutate( gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), @@ -206,6 +212,7 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN ), localpath = normalizePath(.data$localpath) ) |> + dplyr::ungroup() |> dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") return(res) } else { diff --git a/R/fs_s3.R b/R/fs_s3.R index 4609eac..a8a4da9 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -132,6 +132,12 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, s3dir = s3dir, pattern = NULL, regexes = regexes, max_objects = max_objects, presign = FALSE ) + msg <- glue( + "S3 input path is: {s3dir}", + "\nNo relevant files found under there.", + "\nPlease check that path with `aws s3 ls`, and try to adjust page size." 
+ ) + assertthat::assert_that(nrow(d) > 0, msg = msg) d <- d |> dplyr::mutate( s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), @@ -163,6 +169,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, ), localpath = normalizePath(.data$localpath) ) |> + dplyr::ungroup() |> dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") return(res) } else { diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 6df67d9..897c45e 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -86,9 +86,9 @@ Wf_dragen <- R6::R6Class( print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, "prefix", self$prefix ) print(res) diff --git a/R/tsov2.R b/R/tsov2.R index 1a08dd9..fbfe581 100644 --- a/R/tsov2.R +++ b/R/tsov2.R @@ -47,7 +47,7 @@ #' d <- t2$download_files( #' outdir = outdir, #' max_files = 500, -#' dryrun = F +#' dryrun = FALSE #' ) #' d_tidy <- t2$tidy_files(d) #' d_write <- t2$write( @@ -76,29 +76,25 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( res <- glue("Results/{pref}") li <- "Logs_Intermediates" dc <- glue("{li}/DragenCaller/{pref}") + path <- sub("/$", "", path) # remove potential trailing slash self$dragenObj <- Wf_dragen$new(path = file.path(path, dc), prefix = glue("{dc}/{prefix}")) # Results - reg1 <- tibble::tribble( + regexes <- tibble::tribble( ~regex, ~fun, - glue("{res}/{pref}\\.cnv\\.vcf\\.gz$"), "cnv", - glue("{res}/{pref}\\.cnv\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{res}/{pref}\\.exon_cov_report\\.tsv$"), "cvgrepe", - glue("{res}/{pref}\\.gene_cov_report\\.tsv$"), "cvgrepg", - glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz$"), "hardfilt", - glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{res}/{pref}\\.cnv\\.vcf\\.gz$"), "read_cnv", + glue("{res}/{pref}\\.cnv\\.vcf\\.gz\\.tbi$"), 
"DOWNLOAD_ONLY-cnvvcfi", + glue("{res}/{pref}\\.exon_cov_report\\.tsv$"), "read_cvgrepe", + glue("{res}/{pref}\\.gene_cov_report\\.tsv$"), "read_cvgrepg", + glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz$"), "read_hardfilt", + glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-hardfiltvcfi", # glue("{res}/{pref}\\.microsat_output\\.json$"), "msi", # in DragenCaller - glue("{res}/{pref}\\.tmb.trace\\.tsv$"), "tmbt", - glue("{res}/{pref}_CombinedVariantOutput\\.tsv$"), "cvo", - glue("{res}/{pref}_Fusions\\.csv$"), "fus", - glue("{res}/{pref}_MetricsOutput\\.tsv$"), "DOWNLOAD_ONLY", - # glue("{res}/{pref}_SmallVariants_Annotated\\.json\\.gz$"), "DOWNLOAD_ONLY", - glue("{li}/SampleAnalysisResults/{pref}_SampleAnalysisResults\\.json$"), "sar" + glue("{res}/{pref}\\.tmb.trace\\.tsv$"), "read_tmbt", + glue("{res}/{pref}_CombinedVariantOutput\\.tsv$"), "read_cvo", + glue("{res}/{pref}_Fusions\\.csv$"), "read_fus", + glue("{res}/{pref}_MetricsOutput\\.tsv$"), "DOWNLOAD_ONLY-metricsoutput", + # glue("{res}/{pref}_SmallVariants_Annotated\\.json\\.gz$"), "DOWNLOAD_ONLY-smallvannjson", + glue("{li}/SampleAnalysisResults/{pref}_SampleAnalysisResults\\.json$"), "read_sar" ) - regexes <- reg1 |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - ) super$initialize(path = path, wname = wname, regexes = regexes) self$prefix <- prefix }, @@ -107,9 +103,9 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, "prefix", self$prefix ) print(res) diff --git a/man/Wf.Rd b/man/Wf.Rd index f8e623d..d6d30c7 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -198,6 +198,8 @@ For DOWNLOAD_ONLY files, just return the input path. \if{html}{\out{
            }} \describe{ \item{\code{x}}{Path with raw results.} + +\item{\code{suffix}}{Suffix.} } \if{html}{\out{
            }} } diff --git a/man/Wf_tso_ctdna_tumor_only_v2.Rd b/man/Wf_tso_ctdna_tumor_only_v2.Rd index 148aca0..a8513e4 100644 --- a/man/Wf_tso_ctdna_tumor_only_v2.Rd +++ b/man/Wf_tso_ctdna_tumor_only_v2.Rd @@ -50,7 +50,7 @@ t2$list_files_filter_relevant(max_files = 500) d <- t2$download_files( outdir = outdir, max_files = 500, - dryrun = F + dryrun = FALSE ) d_tidy <- t2$tidy_files(d) d_write <- t2$write( From 6e1a4f99aca0dd6254207f7ee48b31641fddedc8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 9 Nov 2024 13:27:27 +1100 Subject: [PATCH 18/32] normalize outputdir --- R/fs_icav1.R | 6 +++--- R/fs_s3.R | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 5aef0bf..803aa7d 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -188,9 +188,9 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN d <- d |> dplyr::mutate( gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), - gdspath_minus_gdsdir_outdir = fs::dir_create( - file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) - ), + gdspath_minus_gdsdir_outdir = file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) |> + fs::dir_create() |> + normalizePath(), localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), gdspath = .data$path ) |> diff --git a/R/fs_s3.R b/R/fs_s3.R index a8a4da9..d67a109 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -141,9 +141,9 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, d <- d |> dplyr::mutate( s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), - s3path_minus_s3dir_outdir = fs::dir_create( - file.path(outdir, dirname(.data$s3path_minus_s3dir)) - ), + s3path_minus_s3dir_outdir = file.path(outdir, dirname(.data$s3path_minus_s3dir)) |> + fs::dir_create() |> + normalizePath(), localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname), s3path = .data$path ) |> From c3746ba33dce945c311058c240c6e159c5855e3b Mon Sep 17 00:00:00 2001 From: 
pdiakumis Date: Sat, 9 Nov 2024 15:15:45 +1100 Subject: [PATCH 19/32] dragen: fix cnv/mapping metrics for wgs tn --- R/dragen.R | 39 +++++++++++++++++++++++++++------------ man/Wf_dragen.Rd | 19 ++++++++++++++++++- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index b198cb8..907f776 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -209,7 +209,8 @@ dragen_gc_metrics_read <- function(x) { #' @export dragen_cnv_metrics_read <- function(x) { d0 <- readr::read_lines(x) - assertthat::assert_that(grepl("SEX GENOTYPER", d0[1])) + # first row is sometimes SEX GENOTYPER, others CNV SUMMARY + assertthat::assert_that(grepl("CNV SUMMARY", d0[2])) abbrev_nm <- c( "Bases in reference genome" = "bases_in_ref_genome", "Average alignment coverage over genome" = "cov_alignment_avg_over_genome", @@ -227,7 +228,10 @@ dragen_cnv_metrics_read <- function(x) { "Number of amplifications" = "n_amp", "Number of deletions" = "n_del", "Number of passing amplifications" = "n_amp_pass", - "Number of passing deletions" = "n_del_pass" + "Number of passing deletions" = "n_del_pass", + "Estimated tumor purity" = "purity_tumor", + "Diploid coverage" = "cov_diploid", + "Overall ploidy" = "ploidy_overall" ) d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> @@ -236,10 +240,10 @@ dragen_cnv_metrics_read <- function(x) { names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start" ) + # in cttso sexgt <- d1 |> dplyr::filter(.data$category == "SEX GENOTYPER") |> dplyr::select(sexgt = "count", sexgt_pct = "pct") - d2 <- d1 |> dplyr::filter(!.data$category == "SEX GENOTYPER") |> dplyr::mutate( @@ -258,8 +262,10 @@ dragen_cnv_metrics_read <- function(x) { ) |> dplyr::select("var", "value") |> tidyr::pivot_wider(names_from = "var", values_from = "value") - res <- dplyr::bind_cols(sexgt, d2) - return(res) + if (nrow(sexgt) == 0) { + return(d2) + } + dplyr::bind_cols(sexgt, d2) } #' Read DRAGEN SV Metrics @@ -485,6 +491,7 @@ 
dragen_vc_metrics_read <- function(x) { dragen_mapping_metrics_read <- function(x) { abbrev_nm <- c( "Total input reads" = "reads_tot_input", + "Total reads removed by downsampling" = "reads_removed_downsamp", "Number of duplicate marked reads" = "reads_num_dupmarked", "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed", "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq", @@ -499,6 +506,7 @@ dragen_mapping_metrics_read <- function(x) { "Unmapped reads" = "reads_unmapped", "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt", "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj", + "Adjustment of reads matching exclude contigs" = "reads_match_excl_contigs", "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton", "Paired reads (itself & mate mapped)" = "reads_paired", "Properly paired reads" = "reads_paired_proper", @@ -514,6 +522,7 @@ dragen_mapping_metrics_read <- function(x) { "Reads with indel R1" = "reads_indel_r1", "Reads with indel R2" = "reads_indel_r2", "Total bases" = "bases_tot", + "Total bases removed by downsampling" = "bases_removed_downsamp", "Total bases R1" = "bases_tot_r1", "Total bases R2" = "bases_tot_r2", "Mapped bases" = "bases_mapped", @@ -555,17 +564,21 @@ dragen_mapping_metrics_read <- function(x) { "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj", "Reads with splice junction" = "reads_splicejunc", "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", - "Filtered rRNA reads" = "reads_rrna_filtered" + "Filtered rRNA reads" = "reads_rrna_filtered", + "Mitochondrial reads excluded" = "reads_mito_excl" ) d0 <- readr::read_lines(x) assertthat::assert_that(grepl("MAPPING/ALIGNING", d0[1])) - # split by RG and non-RG - # tidy + # File is separated into two sections, the SUMMARY and the PER RG. 
+ # Based on what I've seen so far, we can have single samples (where + # the first column just has MAPPING/ALIGNING) or TUMOR/NORMAL samples (where + # the first column will have a TUMOR or NORMAL prefix). + reg1 <- paste0("MAPPING/ALIGNING ", c("SUMMARY", "PER RG"), collapse = "|") d <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", - names = c("category", "RG", "var", "count", "pct"), + names = c("dragen_sample", "RG", "var", "count", "pct"), delim = ",", too_few = "align_start" ) |> dplyr::mutate( @@ -573,9 +586,11 @@ dragen_mapping_metrics_read <- function(x) { count = as.numeric(.data$count), pct = as.numeric(.data$pct), var = dplyr::recode(.data$var, !!!abbrev_nm), - RG = dplyr::if_else(.data$RG == "", "Total", .data$RG) + RG = dplyr::if_else(.data$RG == "", "Total", .data$RG), + dragen_sample = sub(reg1, "", .data$dragen_sample) |> trimws(), + dragen_sample = dplyr::if_else(.data$dragen_sample == "", "SINGLE", .data$dragen_sample) ) |> - dplyr::select("RG", "var", "count", "pct") + dplyr::select("dragen_sample", "RG", "var", "count", "pct") dirty_names_cleaned(unique(d$var), abbrev_nm, x) # pivot d |> @@ -584,7 +599,7 @@ dragen_mapping_metrics_read <- function(x) { name = dplyr::if_else(.data$name == "count", "", "_pct"), var = glue("{.data$var}{.data$name}") ) |> - dplyr::select("RG", "var", "value") |> + dplyr::select("dragen_sample", "RG", "var", "value") |> dplyr::filter(!is.na(.data$value)) |> tidyr::pivot_wider(names_from = "var", values_from = "value") } diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 66b91c6..f3465e6 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -19,7 +19,24 @@ p <- file.path( d1 <- Wf_dragen$new(path = p, prefix = prefix) d1$list_files(max_files = 100) d1$list_files_filter_relevant(max_files = 300) -d <- d1$download_files(max_files = 100, dryrun = F) +d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +d_tidy <- d1$tidy_files(d) +d_write <- t1$write( + 
d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = prefix, + format = "tsv" +) +#---- GDS ----# +prefix <- "PRJ222358" +p <- file.path("gds://production/analysis_data/SBJ03001/wgs_tumor_normal", + "20241108fc293a38/L2201805_L2201797_dragen_somatic" +) +outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case +d1 <- Wf_dragen$new(path = p, prefix = prefix) +d1$list_files(max_files = 100) +d1$list_files_filter_relevant(max_files = 300) +d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) d_tidy <- d1$tidy_files(d) d_write <- t1$write( d_tidy, From 4d833f4f0ae32b04ebe565cf9ed64c3f90e0136c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 9 Nov 2024 16:25:05 +1100 Subject: [PATCH 20/32] dragen: grab tn suffix for cov/hist metrics --- R/tso_dragen.R | 88 +++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 897c45e..5dfad15 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -16,7 +16,25 @@ #' d1 <- Wf_dragen$new(path = p, prefix = prefix) #' d1$list_files(max_files = 100) #' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, dryrun = F) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +#' d_tidy <- d1$tidy_files(d) +#' d_write <- t1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' #---- GDS ----# +#' prefix <- "PRJ222358" +#' p <- file.path( +#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", +#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" +#' ) +#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) #' d_tidy <- d1$tidy_files(d) #' d_write <- 
t1$write( #' d_tidy, @@ -39,44 +57,40 @@ Wf_dragen <- R6::R6Class( initialize = function(path = NULL, prefix = NULL) { wname <- "dragen" pref <- prefix - reg1 <- tibble::tribble( + tn1 <- "(|_tumor|_normal)" + regexes <- tibble::tribble( ~regex, ~fun, - glue("{pref}\\-replay\\.json$"), "replay", - glue("{pref}\\.cnv_metrics.csv$"), "cnvMetrics", - glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.exon_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.wgs_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.exon_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.target_bed_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.tmb_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.wgs_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.exon_hist\\.csv$"), "hist", - glue("{pref}\\.target_bed_hist\\.csv$"), "hist", - glue("{pref}\\.tmb_hist\\.csv$"), "hist", - glue("{pref}\\.wgs_hist\\.csv$"), "hist", - glue("{pref}\\.fastqc_metrics\\.csv$"), "fastqcMetrics", - glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", - glue("{pref}\\.gc_metrics\\.csv$"), "gcMetrics", - glue("{pref}\\.gvcf_metrics\\.csv$"), "vcMetrics", - glue("{pref}\\.mapping_metrics\\.csv$"), "mappingMetrics", - glue("{pref}\\.microsat_diffs\\.txt$"), "msiDiffs", - glue("{pref}\\.microsat_output\\.json$"), "msi", - glue("{pref}\\.sv_metrics\\.csv$"), "svMetrics", - glue("{pref}\\.time_metrics\\.csv$"), "timeMetrics", - glue("{pref}\\.trimmer_metrics\\.csv$"), "trimmerMetrics", - glue("{pref}\\.umi_metrics\\.csv$"), "umiMetrics", - glue("{pref}\\.vc_metrics\\.csv$"), "vcMetrics" + glue("{pref}\\-replay\\.json$"), 
"read_replay", + glue("{pref}\\.cnv_metrics.csv$"), "read_cnvMetrics", + glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.wgs_contig_mean_cov{tn1}\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.exon_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.wgs_coverage_metrics{tn1}\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.exon_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.target_bed_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.tmb_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.wgs_fine_hist{tn1}\\.csv$"), "read_fineHist", + glue("{pref}\\.exon_hist\\.csv$"), "read_hist", + glue("{pref}\\.target_bed_hist\\.csv$"), "read_hist", + glue("{pref}\\.tmb_hist\\.csv$"), "read_hist", + glue("{pref}\\.wgs_hist{tn1}\\.csv$"), "read_hist", + glue("{pref}\\.fastqc_metrics\\.csv$"), "read_fastqcMetrics", + glue("{pref}\\.fragment_length_hist\\.csv$"), "read_fragmentLengthHist", + glue("{pref}\\.gc_metrics\\.csv$"), "read_gcMetrics", + glue("{pref}\\.gvcf_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.mapping_metrics\\.csv$"), "read_mappingMetrics", + glue("{pref}\\.microsat_diffs\\.txt$"), "read_msiDiffs", + glue("{pref}\\.microsat_output\\.json$"), "read_msi", + glue("{pref}\\.sv_metrics\\.csv$"), "read_svMetrics", + glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", + glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", + glue("{pref}\\.umi_metrics\\.csv$"), "read_umiMetrics", + glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics" ) - regexes <- reg1 |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - 
) super$initialize(path = path, wname = wname, regexes = regexes) self$prefix <- prefix From 7ec18e57d98d0b1b9b0f5b7966898d310fde6da6 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 10 Nov 2024 23:58:15 +1100 Subject: [PATCH 21/32] dragen: add ploidy estimation metrics --- NAMESPACE | 1 - R/dragen.R | 116 ++++++------------- man/PloidyEstimationMetricsFile.Rd | 100 ---------------- man/Wf_dragen.Rd | 3 +- man/dragen_ploidy_estimation_metrics_read.Rd | 17 +++ 5 files changed, 52 insertions(+), 185 deletions(-) delete mode 100644 man/PloidyEstimationMetricsFile.Rd create mode 100644 man/dragen_ploidy_estimation_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 5270441..12a33de 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ export(BclconvertReports) export(BclconvertReports375) export(File) export(MultiqcFile) -export(PloidyEstimationMetricsFile) export(Wf) export(Wf_dragen) export(Wf_sash) diff --git a/R/dragen.R b/R/dragen.R index 907f776..ce091f8 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -781,89 +781,39 @@ dragen_contig_mean_coverage_plot <- function(d, top_alt_n = 15) { ggplot2::facet_wrap(ggplot2::vars(.data$panel), nrow = 2, scales = "free") } -#' PloidyEstimationMetricsFile R6 Class +#' Read DRAGEN Ploidy Estimation Metrics #' -#' @description -#' Contains methods for reading contents of -#' the `ploidy_estimation_metrics.csv` file output from DRAGEN. -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.ploidy_estimation_metrics.csv.gz", package = "dracarys") -#' pem <- PloidyEstimationMetricsFile$new(x) -#' d <- pem$read() # or read(pem) -#' pem$write(d, out_dir = tempdir(), prefix = "seqc_ploidy", out_format = "tsv") +#' Reads the `ploidy_estimation_metrics.csv` file generated by DRAGEN. +#' @param x Path to file. #' -#' @export -PloidyEstimationMetricsFile <- R6::R6Class( - "PloidyEstimationMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `ploidy_estimation_metrics.csv` file output from DRAGEN. 
- #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) - abbrev_nm <- c( - "Autosomal median coverage" = "cov_auto_median", - "X median coverage" = "cov_x_median", - "Y median coverage" = "cov_y_median", - "1 median / Autosomal median" = "cov_1_div_auto_median", - "2 median / Autosomal median" = "cov_2_div_auto_median", - "3 median / Autosomal median" = "cov_3_div_auto_median", - "4 median / Autosomal median" = "cov_4_div_auto_median", - "5 median / Autosomal median" = "cov_5_div_auto_median", - "6 median / Autosomal median" = "cov_6_div_auto_median", - "7 median / Autosomal median" = "cov_7_div_auto_median", - "8 median / Autosomal median" = "cov_8_div_auto_median", - "9 median / Autosomal median" = "cov_9_div_auto_median", - "10 median / Autosomal median" = "cov_10_div_auto_median", - "11 median / Autosomal median" = "cov_11_div_auto_median", - "12 median / Autosomal median" = "cov_12_div_auto_median", - "13 median / Autosomal median" = "cov_13_div_auto_median", - "14 median / Autosomal median" = "cov_14_div_auto_median", - "15 median / Autosomal median" = "cov_15_div_auto_median", - "16 median / Autosomal median" = "cov_16_div_auto_median", - "17 median / Autosomal median" = "cov_17_div_auto_median", - "18 median / Autosomal median" = "cov_18_div_auto_median", - "19 median / Autosomal median" = "cov_19_div_auto_median", - "20 median / Autosomal median" = "cov_20_div_auto_median", - "21 median / Autosomal median" = "cov_21_div_auto_median", - "22 median / Autosomal median" = "cov_22_div_auto_median", - "X median / Autosomal median" = "cov_x_div_auto_median", - "Y median / Autosomal median" = "cov_y_div_auto_median", - "Ploidy estimation" = "ploidy_est" - ) - - d <- raw |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim("value", names = c("dummy1", "dummy2", "var", "value"), 
delim = ",") |> - dplyr::select("var", "value") |> - dplyr::mutate( - var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - # now convert all except 'Ploidy estimation' to numeric - cols1 <- colnames(d)[colnames(d) != "ploidy_est"] - d |> - dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) - }, - #' @description - #' Writes a tidy version of the `ploidy_estimation_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } +#' @return Tibble with metrics. 
+dragen_ploidy_estimation_metrics_read <- function(x) { + raw <- readr::read_lines(x) + assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) + fun1 <- function(x) { + setNames( + as.character(glue("cov_{x}_div_auto_median")), + as.character(glue("{x} median / Autosomal median")) + ) + } + abbrev_nm <- c( + "Autosomal median coverage" = "cov_auto_median", + "X median coverage" = "cov_x_median", + "Y median coverage" = "cov_y_median", + "Ploidy estimation" = "ploidy_est", + fun1(c(1:22, "X", "Y")) ) -) + + d <- raw |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim("value", names = c("dummy1", "dummy2", "var", "value"), delim = ",") |> + dplyr::select("var", "value") |> + dplyr::mutate( + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") + # now convert all except 'Ploidy estimation' to numeric + cols1 <- colnames(d)[colnames(d) != "ploidy_est"] + d |> + dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) +} diff --git a/man/PloidyEstimationMetricsFile.Rd b/man/PloidyEstimationMetricsFile.Rd deleted file mode 100644 index 7208e81..0000000 --- a/man/PloidyEstimationMetricsFile.Rd +++ /dev/null @@ -1,100 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{PloidyEstimationMetricsFile} -\alias{PloidyEstimationMetricsFile} -\title{PloidyEstimationMetricsFile R6 Class} -\description{ -Contains methods for reading contents of -the \code{ploidy_estimation_metrics.csv} file output from DRAGEN. 
-} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.ploidy_estimation_metrics.csv.gz", package = "dracarys") -pem <- PloidyEstimationMetricsFile$new(x) -d <- pem$read() # or read(pem) -pem$write(d, out_dir = tempdir(), prefix = "seqc_ploidy", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{PloidyEstimationMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-PloidyEstimationMetricsFile-read}{\code{PloidyEstimationMetricsFile$read()}} -\item \href{#method-PloidyEstimationMetricsFile-write}{\code{PloidyEstimationMetricsFile$write()}} -\item \href{#method-PloidyEstimationMetricsFile-clone}{\code{PloidyEstimationMetricsFile$clone()}} -} -} -\if{html}{\out{ -
            Inherited methods - -
            -}} -\if{html}{\out{
            }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PloidyEstimationMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{ploidy_estimation_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{PloidyEstimationMetricsFile$read()}\if{html}{\out{
            }} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
            }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PloidyEstimationMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{ploidy_estimation_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{PloidyEstimationMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
            }} -} - -\subsection{Arguments}{ -\if{html}{\out{
            }} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
            }} -} -} -\if{html}{\out{
            }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PloidyEstimationMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{PloidyEstimationMetricsFile$clone(deep = FALSE)}\if{html}{\out{
            }} -} - -\subsection{Arguments}{ -\if{html}{\out{
            }} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
            }} -} -} -} diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index f3465e6..c11ab54 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -29,7 +29,8 @@ d_write <- t1$write( ) #---- GDS ----# prefix <- "PRJ222358" -p <- file.path("gds://production/analysis_data/SBJ03001/wgs_tumor_normal", +p <- file.path( + "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", "20241108fc293a38/L2201805_L2201797_dragen_somatic" ) outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case diff --git a/man/dragen_ploidy_estimation_metrics_read.Rd b/man/dragen_ploidy_estimation_metrics_read.Rd new file mode 100644 index 0000000..468566f --- /dev/null +++ b/man/dragen_ploidy_estimation_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_ploidy_estimation_metrics_read} +\alias{dragen_ploidy_estimation_metrics_read} +\title{Read DRAGEN Ploidy Estimation Metrics} +\usage{ +dragen_ploidy_estimation_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{ploidy_estimation_metrics.csv} file generated by DRAGEN. 
+} From d2b76c8888a3a3e7c308bb95e3b620355ab4801b Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 11 Nov 2024 00:20:46 +1100 Subject: [PATCH 22/32] dragen: add ploidy estimation metrics --- R/dragen.R | 14 +++++++++----- R/tso_dragen.R | 9 ++++++++- man/Wf_dragen.Rd | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index ce091f8..ea482bf 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -792,18 +792,21 @@ dragen_ploidy_estimation_metrics_read <- function(x) { assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) fun1 <- function(x) { setNames( - as.character(glue("cov_{x}_div_auto_median")), + as.character(glue("cov_{tolower(x)}_div_auto_median")), as.character(glue("{x} median / Autosomal median")) ) } + fun2 <- function(x) { + setNames( + as.character(glue("cov_{tolower(x)}_median")), + as.character(glue("{x} median coverage")) + ) + } abbrev_nm <- c( - "Autosomal median coverage" = "cov_auto_median", - "X median coverage" = "cov_x_median", - "Y median coverage" = "cov_y_median", "Ploidy estimation" = "ploidy_est", + fun2(c("X", "Y", "Autosomal")), fun1(c(1:22, "X", "Y")) ) - d <- raw |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim("value", names = c("dummy1", "dummy2", "var", "value"), delim = ",") |> @@ -812,6 +815,7 @@ dragen_ploidy_estimation_metrics_read <- function(x) { var = dplyr::recode(.data$var, !!!abbrev_nm) ) |> tidyr::pivot_wider(names_from = "var", values_from = "value") + dirty_names_cleaned(unique(colnames(d)), abbrev_nm, x) # now convert all except 'Ploidy estimation' to numeric cols1 <- colnames(d)[colnames(d) != "ploidy_est"] d |> diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 5dfad15..d786296 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -89,7 +89,8 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", glue("{pref}\\.umi_metrics\\.csv$"), 
"read_umiMetrics", - glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics" + glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.ploidy_estimation_metrics\\.csv$"), "read_ploidyMetrics" ) super$initialize(path = path, wname = wname, regexes = regexes) @@ -271,6 +272,12 @@ Wf_dragen <- R6::R6Class( dat <- dragen_umi_metrics_read(x) dat }, + #' @description Read `ploidy_estimation_metrics.csv` file. + #' @param x Path to file. + read_ploidyMetrics = function(x) { + dat <- dragen_ploidy_estimation_metrics_read(x) + tibble::tibble(name = "ploidymetrics", data = list(dat)) + }, #' @description Read `microsat_output.json` file. #' @param x Path to file. read_msi = function(x) { diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index c11ab54..4d520c1 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -77,6 +77,7 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_fastqcMetrics}{\code{Wf_dragen$read_fastqcMetrics()}} \item \href{#method-Wf_dragen-read_gcMetrics}{\code{Wf_dragen$read_gcMetrics()}} \item \href{#method-Wf_dragen-read_umiMetrics}{\code{Wf_dragen$read_umiMetrics()}} +\item \href{#method-Wf_dragen-read_ploidyMetrics}{\code{Wf_dragen$read_ploidyMetrics()}} \item \href{#method-Wf_dragen-read_msi}{\code{Wf_dragen$read_msi()}} \item \href{#method-Wf_dragen-read_msiDiffs}{\code{Wf_dragen$read_msiDiffs()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} @@ -380,6 +381,23 @@ Read \code{umi_metrics.csv} file. \if{html}{\out{
            }}\preformatted{Wf_dragen$read_umiMetrics(x)}\if{html}{\out{
            }} } +\subsection{Arguments}{ +\if{html}{\out{
            }} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
            }} +} +} +\if{html}{\out{
            }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_ploidyMetrics}{}}} +\subsection{Method \code{read_ploidyMetrics()}}{ +Read \code{ploidy_estimation_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
            }}\preformatted{Wf_dragen$read_ploidyMetrics(x)}\if{html}{\out{
            }} +} + \subsection{Arguments}{ \if{html}{\out{
            }} \describe{ From 027bba67cd7b08474ab1b1fe01a478b173228f66 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 11 Nov 2024 14:52:57 +1100 Subject: [PATCH 23/32] fix coderabbit nits --- R/sash.R | 2 +- R/tso_dragen.R | 4 ++-- man/Wf_dragen.Rd | 4 ++-- man/Wf_sash.Rd | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/sash.R b/R/sash.R index 91260b4..883d659 100644 --- a/R/sash.R +++ b/R/sash.R @@ -79,7 +79,7 @@ Wf_sash <- R6::R6Class( #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. - #' @param SampleID_normal The SampleID of the tumor sample. + #' @param SampleID_normal The SampleID of the normal sample. initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL, SampleID_normal = NULL) { wname <- "sash" diff --git a/R/tso_dragen.R b/R/tso_dragen.R index d786296..d4f7411 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -18,7 +18,7 @@ #' d1$list_files_filter_relevant(max_files = 300) #' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) #' d_tidy <- d1$tidy_files(d) -#' d_write <- t1$write( +#' d_write <- d1$write( #' d_tidy, #' outdir = file.path(p, "dracarys_tidy"), #' prefix = prefix, @@ -36,7 +36,7 @@ #' d1$list_files_filter_relevant(max_files = 300) #' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) #' d_tidy <- d1$tidy_files(d) -#' d_write <- t1$write( +#' d_write <- d1$write( #' d_tidy, #' outdir = file.path(p, "dracarys_tidy"), #' prefix = prefix, diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 4d520c1..a3fc949 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -21,7 +21,7 @@ d1$list_files(max_files = 100) d1$list_files_filter_relevant(max_files = 300) d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) d_tidy <- d1$tidy_files(d) -d_write <- t1$write( +d_write <- d1$write( d_tidy, outdir = file.path(p, "dracarys_tidy"), prefix = prefix, @@ -39,7 +39,7 @@ 
d1$list_files(max_files = 100) d1$list_files_filter_relevant(max_files = 300) d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) d_tidy <- d1$tidy_files(d) -d_write <- t1$write( +d_write <- d1$write( d_tidy, outdir = file.path(p, "dracarys_tidy"), prefix = prefix, diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index d3a8f66..23b54d0 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -132,7 +132,7 @@ local filesystem).} \item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} -\item{\code{SampleID_normal}}{The SampleID of the tumor sample.} +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
            }} } From 2e15dd5c4d0f22cbb26b5d849c26a5dacc8eaa18 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Nov 2024 19:41:20 +1100 Subject: [PATCH 24/32] s3_file_presignedurl: increase expiry to 7 days (604800 secs) --- R/fs_s3.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fs_s3.R b/R/fs_s3.R index d67a109..ed32b0f 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -196,7 +196,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, #' } #' #' @export -s3_file_presignedurl <- function(client, s3path, expiry_seconds = 3600) { +s3_file_presignedurl <- function(client, s3path, expiry_seconds = 604800) { bucket <- sub("s3://(.*?)/.*", "\\1", s3path) prefix <- sub("s3://(.*?)/(.*)", "\\2", s3path) client$generate_presigned_url( From 6ba304218ea3231e58c47edbd71aabd0f6a0b72a Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 13 Nov 2024 22:06:28 +1100 Subject: [PATCH 25/32] data-raw/portal_meta.R moved to rportal --- data-raw/portal_meta.R | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 data-raw/portal_meta.R diff --git a/data-raw/portal_meta.R b/data-raw/portal_meta.R deleted file mode 100644 index 56982f6..0000000 --- a/data-raw/portal_meta.R +++ /dev/null @@ -1,31 +0,0 @@ -# portal workflow meta subset -require(dracarys) -require(here) -require(dplyr) -require(purrr) -require(readr) - - -wfs <- c( - "bcl_convert", "rnasum", "tso_ctdna_tumor_only", - "umccrise", "wgs_alignment_qc", "wgs_tumor_normal", "wts_tumor_only", - "wts_alignment_qc", - "oncoanalyser_wgs", "oncoanalyser_wgts_existing_both", - "oncoanalyser_wts", "sash", "star_alignment" -) - -account <- "stg" -get_top_succeeded <- function(wf, num_row = 10, num_top = 4) { - dracarys::portal_meta_read(params = glue::glue("&type_name={wf}"), account = account, rows = num_row) |> - dplyr::filter(.data$end_status == "Succeeded") |> - dplyr::slice_head(n = num_top) -} - -# get top 10 rows, then get top 4 successful runs -d <- 
wfs |> - purrr::map(\(x) get_top_succeeded(x, 10, 4)) |> - dplyr::bind_rows() -# leave dates as character -d |> - readr::write_csv(here::here("inst/extdata/portal_meta_top4.csv")) -# date_fmt <- "%Y-%m-%dT%H:%M:%S" From cbae470632773f4958c9a2392f53e22e134c8ca4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 24 Nov 2024 22:46:53 +1100 Subject: [PATCH 26/32] add dtw_Wf_dragen --- NAMESPACE | 1 + R/dragen.R | 357 +++++++++++++++++++++++++++++++++++++++++++ R/tso_dragen.R | 302 ------------------------------------ man/Wf_dragen.Rd | 2 +- man/dtw_Wf_dragen.Rd | 56 +++++++ 5 files changed, 415 insertions(+), 303 deletions(-) create mode 100644 man/dtw_Wf_dragen.Rd diff --git a/NAMESPACE b/NAMESPACE index 12a33de..0463659 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,7 @@ export(dragen_sv_metrics_read) export(dragen_trimmer_metrics_read) export(dragen_umi_metrics_read) export(dragen_vc_metrics_read) +export(dtw_Wf_dragen) export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) export(empty_tbl) diff --git a/R/dragen.R b/R/dragen.R index ea482bf..9e80b62 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -821,3 +821,360 @@ dragen_ploidy_estimation_metrics_read <- function(x) { d |> dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) } + +#' Wf_dragen Download Tidy and Write +#' +#' Downloads files from the `dragen` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (S3 or local filesystem). +#' @param prefix The LibraryID prefix of the sample. +#' @param outdir Path to output directory with raw files. +#' @param outdir_tidy Path to output directory with tidy files. +#' @param format Format of output files. +#' @param max_files Max number of files to list. +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @return Tibble of tidy tibbles. 
+#' +#' @examples +#' \dontrun{ +#' #---- Local ----# +#' +#' #---- S3 ----# +#' path <- file.path( +#' "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", +#' "analysis/wgts-qc/20241123ffa837c4/L2401621_dragen_alignment" +#' ) +#' prefix <- "L2401621" +#' outdir <- sub("s3:/", "~/s3", path) +#' dragen_tidy <- dtw_Wf_dragen( +#' path = path, prefix = prefix, outdir = outdir, +#' format = "tsv", +#' dryrun = F +#' ) +#' } +#' @export +dtw_Wf_dragen <- function(path, prefix, outdir, + outdir_tidy = file.path(outdir, "dracarys_tidy"), + format = "rds", + max_files = 1000, + dryrun = FALSE) { + obj <- Wf_dragen$new(path = path, prefix = prefix) + d_dl <- obj$download_files( + outdir = outdir, max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- obj$tidy_files(d_dl) + d_write <- obj$write( + d_tidy, + outdir = outdir_tidy, + prefix = prefix, + format = format + ) + return(d_write) + } + return(d_dl) +} + +#' Wf_dragen R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `dragen` workflow. 
+#' +#' @examples +#' \dontrun{ +#' +#' #---- Local ----# +#' prefix <- "L2401290" +#' p <- file.path( +#' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", +#' "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", +#' prefix +#' ) +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +#' d_tidy <- d1$tidy_files(d) +#' d_write <- d1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' #---- GDS ----# +#' prefix <- "PRJ222358" +#' p <- file.path( +#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", +#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" +#' ) +#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +#' d_tidy <- d1$tidy_files(d) +#' d_write <- d1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' } +#' @export +Wf_dragen <- R6::R6Class( + "Wf_dragen", + inherit = Wf, + public = list( + #' @field prefix The LibraryID prefix of the sample (needed for path lookup). + prefix = NULL, + #' @description Create a new Wf_dragen object. + #' @param path Path to directory with raw workflow results (from S3 or + #' local filesystem). + #' @param prefix The LibraryID prefix of the sample (needed for path lookup). 
+ initialize = function(path = NULL, prefix = NULL) { + wname <- "dragen" + pref <- prefix + tn1 <- "(|_tumor|_normal)" + regexes <- tibble::tribble( + ~regex, ~fun, + glue("{pref}\\-replay\\.json$"), "read_replay", + glue("{pref}\\.cnv_metrics.csv$"), "read_cnvMetrics", + glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.wgs_contig_mean_cov{tn1}\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.exon_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.wgs_coverage_metrics{tn1}\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.exon_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.target_bed_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.tmb_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.wgs_fine_hist{tn1}\\.csv$"), "read_fineHist", + glue("{pref}\\.exon_hist\\.csv$"), "read_hist", + glue("{pref}\\.target_bed_hist\\.csv$"), "read_hist", + glue("{pref}\\.tmb_hist\\.csv$"), "read_hist", + glue("{pref}\\.wgs_hist{tn1}\\.csv$"), "read_hist", + glue("{pref}\\.fastqc_metrics\\.csv$"), "read_fastqcMetrics", + glue("{pref}\\.fragment_length_hist\\.csv$"), "read_fragmentLengthHist", + glue("{pref}\\.gc_metrics\\.csv$"), "read_gcMetrics", + glue("{pref}\\.gvcf_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.mapping_metrics\\.csv$"), "read_mappingMetrics", + glue("{pref}\\.microsat_diffs\\.txt$"), "read_msiDiffs", + glue("{pref}\\.microsat_output\\.json$"), "read_msi", + glue("{pref}\\.sv_metrics\\.csv$"), "read_svMetrics", + glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", + glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", + glue("{pref}\\.umi_metrics\\.csv$"), "read_umiMetrics", + 
glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.ploidy_estimation_metrics\\.csv$"), "read_ploidyMetrics" + ) + + super$initialize(path = path, wname = wname, regexes = regexes) + self$prefix <- prefix + }, + #' @description Print details about the Workflow. + #' @param ... (ignored). + print = function(...) { + res <- tibble::tribble( + ~var, ~value, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "prefix", self$prefix + ) + print(res) + invisible(self) + }, + #' @description Read `replay.json` file. + #' @param x Path to file. + read_replay = function(x) { + res <- x |> + jsonlite::read_json(simplifyVector = TRUE) |> + purrr::map_if(is.data.frame, tibble::as_tibble) + req_elements <- c("command_line", "hash_table_build", "dragen_config", "system") + assertthat::assert_that(all(names(res) %in% req_elements)) + res[["system"]] <- res[["system"]] |> + tibble::as_tibble_row() + res[["hash_table_build"]] <- res[["hash_table_build"]] |> + tibble::as_tibble_row() + # we don't care if the columns are characters, no analysis likely to be done on dragen options + # (though never say never!) + res[["dragen_config"]] <- res[["dragen_config"]] |> + tidyr::pivot_wider(names_from = "name", values_from = "value") + dat <- dplyr::bind_cols(res) + tibble::tibble(name = "replay", data = list(dat)) + }, + #' @description Read `contig_mean_cov.csv` file. + #' @param x Path to file. + #' @param keep_alt Keep ALT contigs. + read_contigMeanCov = function(x, keep_alt = FALSE) { + subprefix <- private$dragen_subprefix(x, "_contig_mean_cov") + dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> + dplyr::filter( + if (!keep_alt) { + !grepl("chrM|MT|_|Autosomal|HLA-|EBV|GL|hs37d5", .data$chrom) + } else { + TRUE + } + ) + tibble::tibble(name = glue("contigmeancov_{subprefix}"), data = list(dat[])) + }, + #' @description Read `coverage_metrics.csv` file. + #' @param x Path to file. 
+ read_coverageMetrics = function(x) { + subprefix <- private$dragen_subprefix(x, "_coverage_metrics") + dat <- dragen_coverage_metrics_read(x) + tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) + }, + #' @description Read `fine_hist.csv` file. + #' @param x Path to file. + read_fineHist = function(x) { + subprefix <- private$dragen_subprefix(x, "_fine_hist") + d <- readr::read_csv(x, col_types = "cd") + assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) + # there's a max Depth of 2000+, so convert to numeric for easier plotting + dat <- d |> + dplyr::mutate( + Depth = ifelse(grepl("+", .data$Depth), sub("(\\d*)\\+", "\\1", .data$Depth), .data$Depth), + Depth = as.integer(.data$Depth) + ) |> + dplyr::select(depth = "Depth", n_loci = "Overall") + tibble::tibble(name = glue("finehist_{subprefix}"), data = list(dat)) + }, + #' @description Read `fragment_length_hist.csv` file. + #' @param x Path to file. + read_fragmentLengthHist = function(x) { + d <- readr::read_lines(x) + assertthat::assert_that(grepl("#Sample", d[1])) + dat <- d |> + tibble::enframe(name = "name", value = "value") |> + dplyr::filter(!grepl("#Sample: |FragmentLength,Count", .data$value)) |> + tidyr::separate_wider_delim(cols = "value", names = c("fragmentLength", "count"), delim = ",") |> + dplyr::mutate( + count = as.numeric(.data$count), + fragmentLength = as.numeric(.data$fragmentLength) + ) |> + dplyr::select("fragmentLength", "count") + tibble::tibble(name = "fraglen", data = list(dat)) + }, + #' @description Read `mapping_metrics.csv` file. + #' @param x Path to file. + read_mappingMetrics = function(x) { + dat <- dragen_mapping_metrics_read(x) + tibble::tibble(name = "mapmetrics", data = list(dat)) + }, + #' @description Read `hist.csv` (not `fine_hist.csv`!) file. + #' @param x Path to file. 
+ read_hist = function(x) { + subprefix <- private$dragen_subprefix(x, "_hist") + d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") + dat <- d |> + dplyr::mutate( + var = sub("PCT of bases in .* with coverage ", "", .data$var), + var = gsub("\\[|\\]|\\(|\\)", "", .data$var), + var = gsub("x", "", .data$var), + var = gsub("inf", "Inf", .data$var) + ) |> + tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> + dplyr::mutate( + start = as.numeric(.data$start), + end = as.numeric(.data$end), + pct = round(.data$pct, 2), + cumsum = cumsum(.data$pct) + ) + tibble::tibble(name = glue("hist_{subprefix}"), data = list(dat)) + }, + #' @description Read `time_metrics.csv` file. + #' @param x Path to file. + read_timeMetrics = function(x) { + cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") + ct <- readr::cols( + .default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d" + ) + d <- readr::read_csv(x, col_names = cn, col_types = ct) + assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) + assertthat::assert_that(inherits(d$time_hrs, "hms")) + dat <- d |> + dplyr::mutate( + Step = tools::toTitleCase(sub("Time ", "", .data$Step)), + Step = gsub(" |/", "", .data$Step), + Time = substr(.data$time_hrs, 1, 5) + ) |> + dplyr::select("Step", "Time") |> + tidyr::pivot_wider(names_from = "Step", values_from = "Time") |> + dplyr::relocate("TotalRuntime") + tibble::tibble(name = "timemetrics", data = list(dat)) + }, + #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. + #' @param x Path to file. + read_vcMetrics = function(x) { + subprefix <- private$dragen_subprefix(x, "_metrics") + dat <- dragen_vc_metrics_read(x) + tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) + }, + #' @description Read `trimmer_metrics.csv` file. + #' @param x Path to file. 
+ read_trimmerMetrics = function(x) { + dat <- dragen_trimmer_metrics_read(x) + tibble::tibble(name = "trimmermetrics", data = list(dat[])) + }, + #' @description Read `sv_metrics.csv` file. + #' @param x Path to file. + read_svMetrics = function(x) { + dat <- dragen_sv_metrics_read(x) + tibble::tibble(name = "svmetrics", data = list(dat[])) + }, + #' @description Read `cnv_metrics.csv` file. + #' @param x Path to file. + read_cnvMetrics = function(x) { + dat <- dragen_cnv_metrics_read(x) + tibble::tibble(name = "cnvmetrics", data = list(dat[])) + }, + #' @description Read `fastqc_metrics.csv` file. + #' @param x Path to file. + read_fastqcMetrics = function(x) { + dat <- dragen_fastqc_metrics_read(x) + dat + }, + #' @description Read `gc_metrics.csv` file. + #' @param x Path to file. + read_gcMetrics = function(x) { + dat <- dragen_gc_metrics_read(x) + dat + }, + #' @description Read `umi_metrics.csv` file. + #' @param x Path to file. + read_umiMetrics = function(x) { + dat <- dragen_umi_metrics_read(x) + dat + }, + #' @description Read `ploidy_estimation_metrics.csv` file. + #' @param x Path to file. + read_ploidyMetrics = function(x) { + dat <- dragen_ploidy_estimation_metrics_read(x) + tibble::tibble(name = "ploidymetrics", data = list(dat)) + }, + #' @description Read `microsat_output.json` file. + #' @param x Path to file. + read_msi = function(x) { + dat <- tso_msi_read(x) + tibble::tibble(name = "msi", data = list(dat[])) + }, + #' @description Read `microsat_diffs.txt` file. + #' @param x Path to file. 
+ read_msiDiffs = function(x) { + dat <- readr::read_tsv(x, col_types = "cdccddc") |> + dplyr::rename(Chromosome = "#Chromosome") + tibble::tibble(name = "msidiffs", data = list(dat[])) + } + ), # end public + private = list( + dragen_subprefix = function(x, suffix) { + bname <- basename(x) + s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov + sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" + } + ) +) # end Wf_dragen diff --git a/R/tso_dragen.R b/R/tso_dragen.R index d4f7411..e69de29 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -1,302 +0,0 @@ -#' Wf_dragen R6 Class -#' -#' @description -#' Reads and writes tidy versions of files from the `dragen` workflow. -#' -#' @examples -#' \dontrun{ -#' -#' #---- Local ----# -#' prefix <- "L2401290" -#' p <- file.path( -#' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", -#' "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", -#' prefix -#' ) -#' d1 <- Wf_dragen$new(path = p, prefix = prefix) -#' d1$list_files(max_files = 100) -#' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) -#' d_tidy <- d1$tidy_files(d) -#' d_write <- d1$write( -#' d_tidy, -#' outdir = file.path(p, "dracarys_tidy"), -#' prefix = prefix, -#' format = "tsv" -#' ) -#' #---- GDS ----# -#' prefix <- "PRJ222358" -#' p <- file.path( -#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", -#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" -#' ) -#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case -#' d1 <- Wf_dragen$new(path = p, prefix = prefix) -#' d1$list_files(max_files = 100) -#' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) -#' d_tidy <- d1$tidy_files(d) -#' d_write <- d1$write( -#' d_tidy, -#' outdir = file.path(p, "dracarys_tidy"), -#' prefix = prefix, -#' format = "tsv" -#' ) -#' } 
-#' @export -Wf_dragen <- R6::R6Class( - "Wf_dragen", - inherit = Wf, - public = list( - #' @field prefix The LibraryID prefix of the sample (needed for path lookup). - prefix = NULL, - #' @description Create a new Wf_dragen object. - #' @param path Path to directory with raw workflow results (from S3 or - #' local filesystem). - #' @param prefix The LibraryID prefix of the sample (needed for path lookup). - initialize = function(path = NULL, prefix = NULL) { - wname <- "dragen" - pref <- prefix - tn1 <- "(|_tumor|_normal)" - regexes <- tibble::tribble( - ~regex, ~fun, - glue("{pref}\\-replay\\.json$"), "read_replay", - glue("{pref}\\.cnv_metrics.csv$"), "read_cnvMetrics", - glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.wgs_contig_mean_cov{tn1}\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.exon_coverage_metrics\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.wgs_coverage_metrics{tn1}\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.exon_fine_hist\\.csv$"), "read_fineHist", - glue("{pref}\\.target_bed_fine_hist\\.csv$"), "read_fineHist", - glue("{pref}\\.tmb_fine_hist\\.csv$"), "read_fineHist", - glue("{pref}\\.wgs_fine_hist{tn1}\\.csv$"), "read_fineHist", - glue("{pref}\\.exon_hist\\.csv$"), "read_hist", - glue("{pref}\\.target_bed_hist\\.csv$"), "read_hist", - glue("{pref}\\.tmb_hist\\.csv$"), "read_hist", - glue("{pref}\\.wgs_hist{tn1}\\.csv$"), "read_hist", - glue("{pref}\\.fastqc_metrics\\.csv$"), "read_fastqcMetrics", - glue("{pref}\\.fragment_length_hist\\.csv$"), "read_fragmentLengthHist", - glue("{pref}\\.gc_metrics\\.csv$"), "read_gcMetrics", - glue("{pref}\\.gvcf_metrics\\.csv$"), "read_vcMetrics", - 
glue("{pref}\\.mapping_metrics\\.csv$"), "read_mappingMetrics", - glue("{pref}\\.microsat_diffs\\.txt$"), "read_msiDiffs", - glue("{pref}\\.microsat_output\\.json$"), "read_msi", - glue("{pref}\\.sv_metrics\\.csv$"), "read_svMetrics", - glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", - glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", - glue("{pref}\\.umi_metrics\\.csv$"), "read_umiMetrics", - glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics", - glue("{pref}\\.ploidy_estimation_metrics\\.csv$"), "read_ploidyMetrics" - ) - - super$initialize(path = path, wname = wname, regexes = regexes) - self$prefix <- prefix - }, - #' @description Print details about the Workflow. - #' @param ... (ignored). - print = function(...) { - res <- tibble::tribble( - ~var, ~value, - "path", private$.path, - "wname", private$.wname, - "filesystem", private$.filesystem, - "prefix", self$prefix - ) - print(res) - invisible(self) - }, - #' @description Read `replay.json` file. - #' @param x Path to file. - read_replay = function(x) { - res <- x |> - jsonlite::read_json(simplifyVector = TRUE) |> - purrr::map_if(is.data.frame, tibble::as_tibble) - req_elements <- c("command_line", "hash_table_build", "dragen_config", "system") - assertthat::assert_that(all(names(res) %in% req_elements)) - res[["system"]] <- res[["system"]] |> - tibble::as_tibble_row() - res[["hash_table_build"]] <- res[["hash_table_build"]] |> - tibble::as_tibble_row() - # we don't care if the columns are characters, no analysis likely to be done on dragen options - # (though never say never!) - res[["dragen_config"]] <- res[["dragen_config"]] |> - tidyr::pivot_wider(names_from = "name", values_from = "value") - dat <- dplyr::bind_cols(res) - tibble::tibble(name = "replay", data = list(dat)) - }, - #' @description Read `contig_mean_cov.csv` file. - #' @param x Path to file. - #' @param keep_alt Keep ALT contigs. 
- read_contigMeanCov = function(x, keep_alt = FALSE) { - subprefix <- private$dragen_subprefix(x, "_contig_mean_cov") - dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> - dplyr::filter( - if (!keep_alt) { - !grepl("chrM|MT|_|Autosomal|HLA-|EBV|GL|hs37d5", .data$chrom) - } else { - TRUE - } - ) - tibble::tibble(name = glue("contigmeancov_{subprefix}"), data = list(dat[])) - }, - #' @description Read `coverage_metrics.csv` file. - #' @param x Path to file. - read_coverageMetrics = function(x) { - subprefix <- private$dragen_subprefix(x, "_coverage_metrics") - dat <- dragen_coverage_metrics_read(x) - tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) - }, - #' @description Read `fine_hist.csv` file. - #' @param x Path to file. - read_fineHist = function(x) { - subprefix <- private$dragen_subprefix(x, "_fine_hist") - d <- readr::read_csv(x, col_types = "cd") - assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) - # there's a max Depth of 2000+, so convert to numeric for easier plotting - dat <- d |> - dplyr::mutate( - Depth = ifelse(grepl("+", .data$Depth), sub("(\\d*)\\+", "\\1", .data$Depth), .data$Depth), - Depth = as.integer(.data$Depth) - ) |> - dplyr::select(depth = "Depth", n_loci = "Overall") - tibble::tibble(name = glue("finehist_{subprefix}"), data = list(dat)) - }, - #' @description Read `fragment_length_hist.csv` file. - #' @param x Path to file. 
- read_fragmentLengthHist = function(x) { - d <- readr::read_lines(x) - assertthat::assert_that(grepl("#Sample", d[1])) - dat <- d |> - tibble::enframe(name = "name", value = "value") |> - dplyr::filter(!grepl("#Sample: |FragmentLength,Count", .data$value)) |> - tidyr::separate_wider_delim(cols = "value", names = c("fragmentLength", "count"), delim = ",") |> - dplyr::mutate( - count = as.numeric(.data$count), - fragmentLength = as.numeric(.data$fragmentLength) - ) |> - dplyr::select("fragmentLength", "count") - tibble::tibble(name = "fraglen", data = list(dat)) - }, - #' @description Read `mapping_metrics.csv` file. - #' @param x Path to file. - read_mappingMetrics = function(x) { - dat <- dragen_mapping_metrics_read(x) - tibble::tibble(name = "mapmetrics", data = list(dat)) - }, - #' @description Read `hist.csv` (not `fine_hist.csv`!) file. - #' @param x Path to file. - read_hist = function(x) { - subprefix <- private$dragen_subprefix(x, "_hist") - d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") - dat <- d |> - dplyr::mutate( - var = sub("PCT of bases in .* with coverage ", "", .data$var), - var = gsub("\\[|\\]|\\(|\\)", "", .data$var), - var = gsub("x", "", .data$var), - var = gsub("inf", "Inf", .data$var) - ) |> - tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate( - start = as.numeric(.data$start), - end = as.numeric(.data$end), - pct = round(.data$pct, 2), - cumsum = cumsum(.data$pct) - ) - tibble::tibble(name = glue("hist_{subprefix}"), data = list(dat)) - }, - #' @description Read `time_metrics.csv` file. - #' @param x Path to file. 
- read_timeMetrics = function(x) { - cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") - ct <- readr::cols( - .default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d" - ) - d <- readr::read_csv(x, col_names = cn, col_types = ct) - assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) - assertthat::assert_that(inherits(d$time_hrs, "hms")) - dat <- d |> - dplyr::mutate( - Step = tools::toTitleCase(sub("Time ", "", .data$Step)), - Step = gsub(" |/", "", .data$Step), - Time = substr(.data$time_hrs, 1, 5) - ) |> - dplyr::select("Step", "Time") |> - tidyr::pivot_wider(names_from = "Step", values_from = "Time") |> - dplyr::relocate("TotalRuntime") - tibble::tibble(name = "timemetrics", data = list(dat)) - }, - #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. - #' @param x Path to file. - read_vcMetrics = function(x) { - subprefix <- private$dragen_subprefix(x, "_metrics") - dat <- dragen_vc_metrics_read(x) - tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) - }, - #' @description Read `trimmer_metrics.csv` file. - #' @param x Path to file. - read_trimmerMetrics = function(x) { - dat <- dragen_trimmer_metrics_read(x) - tibble::tibble(name = "trimmermetrics", data = list(dat[])) - }, - #' @description Read `sv_metrics.csv` file. - #' @param x Path to file. - read_svMetrics = function(x) { - dat <- dragen_sv_metrics_read(x) - tibble::tibble(name = "svmetrics", data = list(dat[])) - }, - #' @description Read `cnv_metrics.csv` file. - #' @param x Path to file. - read_cnvMetrics = function(x) { - dat <- dragen_cnv_metrics_read(x) - tibble::tibble(name = "cnvmetrics", data = list(dat[])) - }, - #' @description Read `fastqc_metrics.csv` file. - #' @param x Path to file. - read_fastqcMetrics = function(x) { - dat <- dragen_fastqc_metrics_read(x) - dat - }, - #' @description Read `gc_metrics.csv` file. - #' @param x Path to file. 
- read_gcMetrics = function(x) { - dat <- dragen_gc_metrics_read(x) - dat - }, - #' @description Read `umi_metrics.csv` file. - #' @param x Path to file. - read_umiMetrics = function(x) { - dat <- dragen_umi_metrics_read(x) - dat - }, - #' @description Read `ploidy_estimation_metrics.csv` file. - #' @param x Path to file. - read_ploidyMetrics = function(x) { - dat <- dragen_ploidy_estimation_metrics_read(x) - tibble::tibble(name = "ploidymetrics", data = list(dat)) - }, - #' @description Read `microsat_output.json` file. - #' @param x Path to file. - read_msi = function(x) { - dat <- tso_msi_read(x) - tibble::tibble(name = "msi", data = list(dat[])) - }, - #' @description Read `microsat_diffs.txt` file. - #' @param x Path to file. - read_msiDiffs = function(x) { - dat <- readr::read_tsv(x, col_types = "cdccddc") |> - dplyr::rename(Chromosome = "#Chromosome") - tibble::tibble(name = "msidiffs", data = list(dat[])) - } - ), # end public - private = list( - dragen_subprefix = function(x, suffix) { - bname <- basename(x) - s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov - sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" - } - ) -) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index a3fc949..eeeabf1 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tso_dragen.R +% Please edit documentation in R/dragen.R \name{Wf_dragen} \alias{Wf_dragen} \title{Wf_dragen R6 Class} diff --git a/man/dtw_Wf_dragen.Rd b/man/dtw_Wf_dragen.Rd new file mode 100644 index 0000000..e5ecf7a --- /dev/null +++ b/man/dtw_Wf_dragen.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dtw_Wf_dragen} +\alias{dtw_Wf_dragen} +\title{Wf_dragen Download Tidy and Write} +\usage{ +dtw_Wf_dragen( + path, + prefix, + outdir, + outdir_tidy = file.path(outdir, "dracarys_tidy"), + format = "rds", + 
max_files = 1000, + dryrun = FALSE +) +} +\arguments{ +\item{path}{Path to directory with raw workflow results (S3 or local filesystem).} + +\item{prefix}{The LibraryID prefix of the sample.} + +\item{outdir}{Path to output directory with raw files.} + +\item{outdir_tidy}{Path to output directory with tidy files.} + +\item{format}{Format of output files.} + +\item{max_files}{Max number of files to list.} + +\item{dryrun}{If TRUE, just list the files that will be downloaded (don't +download them).} +} +\value{ +Tibble of tidy tibbles. +} +\description{ +Downloads files from the \code{dragen} workflow and writes them in a tidy format. +} +\examples{ +\dontrun{ +#---- Local ----# + +#---- S3 ----# +path <- file.path( + "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", + "analysis/wgts-qc/20241123ffa837c4/L2401621_dragen_alignment" +) +prefix <- "L2401621" +outdir <- sub("s3:/", "~/s3", path) +dragen_tidy <- dtw_Wf_dragen( + path = path, prefix = prefix, outdir = outdir, + format = "tsv", + dryrun = F +) +} +} From 00c820e026581f0f29b496a4bc77e2711e9a7eb2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 24 Nov 2024 22:47:34 +1100 Subject: [PATCH 27/32] alignqc: refactor s3 dl_and_tidy script --- .../alignment_qc/dl_and_tidy.R | 139 +++++++++++------- man/s3_file_presignedurl.Rd | 2 +- 2 files changed, 87 insertions(+), 54 deletions(-) diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R index 9fbfa12..559290d 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R @@ -5,26 +5,38 @@ require(dracarys, include.only = "umccr_tidy") require(glue, include.only = "glue") require(here, include.only = "here") - require(rportal, include.only = c("portaldb_query_workflow")) + require(rportal, include.only = c("orca_workflow_list")) + require(stringr, include.only = "str_remove_all") + require(tidyr, include.only = 
"unnest") + require(fs, include.only = "dir_create") } -# make sure you have logged into AWS and ICA +# make sure you have logged into AWS c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION") |> rportal::envvar_defined() |> stopifnot() -icav1_token <- Sys.getenv("ICA_ACCESS_TOKEN") |> - dracarys::ica_token_validate() - -query_workflow_alignqc <- function(start_date) { - wfs <- c("wgs_alignment_qc", "wts_alignment_qc") |> - shQuote() |> - paste(collapse = ", ") - q1 <- glue( - "WHERE \"type_name\" IN ({wfs}) AND \"start\" > date(\'{start_date}\') ", - "ORDER BY \"start\" DESC;" - ) - rportal::portaldb_query_workflow(q1) -} +token <- rportal::orca_jwt() |> + rportal::jwt_validate() +dates <- c( + "2024-11-23", + "2024-11-24" +) |> + stringr::str_remove_all("-") |> + paste(collapse = "|") +wf0 <- rportal::orca_workflow_list(wf_name = "wgts-qc", token = token, page_size = 500) +# get pld +wf1 <- wf0 |> + filter(grepl(dates, .data$portalRunId)) |> + rowwise() |> + mutate(pld = list(rportal::orca_wfrid2payload(wfrid = .data$orcabusId, token = token))) |> + ungroup() +# tidy pld +wf2 <- wf1 |> + rowwise() |> + mutate(pld_tidy = list(rportal::pld_wgtsqc(.data$pld))) |> + ungroup() |> + select(workflowRunId = "orcabusId", portalRunId, currentStateTimestamp, pld_tidy) |> + tidyr::unnest(pld_tidy) query_limsrow_libids <- function(libids) { assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) @@ -34,61 +46,82 @@ query_limsrow_libids <- function(libids) { rportal::portaldb_query_limsrow(q1) } -# first read in the workflows table, extract metadata, then join with lims -start_date <- "2024-10-11" -p_raw <- query_workflow_alignqc(start_date) +lims0 <- query_limsrow_libids(wf2$libraryId) -wgs <- p_raw |> - rportal::meta_wgs_alignment_qc(status = "Succeeded") -wts <- p_raw |> - rportal::meta_wts_alignment_qc(status = "Succeeded") -p <- bind_rows(wgs, wts) -lims_raw <- query_limsrow_libids(p$LibraryID) - -lims <- lims_raw |> +lims1 <- lims0 |> 
tidyr::separate_wider_delim( library_id, delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" ) |> select( - subject_id, library_id, sample_id, sample_name, - external_subject_id, external_sample_id, - project_name, project_owner, phenotype, type, - source, assay, quality, workflow + individualId = "subject_id", + libraryId = "library_id", + sampleId = "sample_id", + sampleName = "sample_name", + subjectId = "external_subject_id", + externalSampleId = "external_sample_id", + projectName = "project_name", + projectOwner = "project_owner", + phenotype, type, source, assay, quality, workflow ) |> distinct() -d <- p |> - left_join(lims, by = c("SubjectID" = "subject_id", "LibraryID" = "library_id")) |> +wf_lims <- wf2 |> + left_join(lims1, by = "libraryId") |> select( - "SubjectID", "LibraryID", "SampleID", "lane", "phenotype", "type", "source", - "assay", "workflow", "external_subject_id", "project_name", "project_owner", - "start", "end", "portal_run_id", "gds_outdir_dragen", "fq1", "fq2" + "libraryId", "individualId", "sampleId", "sampleName", "subjectId", + "externalSampleId", "projectName", "projectOwner", + lane = "input_lane", + "phenotype", "sampleType", + date = "currentStateTimestamp", + "source", "assay", "quality", "workflow", + "portalRunId", "output_dragenAlignmentOutputUri", + "input_read1FileUri", "input_read2FileUri", ) |> - mutate(rownum = row_number()) - -tidy_script <- system.file("cli/dracarys.R", package = "dracarys") + mutate(rownum = row_number()) |> + relocate("rownum") +# set up progress bar for the dtw function +nticks <- nrow(wf_lims) +bar_width <- 50 +pb <- progress::progress_bar$new( + format = "[:bar] :current/:total (:percent) elapsed :elapsedfull eta :eta", + total = nticks, clear = FALSE, + show_after = 0, width = bar_width +) +# wrapping the dtw function to use the progress bar +fun1 <- function(path, prefix, outdir) { + pb$tick(0) + res <- dracarys::dtw_Wf_dragen( + path = path, prefix = prefix, + outdir = 
outdir, format = "rds", + max_files = 1000, + dryrun = FALSE + ) + pb$tick() + return(res) +} -meta <- d |> - relocate(rownum) |> +data_tidy <- wf_lims |> rowwise() |> mutate( - indir = gds_outdir_dragen, - outdir = file.path(sub("gds://", "", .data$indir)), - outdir = file.path(normalizePath("~/icav1/g"), .data$outdir), - # indir = file.path(outdir, "dracarys_gds_sync"), # for when debugging locally - cmd = system( - glue( - "echo ---{.data$rownum}--- && ", - "{tidy_script} tidy --in_dir {.data$indir} ", - "--out_dir {.data$outdir} --prefix {.data$SampleID} ", - "--token {icav1_token} ", - "--format rds" + indir = .data$output_dragenAlignmentOutputUri, + outdir = file.path(sub("s3://", "", .data$indir)), + outdir = file.path(normalizePath("~/s3"), .data$outdir) + # indir = file.path(outdir, "dracarys_s3_sync"), # for when debugging locally + ) |> + mutate( + data_tidy = list( + fun1( + path = .data$indir, + prefix = .data$libraryId, + outdir = .data$outdir ) ) ) |> ungroup() -meta |> - saveRDS(here(glue("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/{start_date}_wgts.rds"))) +outdir1 <- fs::dir_create("inst/rmd/umccr_workflows/alignment_qc/nogit/tidy_data_rds") +date1 <- "2024-11-24" +data_tidy |> + saveRDS(here(glue("{outdir1}/{date1}_wgts.rds"))) diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd index 79eb7d6..f7dddd2 100644 --- a/man/s3_file_presignedurl.Rd +++ b/man/s3_file_presignedurl.Rd @@ -4,7 +4,7 @@ \alias{s3_file_presignedurl} \title{S3 Generate Presigned URL} \usage{ -s3_file_presignedurl(client, s3path, expiry_seconds = 3600) +s3_file_presignedurl(client, s3path, expiry_seconds = 604800) } \arguments{ \item{client}{S3 client. 
Make sure you use \code{signature_version = "s3v4"} (see example).} From 814df0c44265c122b8369c2d87a1bc4d9418a0c5 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 24 Nov 2024 23:08:23 +1100 Subject: [PATCH 28/32] alignqc: summary.Rmd -> summary.qmd --- .../alignment_qc/summary.Rmd => reports/wgts-qc/summary.qmd} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename inst/{rmd/umccr_workflows/alignment_qc/summary.Rmd => reports/wgts-qc/summary.qmd} (100%) diff --git a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd b/inst/reports/wgts-qc/summary.qmd similarity index 100% rename from inst/rmd/umccr_workflows/alignment_qc/summary.Rmd rename to inst/reports/wgts-qc/summary.qmd From 592f3cd6af6e54b48d1ec4bd0f0fde2e24edd92f Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 25 Nov 2024 01:08:40 +1100 Subject: [PATCH 29/32] alignqc: refactor summary report --- inst/reports/wgts-qc/summary.qmd | 375 ++++++++++++++++--------------- 1 file changed, 191 insertions(+), 184 deletions(-) diff --git a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 95807e2..51b729b 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -1,36 +1,37 @@ --- -author: "University of Melbourne Centre for Cancer Research" -date: "`r Sys.time()`" -output: - html_document: - toc: true - theme: cosmo - rmdformats::material: - highlight: kate +title: "WGTS Alignment QC Summary" +author: "CCGCM - Genomics Platform Group" +date: now +date-format: "YYYY-MM-DD HH:mm Z" +execute: + echo: false +format: + html: + toc: false + toc-expand: 1 + toc-title: Contents + toc-location: body + highlight-style: github + number-sections: false + link-external-icon: true + link-external-newwindow: true + embed-resources: true + code-copy: true + code-link: true + code-fold: true + code-block-border-left: true + smooth-scroll: true + grid: + body-width: 1300px params: - title: "UMCCR Alignment QC Summary Report" - meta: !r 
here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-10-11_wgts.rds") -description: "UMCCR Alignment QC Summary Report" -title: "`r params$title`" + tidy_data: "~/projects/dracarys/inst/rmd/umccr_workflows/alignment_qc/nogit/tidy_data_rds/2024-11-24_wgts.rds" --- -```{r knitr_opts, include=F} -knitr::opts_chunk$set( - collapse = TRUE, echo = FALSE, - warning = FALSE, message = FALSE, - fig.width = 10, fig.height = 15 -) -``` - -```{css} -.main-container { - max-width: 1400px !important; - margin-left: auto; - margin-right: auto; -} -``` +```{r} +#| label: pkgs +#| message: false +#| warning: false -```{r load_pkgs} { require(dplyr) require(dracarys, include.only = "session_info_kable") @@ -49,92 +50,103 @@ knitr::opts_chunk$set( } ``` -```{r data_setup} -ggplot2::theme_set(ggplot2::theme_bw()) -meta <- params[["meta"]] |> +```{r} +#| label: data_import +tidy_data_path <- params[["tidy_data"]] +d0 <- tidy_data_path |> readr::read_rds() |> - mutate(topup_or_rerun = stringr::str_extract(fq1, "topup|rerun(2)?")) + slice(1:4) |> + mutate( + umccrId = glue("{.data$individualId}_{.data$libraryId}_{.data$lane}"), + umccrId = factor(.data$umccrId), + projectOwnerName = glue("{.data$projectOwner}_{.data$projectName}") + ) |> + select( + "umccrId", + "subjectId", + "libraryId", + "projectOwnerName", + "sampleType", + "phenotype", + "source", + "quality", + "assay", + "workflow", + "portalRunId", + "date", + "data_tidy" + ) +``` + +```{r} +#| label: data_setup +ggplot2::theme_set(ggplot2::theme_bw()) stopifnot(all(dir.exists(meta$outdir))) options(scipen = 999) # disable scientific notation options(width = 150) -filepaths <- function(indir, sampleid, suffix = "rds") { - tibble::tibble( - ftype = c( - paste0( - "FastqcMetricsFile_", - c( - "positional_base_content", "positional_base_mean_quality", - "positional_quality", "read_gc_content", "read_gc_content_quality", - "read_lengths", "read_mean_quality", "sequence_positions" - ) - ), - "FragmentLengthHistFile", - 
"MappingMetricsFile", - "PloidyEstimationMetricsFile", - "ReplayFile", - "TimeMetricsFile", - "TrimmerMetricsFile", - "WgsContigMeanCovFile", - "WgsCoverageMetricsFile", - "WgsFineHistFile", - "WgsHistFile" - ) - ) |> - mutate( - fpath = file.path(indir, glue("{sampleid}_{.data$ftype}.{suffix}")), - file_exists = file.exists(.data$fpath) - ) -} -dat <- meta |> - rowwise() |> - mutate( - fpaths = list(filepaths(indir = .data$outdir, sampleid = .data$SampleID)), - umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}"), - umccrid = if_else(is.na(.data$topup_or_rerun), .data$umccrid, glue("{umccrid}_{.data$topup_or_rerun}")) - ) |> - select("umccrid", "phenotype", "type", "source", "fpaths") |> - tidyr::unnest(fpaths) |> - filter(.data$file_exists) |> - rowwise() |> - mutate( - dat = list(readr::read_rds(.data$fpath)) +d_unnest <- d0 |> + select( + "umccrId", "libraryId", "subjectId", + type = "sampleType", "phenotype", "source", + "quality", "assay", "workflow", "projectOwnerName", "portalRunId", tidy = "data_tidy" ) |> - ungroup() - -eval <- dat |> - group_by(ftype) |> - count(file_exists, name = "nf") |> - mutate(eval = nf > 0) |> - select("ftype", "eval") |> - tibble::deframe() |> - as.list() - -# filetype-specific access -d <- dat |> - select("umccrid", "phenotype", "type", "source", "ftype", "dat") |> - tidyr::nest(data = c("umccrid", "phenotype", "type", "source", "dat")) + tidyr::unnest("tidy", names_sep = "_") +# tablename-specific access +# columns: tidy_name, data +# rows: 1 per tidy_table name +d_name <- d_unnest |> + tidyr::nest(.by = "tidy_name", .key = "data") + +## A tibble: 18 × 2 +# tidy_name data +# +# 1 contigmeancov_wgs +# 2 covmetrics_wgs +# 3 finehist_wgs +# 4 fqc_positionalBaseContent +# 5 fqc_positionalBaseMeanQuality +# 6 fqc_positionalQuality +# 7 fqc_readGCContent +# 8 fqc_readGCContentQuality +# 9 fqc_readLengths +# 10 fqc_readMeanQuality +# 11 fqc_sequencePositions +# 12 fraglen +# 13 hist_wgs +# 14 mapmetrics +# 15 
ploidymetrics +# 16 replay +# 17 timemetrics +# 18 trimmermetrics # sample-specific access -# d_samp <- dat |> -# select("umccrid", "phenotype", "type", "ftype", "dat") |> -# tidyr::nest(data = c("phenotype", "type", "ftype", "dat")) |> -# arrange(desc("umccrid")) +# columns: umccrId, data +# rows: 1 per umccrId +d_samp <- d_unnest |> + tidyr::nest(.by = "umccrId", .key = "data") |> + arrange(desc(.data$umccrId)) +## A tibble: 4 × 2 +# umccrId data +# +# 1 SBJ05890_L2401624 +# 2 SBJ05889_L2401644 +# 3 SBJ05888_L2401621 +# 4 SBJ05856_L2401572 + +dr_unnest <- function(x1, ...) { + d_name |> + dplyr::filter(.data$tidy_name == x1) |> + tidyr::unnest("data") |> + dplyr::mutate(nrows = purrr::map_int(.data$tidy_data, nrow)) |> + dplyr::filter(.data$nrows > 0) |> + dplyr::select(dplyr::everything(), -c("tidy_name", "nrows")) |> + dplyr::relocate("tidy_data", .after = dplyr::last_col()) |> + tidyr::unnest("tidy_data") +} ``` ```{r funcs} -dr_unnest <- function(x1) { - d |> - filter(.data$ftype == x1) |> - tidyr::unnest(data) |> - rowwise() |> - mutate(nrows = nrow(.data$dat)) |> - ungroup() |> - filter(nrows > 0) |> - tidyr::unnest(dat) |> - select("umccrid", "phenotype", "type", "source", everything(), -c("ftype", "nrows")) -} - dt_view <- function(x, caption = NULL, scroll_y = 10 + min(nrow(x) * 35, 570), ...) 
{ x |> DT::datatable( @@ -155,15 +167,12 @@ blank_lines <- function(n = 10) { cat(rep("  ", n), sep = "\n") } -get_sbj_url <- function(x, colour = NULL, account = "pro") { - assertthat::assert_that(account %in% c("pro", "stg", "dev")) - account <- ifelse(account == "pro", "", account) - sbj_url <- glue("https://portal{account}.umccr.org/subjects/{x}/overview") +get_lib_url <- function(lid, text, colour = NULL) { + url <- glue("https://orcaui.umccr.org/lab?tab=library&search={lid}") if (!is.null(colour)) { - return(glue("{x}")) + return(glue("{text}")) } - sbj_url <- glue("{x}") - sbj_url + return(glue("{text}")) } type_col <- list( @@ -175,17 +184,13 @@ type_col <- list( ## Sample Metadata ```{r meta} -meta |> - arrange(desc(SubjectID), type, LibraryID, lane) |> +d0 |> + arrange(desc(.data$umccrId), libraryId, sampleType) |> mutate( - SubjectID = get_sbj_url(.data$SubjectID), - durationMin = round(end - start) - ) |> - select( - SubjectID, type, LibraryID, lane, durationMin, topup_or_rerun, - everything(), - -c("rownum", "indir", "outdir", "cmd", "fq1", "fq2") + umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), + date_analysis_end = lubridate::ymd_hms(.data$date) ) |> + select(umccrId, type = "sampleType", everything(), -data_tidy) |> dt_view(escape = FALSE) |> DT::formatStyle( "type", @@ -200,33 +205,39 @@ meta |> ### Mapping ```{r mm, eval=eval$MappingMetricsFile} -d_map <- dr_unnest("MappingMetricsFile") |> - arrange(desc(umccrid), type) |> +d_map <- dr_unnest("mapmetrics") |> + arrange(desc(umccrId), type) |> + mutate( + umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), + RG = ifelse(.data$RG == "Total", "Total", "RG") + ) |> select( - umccrid, phenotype, type, - source, - tot = reads_tot_rg_dragen, - dup = reads_num_dupmarked_dragen, - `dup%` = reads_num_dupmarked_dragen_pct, - `map%` = reads_mapped_dragen_pct, - `unmap%` = reads_unmapped_dragen_pct, - `uniq%` = reads_num_uniq_dragen_pct, - `uniq_map%` = 
reads_num_uniq_mapped_dragen_pct, - `paired%` = reads_paired_dragen_pct, - `paired_proper%` = reads_paired_proper_dragen_pct, - `singleton%` = reads_singleton_dragen_pct, - `discordant%` = reads_discordant_dragen_pct, - `rrna_filt%` = reads_rrna_filtered_dragen_pct, - `splicejunc%` = reads_splicejunc_dragen_pct, - `mapq_0-10%` = reads_mapq_0_10_dragen_pct, - `mapq_10-20%` = reads_mapq_10_20_dragen_pct, - `mapq_20-30%` = reads_mapq_20_30_dragen_pct, - `mapq_30-40%` = reads_mapq_30_40_dragen_pct, - `mapq_40-Inf%` = reads_mapq_40_inf_dragen_pct, - read_len = read_len_dragen, - insert_len_med = insert_len_median_dragen, - insert_len_mean = insert_len_mean_dragen, - everything() + umccrId, subjectId, + phenotype, type, + source, quality, assay, workflow, projectOwnerName, portalRunId, RG, + tot_reads = reads_tot_rg, + dup_reads = reads_num_dupmarked, + `dup%` = reads_num_dupmarked_pct, + `map%` = reads_mapped_pct, + `unmap%` = reads_unmapped_pct, + `uniq%` = reads_num_uniq_pct, + `uniq_map%` = reads_num_uniq_mapped_pct, + `paired%` = reads_paired_pct, + `paired_proper%` = reads_paired_proper_pct, + `singleton%` = reads_singleton_pct, + `discordant%` = reads_discordant_pct, + `rrna_filt%` = reads_rrna_filtered_pct, + `splicejunc%` = reads_splicejunc_pct, + `mapq_0-10%` = reads_mapq_0_10_pct, + `mapq_10-20%` = reads_mapq_10_20_pct, + `mapq_20-30%` = reads_mapq_20_30_pct, + `mapq_30-40%` = reads_mapq_30_40_pct, + `mapq_40-Inf%` = reads_mapq_40_inf_pct, + read_len = read_len, + insert_len_med = insert_len_median, + insert_len_mean = insert_len_mean, + everything(), + -c("libraryId", "tidy_prefix", "dragen_sample") ) num_cols <- purrr::map_lgl(d_map, is.numeric) num_pct_cols <- grepl("%", names(d_map)) & num_cols @@ -239,14 +250,7 @@ conf <- list( pink_range = c(8, 20) ) d_map |> - left_join( - meta |> - mutate(umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")) |> - select(umccrid, assay, workflow, project_name, project_owner), - by = "umccrid" - ) |> - 
select(umccrid, phenotype, type, source, assay, workflow, project_name, project_owner, everything()) |> - dt_view(scroll_y = 1500) |> + dt_view(scroll_y = 1500, escape = FALSE) |> DT::formatCurrency(columns = names(d_map)[num_pct_cols], currency = "", digits = 1) |> DT::formatCurrency(columns = names(d_map)[no_numpct_cols], currency = "", digits = 0) |> DT::formatStyle( @@ -276,49 +280,52 @@ d_map |> - Ploidy metrics only for **WGS**. ```{r covm, eval=eval$WgsCoverageMetricsFile} -d_pl <- dr_unnest("PloidyEstimationMetricsFile") |> - arrange(desc(umccrid)) +d_pl <- dr_unnest("ploidymetrics") |> + arrange(desc(umccrId)) d_pl_metrics <- d_pl |> select( - umccrid, phenotype, type, source, - ploidy = ploidy_est_dragen, - cvg_auto_med_ploidy = cov_auto_median_dragen, - cvg_x_med_ploidy = cov_x_median_dragen, - cvg_y_med_ploidy = cov_y_median_dragen + umccrId, subjectId, + phenotype, type, + source, quality, assay, workflow, projectOwnerName, portalRunId, + ploidy = ploidy_est, + cvg_auto_med_ploidy = cov_autosomal_median, + cvg_x_med_ploidy = cov_x_median, + cvg_y_med_ploidy = cov_y_median ) # cov_genome_pct_* metrics are in the Hist data, so filter out here -d_cvg <- dr_unnest("WgsCoverageMetricsFile") |> - arrange(desc(umccrid)) |> - left_join(d_pl_metrics, by = c("umccrid", "phenotype", "type", "source")) |> +d_cvg <- dr_unnest("covmetrics_wgs") |> + arrange(desc(umccrId)) |> + left_join(d_pl_metrics, by = c( + "umccrId", "subjectId", "phenotype", "type", "source", + "quality", "assay", + "workflow", "projectOwnerName", "portalRunId" + )) |> + mutate(umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId)) |> select( - umccrid, phenotype, type, source, + "umccrId", "phenotype", "type", "source", + "quality", "assay", + "workflow", "projectOwnerName", "portalRunId", ploidy, - cvg_auto_avg = cov_avg_auto_over_genome_dragen, - cvg_auto_med = cov_median_auto_over_genome_dragen, - cvg_x_avg = cov_avg_x_over_genome_dragen, - cvg_y_avg = 
cov_avg_y_over_genome_dragen, - cvg_uniq = cov_alignment_avg_over_genome_dragen, - cvg_mito_avg = cov_avg_mt_over_genome_dragen, + cvg_auto_avg = cov_avg_auto_over_genome, + cvg_auto_med = cov_median_auto_over_genome, + cvg_x_avg = cov_avg_x_over_genome, + cvg_y_avg = cov_avg_y_over_genome, + cvg_uniq = cov_alignment_avg_over_genome, + cvg_mito_avg = cov_avg_mt_over_genome, cvg_auto_med_ploidy, cvg_x_med_ploidy, cvg_y_med_ploidy, - reads_aligned_dragen, - bases_aligned_dragen, - cvg_gt02 = cov_uniformity_over_genome_pct_gt02mean_dragen, - cvg_gt04 = cov_uniformity_over_genome_pct_gt04mean_dragen, + reads_aligned_tot, + bases_aligned_tot, + cvg_gt02 = cov_uniformity_pct_gt02mean_genome, + cvg_gt04 = cov_uniformity_pct_gt04mean_genome, , everything(), - -contains("cov_genome_pct_") + -contains("cov_pct_"), + -c("libraryId", "tidy_prefix") ) num_cols <- names(d_cvg)[purrr::map_lgl(d_cvg, is.numeric)] d_cvg |> - left_join( - meta |> - mutate(umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")) |> - select(umccrid, assay, workflow, project_name, project_owner), - by = "umccrid" - ) |> - select(umccrid, phenotype, type, source, assay, workflow, project_name, project_owner, everything()) |> - dt_view(scroll_y = 1500) |> + dt_view(scroll_y = 1500, escape = FALSE) |> DT::formatCurrency(columns = num_cols, currency = "", digits = 1) |> DT::formatStyle( "ploidy", @@ -337,7 +344,7 @@ d_cvg |> ### Trimmer ```{r trim, eval=eval$TrimmerMetricsFile} -d_tr <- dr_unnest("TrimmerMetricsFile") |> +d_tr <- dr_unnest("trimmermetrics") |> arrange(desc(umccrid)) |> select( umccrid, phenotype, type, source, From 661e01ee51535c44bad5288e46434337e4d2e065 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 1 Dec 2024 15:06:25 +1100 Subject: [PATCH 30/32] alignqc: refactor summary report --- inst/reports/wgts-qc/summary.qmd | 286 +++++++++--------- .../alignment_qc/dl_and_tidy.R | 4 +- 2 files changed, 143 insertions(+), 147 deletions(-) diff --git 
a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 51b729b..3601013 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -13,7 +13,7 @@ format: toc-location: body highlight-style: github number-sections: false - link-external-icon: true + link-external-icon: false link-external-newwindow: true embed-resources: true code-copy: true @@ -55,7 +55,6 @@ params: tidy_data_path <- params[["tidy_data"]] d0 <- tidy_data_path |> readr::read_rds() |> - slice(1:4) |> mutate( umccrId = glue("{.data$individualId}_{.data$libraryId}_{.data$lane}"), umccrId = factor(.data$umccrId), @@ -81,7 +80,6 @@ d0 <- tidy_data_path |> ```{r} #| label: data_setup ggplot2::theme_set(ggplot2::theme_bw()) -stopifnot(all(dir.exists(meta$outdir))) options(scipen = 999) # disable scientific notation options(width = 150) @@ -146,7 +144,8 @@ dr_unnest <- function(x1, ...) { } ``` -```{r funcs} +```{r} +#| label: funcs dt_view <- function(x, caption = NULL, scroll_y = 10 + min(nrow(x) * 35, 570), ...) { x |> DT::datatable( @@ -183,14 +182,15 @@ type_col <- list( ## Sample Metadata -```{r meta} +```{r} +#| label: meta d0 |> arrange(desc(.data$umccrId), libraryId, sampleType) |> mutate( umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), date_analysis_end = lubridate::ymd_hms(.data$date) ) |> - select(umccrId, type = "sampleType", everything(), -data_tidy) |> + select(umccrId, type = "sampleType", everything(), -data_tidy, -date) |> dt_view(escape = FALSE) |> DT::formatStyle( "type", @@ -204,7 +204,8 @@ d0 |> ### Mapping -```{r mm, eval=eval$MappingMetricsFile} +```{r} +#| label: mapmetrics d_map <- dr_unnest("mapmetrics") |> arrange(desc(umccrId), type) |> mutate( @@ -279,7 +280,8 @@ d_map |> - Ploidy metrics only for **WGS**. 
-```{r covm, eval=eval$WgsCoverageMetricsFile} +```{r} +#| label: covmetrics d_pl <- dr_unnest("ploidymetrics") |> arrange(desc(umccrId)) d_pl_metrics <- d_pl |> @@ -343,38 +345,45 @@ d_cvg |> ### Trimmer -```{r trim, eval=eval$TrimmerMetricsFile} +```{r} +#| label: trimmermetrics d_tr <- dr_unnest("trimmermetrics") |> - arrange(desc(umccrid)) |> + arrange(desc(umccrId), type) |> + mutate( + umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId) + ) |> select( - umccrid, phenotype, type, source, - reads_tot = reads_tot_input_dragen, - read_len_avg = read_len_avg_dragen, - `polygkmers3r1_remain%` = polygkmers3r1_remaining_dragen_pct, - `polygkmers3r2_remain%` = polygkmers3r2_remaining_dragen_pct, - `polyg_soft_trimmed_reads_unfilt_3r1%` = polyg_soft_trimmed_reads_unfilt_3r1_dragen_pct, - `polyg_soft_trimmed_reads_unfilt_3r2%` = polyg_soft_trimmed_reads_unfilt_3r2_dragen_pct, - `polyg_soft_trimmed_bases_unfilt_3r1%` = polyg_soft_trimmed_bases_unfilt_3r1_dragen_pct, - `polyg_soft_trimmed_bases_unfilt_3r2%` = polyg_soft_trimmed_bases_unfilt_3r2_dragen_pct, - polygkmers3r1_remaining = polygkmers3r1_remaining_dragen, - polygkmers3r2_remaining = polygkmers3r2_remaining_dragen, - polyg_soft_trimmed_reads_unfilt_3r1 = polyg_soft_trimmed_reads_unfilt_3r1_dragen, - polyg_soft_trimmed_reads_unfilt_3r2 = polyg_soft_trimmed_reads_unfilt_3r2_dragen, - polyg_soft_trimmed_bases_unfilt_3r1 = polyg_soft_trimmed_bases_unfilt_3r1_dragen, - polyg_soft_trimmed_bases_unfilt_3r2 = polyg_soft_trimmed_bases_unfilt_3r2_dragen, - bases_tot = bases_tot_dragen, - bases_r1 = bases_r1_dragen, - bases_r2 = bases_r2_dragen, - reads_trimmed_tot = reads_trimmed_tot_dragen, - `reads_trimmed_tot%` = reads_trimmed_tot_dragen_pct, - bases_trimmed_tot = bases_trimmed_tot_dragen, - `bases_trimmed_tot%` = bases_trimmed_tot_dragen_pct, - reads_tot_filt = reads_tot_filt_dragen, - `reads_tot_filt%` = reads_tot_filt_dragen_pct, - everything() + umccrId, subjectId, + phenotype, type, + source, 
quality, assay, workflow, projectOwnerName, portalRunId, + reads_tot = reads_tot_input, + read_len_avg = read_len_avg, + `polygkmers3r1_remain%` = polygkmers3r1_remaining_pct, + `polygkmers3r2_remain%` = polygkmers3r2_remaining_pct, + `polyg_soft_trimmed_reads_unfilt_3r1%` = polyg_soft_trimmed_reads_unfilt_3r1_pct, + `polyg_soft_trimmed_reads_unfilt_3r2%` = polyg_soft_trimmed_reads_unfilt_3r2_pct, + `polyg_soft_trimmed_bases_unfilt_3r1%` = polyg_soft_trimmed_bases_unfilt_3r1_pct, + `polyg_soft_trimmed_bases_unfilt_3r2%` = polyg_soft_trimmed_bases_unfilt_3r2_pct, + polygkmers3r1_remaining = polygkmers3r1_remaining, + polygkmers3r2_remaining = polygkmers3r2_remaining, + polyg_soft_trimmed_reads_unfilt_3r1 = polyg_soft_trimmed_reads_unfilt_3r1, + polyg_soft_trimmed_reads_unfilt_3r2 = polyg_soft_trimmed_reads_unfilt_3r2, + polyg_soft_trimmed_bases_unfilt_3r1 = polyg_soft_trimmed_bases_unfilt_3r1, + polyg_soft_trimmed_bases_unfilt_3r2 = polyg_soft_trimmed_bases_unfilt_3r2, + bases_tot = bases_tot, + bases_r1 = bases_r1, + bases_r2 = bases_r2, + reads_trimmed_tot = reads_trimmed_tot, + `reads_trimmed_tot%` = reads_trimmed_tot_pct, + bases_trimmed_tot = bases_trimmed_tot, + `bases_trimmed_tot%` = bases_trimmed_tot_pct, + reads_tot_filt = reads_tot_filt, + `reads_tot_filt%` = reads_tot_filt_pct, + everything(), + -c("libraryId", "tidy_prefix") ) d_tr |> - dt_view() |> + dt_view(escape = FALSE) |> DT::formatStyle( "type", color = DT::styleEqual( @@ -391,9 +400,11 @@ in the legend. 
### Read Mean Quality ('Per-Sequence Quality Scores') -```{r read_mean_qual, fig.height=10} -f1 <- dr_unnest("FastqcMetricsFile_read_mean_quality") |> - group_by(umccrid, mate) |> +```{r} +#| label: fqc_readMeanQuality +#| fig-height: 10 +f1 <- dr_unnest("fqc_readMeanQuality") |> + group_by(umccrId, mate) |> mutate( tot = sum(.data$value), prop = round(.data$value / .data$tot, 3), @@ -413,7 +424,7 @@ f1_plot <- ggplot() + fill = rep(fqc_colours1$col, length(unique(f1$type))), alpha = 0.7 ) + - geom_line(data = f1, aes(x = q, y = prop, colour = umccrid, linetype = mate), linewidth = 1, show.legend = FALSE) + + geom_line(data = f1, aes(x = q, y = prop, colour = umccrId, linetype = mate), linewidth = 1, show.legend = FALSE) + scale_y_continuous(labels = scales::label_comma()) + theme(panel.grid.major = element_blank()) + facet_wrap(~type, ncol = 1) + @@ -422,15 +433,17 @@ f1_plot <- ggplot() + subtitle = glue("Percentage of reads with average quality scores. Shows if\na subset of reads has poor quality.") ) -plotly::ggplotly(f1_plot) -# f1_plot +# plotly::ggplotly(f1_plot) +f1_plot ``` ### GC Content ('Per-Sequence GC Content') -```{r read_gc, fig.height=10} -gc_data <- dr_unnest("FastqcMetricsFile_read_gc_content") |> - group_by(umccrid, mate) |> +```{r} +#| label: fqc_readGCContent +#| fig-height: 10 +gc_data <- dr_unnest("fqc_readGCContent") |> + group_by(umccrId, mate) |> mutate( tot = sum(.data$value), prop = round(.data$value / .data$tot, 3), @@ -439,7 +452,7 @@ gc_data <- dr_unnest("FastqcMetricsFile_read_gc_content") |> ungroup() gc_data_plot <- gc_data |> - ggplot(aes(x = pct, y = prop, colour = umccrid)) + + ggplot(aes(x = pct, y = prop, colour = umccrId)) + geom_line(aes(linetype = mate), alpha = 0.4, linewidth = 1) + facet_wrap(~type, ncol = 1) + labs( @@ -448,14 +461,15 @@ gc_data_plot <- gc_data |> title = "Read GC Content", subtitle = glue("Total number of reads with each GC content\npercentile between 0% and 100%") ) -plotly::ggplotly(gc_data_plot) -# 
gc_data_plot +# plotly::ggplotly(gc_data_plot) +gc_data_plot ``` ### GC Content Quality ('GC Content Mean Quality Scores') -```{r read_gc_qual} -f1 <- dr_unnest("FastqcMetricsFile_read_gc_content_quality") |> +```{r} +#| label: fqc_readGCContentQuality +f1 <- dr_unnest("fqc_readGCContentQuality") |> filter(!is.na(.data$value)) fqc_colours2 <- tibble::tibble( start = c(0, 20, 28), @@ -469,28 +483,31 @@ f1_plot <- ggplot() + fill = rep(fqc_colours2$col, length(unique(f1$type))), alpha = 0.7 ) + - geom_line(data = f1, aes(x = pct, y = value, colour = umccrid, linetype = mate), linewidth = 1) + + geom_line(data = f1, aes(x = pct, y = value, colour = umccrId, linetype = mate), linewidth = 1) + facet_wrap(~type, ncol = 1) + labs( title = "GC Content Quality", subtitle = glue("Average Phred-scale read mean quality for reads with\neach GC content percentile between 0% and 100%.") ) -plotly::ggplotly(f1_plot) -# f1_plot +# plotly::ggplotly(f1_plot) +f1_plot ``` ### Positional Base Content ('Per-Position Sequence Content') - TODO: create heatmap instead -```{r fqc_pbc, eval=F, fig.height=42} -f1 <- dr_unnest("FastqcMetricsFile_positional_base_content") +```{r} +#| label: fqc_pbc +#| fig-height: 42 +#| eval: false +f1 <- dr_unnest("fqc_positionalBaseContent") f1 |> filter(base != "N") |> mutate(prop = prop * 100) |> ggplot(aes(x = pos, y = prop, colour = base, group = base)) + geom_line() + - facet_grid(forcats::fct_rev(umccrid) ~ mate) + + facet_grid(forcats::fct_rev(umccrId) ~ mate) + labs( x = "Position in Read (bp)", y = "Proportion of Bases", @@ -505,17 +522,20 @@ f1 |> ### Positional Base Mean Quality ('Per-Position Mean Quality Scores') -```{r fqc_bmq, eval=F, fig.height=80} -f1 <- dr_unnest("FastqcMetricsFile_positional_base_mean_quality") +```{r} +#| label: fqc_bmq +#| fig-height: 80 +#| eval: false +f1 <- dr_unnest("fqc_positionalBaseMeanQuality") ggplot() + geom_rect( data = fqc_colours2, mapping = aes(ymin = start, ymax = end, xmin = -Inf, xmax = Inf), - fill = 
rep(fqc_colours2$col, length(unique(f1$umccrid)) * length(unique(f1$mate))), + fill = rep(fqc_colours2$col, length(unique(f1$umccrId)) * length(unique(f1$mate))), alpha = 0.7 ) + geom_line(data = f1, aes(x = pos, y = value, colour = base)) + - facet_grid(forcats::fct_rev(umccrid) ~ mate) + + facet_grid(forcats::fct_rev(umccrId) ~ mate) + labs( x = "Position in Read (bp)", y = "Quality Score", @@ -529,16 +549,20 @@ ggplot() + ### Positional Quality ('Per-Position Quality Score Ranges') -```{r fqc_pq, eval=F, fig.width=13} +```{r} +#| label: fqc_pq +#| fig-width: 13 +#| eval: false + # TODO: use boxplot instead of point -f1 <- dr_unnest("FastqcMetricsFile_positional_quality") +f1 <- dr_unnest("fqc_positionalQuality") quants <- c(25, 50, 75) f1 |> mutate(pos = as.integer(.data$pos)) |> filter(pct %in% quants) |> ggplot(aes(x = pos, y = value, colour = pct)) + geom_point() + - facet_wrap(~ forcats::fct_rev(umccrid)) + + facet_wrap(~ forcats::fct_rev(umccrId)) + labs( title = "Positional Quality", subtitle = glue("Phred-scale quality value for bases at a given location and a\ngiven quantile of the distribution ({paste(quants, collapse = ', ')})") @@ -547,17 +571,20 @@ f1 |> ### Read Lengths ('Sequence Length Distribution') -```{r read_len, fig.height=8} -read_len <- dr_unnest("FastqcMetricsFile_read_lengths") +```{r} +#| label: fqc_readLengths +#| fig-height: 8 +read_len <- dr_unnest("fqc_readLengths") read_len_plot <- read_len |> - group_by(umccrid, mate) |> + group_by(umccrId, mate) |> mutate( tot = sum(.data$value), prop = round(.data$value / .data$tot, 3), prop = 100 * prop ) |> ungroup() |> - ggplot(aes(x = bp, y = prop, colour = umccrid)) + + select(umccrId, type, mate, bp, value, tot, prop) |> + ggplot(aes(x = bp, y = prop, colour = umccrId)) + geom_line(aes(linetype = mate), linewidth = 1) + theme( panel.grid.major = element_blank() @@ -567,19 +594,20 @@ read_len_plot <- read_len |> title = "Read Lengths", subtitle = glue("Read percentage with each observed 
length.") ) -plotly::ggplotly(read_len_plot) -# read_len_plot +# plotly::ggplotly(read_len_plot) +read_len_plot ``` ### Sequence Positions ('Adapter Content') - -```{r seq_pos, eval=T, fig.height=42} -f1 <- dr_unnest("FastqcMetricsFile_sequence_positions") +```{r} +#| label: fqc_sequencePositions +#| fig-height: 42 +f1 <- dr_unnest("fqc_sequencePositions") f1 |> ggplot(aes(x = bp, y = value, colour = seq)) + geom_line() + - facet_grid(forcats::fct_rev(umccrid) ~ mate, scales = "free_y") + + facet_grid(forcats::fct_rev(umccrId) ~ mate, scales = "free_y") + labs(title = glue( "Number of times an adapter or other kmer sequence is found,\n", "starting at a given position in the input reads." @@ -590,16 +618,21 @@ f1 |> ## Coverage {.tabset .tabset-pills} -```{r contig_cvg, eval=F, results='asis', fig.height=5} -d1 <- dr_unnest("WgsContigMeanCovFile") |> - arrange(desc("umccrid")) +```{r} +#| label: contig_cvg +#| fig-height: 5 +#| eval: false +#| results: asis +# TODO: FIXME +d1 <- dr_unnest("contigmeancov_wgs") |> + arrange(desc("umccrId")) for (type1 in sort(unique(d1$type), decreasing = FALSE)) { cat(glue("\n\n### {type1} {{.tabset .tabset-pills}}"), "\n\n") d1_type <- d1 |> filter(type == type1) - for (s in sort(unique(d1_type$umccrid), decreasing = TRUE)) { + for (s in sort(unique(d1_type$umccrId), decreasing = TRUE)) { p1 <- d1_type |> - filter(umccrid == s) |> + filter(umccrId == s) |> dracarys::WgsContigMeanCovFile$public_methods$plot() + ggplot2::labs(subtitle = s) cat(glue("\n#### {s}"), "\n") @@ -618,13 +651,16 @@ for (type1 in sort(unique(d1$type), decreasing = FALSE)) { - Only for WGS. 
-```{r fraglenhist_plot, eval=eval$FragmentLengthHistFile, fig.height=8} -fl1 <- dr_unnest("FragmentLengthHistFile") +```{r} +#| label: fraglenhist_plot +#| fig-height: 8 + +fl1 <- dr_unnest("fraglen") min_count <- 10 flp <- fl1 |> filter(.data$count >= min_count) |> ggplot(aes(x = .data$fragmentLength, y = .data$count)) + - geom_line(aes(colour = umccrid)) + + geom_line(aes(colour = umccrId)) + labs(title = "Fragment Length Distribution") + xlab("Fragment Length (bp)") + ylab(glue("Read Count (min: {min_count})")) + @@ -643,21 +679,25 @@ plotly::ggplotly(flp) - Only for WGS. ```{r pe, eval=T, fig.height=5} +#| label: pe +#| eval: FALSE +#| fig-height: 5 + chrom_levels <- c(1:22, "x", "y") d_pl_plot_data <- d_pl |> select( - umccrid, phenotype, type, + umccrId, phenotype, type, contains("div_auto_median") ) |> - tidyr::pivot_longer(-c("umccrid", "phenotype", "type")) |> + tidyr::pivot_longer(-c("umccrId", "phenotype", "type")) |> tidyr::separate_wider_delim("name", delim = "_", names = c("cov", "chrom", "rest"), too_many = "merge") |> mutate(chrom = factor(chrom, levels = chrom_levels)) |> - select(umccrid, phenotype, type, chrom, value) + select(umccrId, phenotype, type, chrom, value) d_pl_plot <- d_pl_plot_data |> ggplot(aes(x = chrom, y = value)) + - geom_line(aes(colour = umccrid, group = umccrid), na.rm = TRUE) + - geom_point(aes(colour = umccrid), na.rm = TRUE) + + geom_line(aes(colour = umccrId, group = umccrId), na.rm = TRUE) + + geom_point(aes(colour = umccrId), na.rm = TRUE) + labs(title = "Chromosome Median / Autosomal Median") plotly::ggplotly(d_pl_plot) # d_pl_plot @@ -668,10 +708,14 @@ plotly::ggplotly(d_pl_plot) ## Hist -```{r cvgm, eval=T, fig.height=8, fig.width=12} -d_hist <- dr_unnest("WgsHistFile") +```{r} +#| label: cvgm +#| eval: false +#| fig-height: 8 +#| fig-width: 12 +d_hist <- dr_unnest("hist_wgs") d_hist1 <- d_hist |> - ggplot(aes(x = start, y = pct, colour = umccrid)) + + ggplot(aes(x = start, y = pct, colour = umccrId)) + geom_point() 
+ geom_linerange(aes(xmin = start, xmax = end)) + scale_y_continuous(n.breaks = 10) + @@ -683,7 +727,7 @@ d_hist1 <- d_hist |> subtitle = "e.g. X PCT of bases have coverage between 100 and 500." ) d_hist2 <- d_hist |> - ggplot(aes(x = start, y = cumsum, colour = umccrid)) + + ggplot(aes(x = start, y = cumsum, colour = umccrId)) + geom_point() + geom_line() + scale_x_continuous(n.breaks = 10) + @@ -697,63 +741,15 @@ d_hist2 <- d_hist |> ## FineHist -```{r finehist, eval=F, fig.height=10, fig.width=12} -d_fhist <- dr_unnest("WgsFineHistFile") +```{r} +#| label: finehist +#| eval: false +#| fig-height: 10 +#| fig-width: 12 +d_fhist <- dr_unnest("finehist_wgs") d_fhist |> dracarys::WgsFineHistFile$public_methods$plot(c(0, 150)) + - facet_wrap(~ forcats::fct_rev(umccrid), scales = "free_y") + facet_wrap(~ forcats::fct_rev(umccrId), scales = "free_y") ``` --- - -## Addendum {.tabset .tabset-pills} - -
            - -Details - -### Params - -```{r params_info} -params |> - purrr::modify_if(is.null, \(x) "NULL", .else = as.character) |> - tibble::enframe(name = "Parameter", value = "Value") |> - tidyr::unnest("Value", keep_empty = TRUE) |> - knitr::kable() -``` - -### SessionInfo {.tabset .tabset-pills} - -```{r si_prep} -si <- dracarys:::session_info_tbls() -si_pkg <- si$si_pkg -si_pl <- si$si_pl -``` - -#### Platform - -```{r si_pl} -si_pl |> - knitr::kable() -``` - -#### Packages - -```{r si_pkg} -si_pkg |> - knitr::kable() -``` - -#### SysInfo - -```{r reporter_details, comment = NA} -tibble::tribble( - ~Info, ~Value, - "Node", Sys.info()["nodename"], - "OS", Sys.info()["sysname"], - "User", Sys.info()["user"], -) |> - knitr::kable() -``` - -
            diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R index 559290d..c1977d4 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R @@ -39,7 +39,7 @@ wf2 <- wf1 |> tidyr::unnest(pld_tidy) query_limsrow_libids <- function(libids) { - assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) + stopifnot(!is.null(libids), all(grepl("^L", libids))) libids <- unique(libids) |> paste(collapse = "|") q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") @@ -107,7 +107,7 @@ data_tidy <- wf_lims |> mutate( indir = .data$output_dragenAlignmentOutputUri, outdir = file.path(sub("s3://", "", .data$indir)), - outdir = file.path(normalizePath("~/s3"), .data$outdir) + outdir = fs::as_fs_path(file.path(normalizePath("~/s3"), .data$outdir)) # indir = file.path(outdir, "dracarys_s3_sync"), # for when debugging locally ) |> mutate( From f8be64757fae0a43cc0a60b330cd5e3d5cdc18d1 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 3 Dec 2024 00:11:01 +1100 Subject: [PATCH 31/32] remove trailing comma --- inst/reports/wgts-qc/summary.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 3601013..88ceafd 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -320,7 +320,7 @@ d_cvg <- dr_unnest("covmetrics_wgs") |> reads_aligned_tot, bases_aligned_tot, cvg_gt02 = cov_uniformity_pct_gt02mean_genome, - cvg_gt04 = cov_uniformity_pct_gt04mean_genome, , + cvg_gt04 = cov_uniformity_pct_gt04mean_genome, everything(), -contains("cov_pct_"), -c("libraryId", "tidy_prefix") From 8694edec86db604a62e7edb10612d5fa3afd4b12 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 3 Dec 2024 00:16:04 +1100 Subject: [PATCH 32/32] use purrr instead of stats --- DESCRIPTION | 1 - R/dragen.R | 4 ++-- 2 files changed, 2 insertions(+), 3 
deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fe296e3..fb57955 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -58,4 +58,3 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.1 -VignetteBuilder: knitr diff --git a/R/dragen.R b/R/dragen.R index 9e80b62..4cba49b 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -791,13 +791,13 @@ dragen_ploidy_estimation_metrics_read <- function(x) { raw <- readr::read_lines(x) assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) fun1 <- function(x) { - setNames( + purrr::set_names( as.character(glue("cov_{tolower(x)}_div_auto_median")), as.character(glue("{x} median / Autosomal median")) ) } fun2 <- function(x) { - setNames( + purrr::set_names( as.character(glue("cov_{tolower(x)}_median")), as.character(glue("{x} median coverage")) )