From 92063e18b0999dc3bce0575ad0666bf5cbb0a299 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 17:33:35 +1100 Subject: [PATCH 01/32] pcgr: use central parser --- NAMESPACE | 3 +- R/pcgr.R | 159 ++++++------------------------------------ R/sash.R | 18 +---- R/umccrise.R | 18 +---- man/PcgrJsonFile.Rd | 93 ------------------------ man/PcgrTiersFile.Rd | 93 ------------------------ man/pcgr_json_read.Rd | 22 ++++++ 7 files changed, 49 insertions(+), 357 deletions(-) delete mode 100644 man/PcgrJsonFile.Rd delete mode 100644 man/PcgrTiersFile.Rd create mode 100644 man/pcgr_json_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 6239a8d..5270441 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,8 +6,6 @@ export(BclconvertReports) export(BclconvertReports375) export(File) export(MultiqcFile) -export(PcgrJsonFile) -export(PcgrTiersFile) export(PloidyEstimationMetricsFile) export(Wf) export(Wf_dragen) @@ -59,6 +57,7 @@ export(multiqc_parse_raw_interop) export(multiqc_parse_xyline_plot) export(multiqc_parse_xyline_plot_contig_cvg) export(multiqc_tidy_json) +export(pcgr_json_read) export(rdf2tab) export(read) export(s3_file_presignedurl) diff --git a/R/pcgr.R b/R/pcgr.R index 193fb77..c69be63 100644 --- a/R/pcgr.R +++ b/R/pcgr.R @@ -1,144 +1,29 @@ -#' PcgrJson R6 Class +#' PCGR JSON Read #' -#' @description -#' Contains methods for reading and displaying contents of the -#' `pcgr.json.gz` file output from PCGR. +#' @param x Path to file. #' -#' @examples -#' \dontrun{ -#' x <- "/path/to/pcgr.json.gz" -#' d <- PcgrJsonFile$new(x) -#' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") -#' } -#' @export -PcgrJsonFile <- R6::R6Class( - "PcgrJsonFile", - inherit = File, - public = list( - #' @description - #' Reads the `pcgr.json.gz` file output from PCGR. - #' - #' @return List of tibbles. 
- read = function() { - x <- self$path - j <- read_jsongz_jsonlite(x) - # l2tib <- function(el) { - # purrr::flatten(el) |> - # dplyr::bind_rows() |> - # dplyr::mutate(dplyr::across(dplyr::everything(), ~ as.character(.))) - # } - # dbrel <- j[["metadata"]][["pcgr_db_release"]] |> - # purrr::map(l2tib) |> - # dplyr::bind_rows(.id = "name_tidy") |> - # dplyr::select("name", "name_tidy", "version", "url", "resource_type") - # handle nulls and rename - see umccr/dracarys#99 - tmb <- - j[["content"]][["tmb"]][["variant_statistic"]] %||% - j[["content"]][["tmb"]][["v_stat"]] %||% - list(tmb_estimate = NA, n_tmb = NA) - tmb <- purrr::flatten(tmb) |> - tibble::as_tibble_row() |> - dplyr::select("tmb_estimate", "n_tmb") - msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] - # handle nulls - msi <- msi %||% list(fracIndels = NA, predicted_class = NA) - msi <- purrr::flatten(msi) |> - tibble::as_tibble_row() |> - dplyr::select("fracIndels", "predicted_class") - metrics <- dplyr::bind_cols(msi, tmb) - list( - # using list in case we want other data as well - metrics = metrics - ) - }, - - #' @description - #' Writes a tidy version of the `pcgr.json.gz` file output from PCGR. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - p <- glue("{prefix}_pcgr") - l <- list( - meta = list( - obj = d[["metrics"]], - pref = glue("{p}_metrics") - ) - ) - purrr::map(l, function(k) { - write_dracarys(obj = k[["obj"]], prefix = k[["pref"]], out_format = out_format) - }) - } - ) -) - -#' PcgrTiersFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of the -#' `pcgr.snvs_indels.tiers.tsv` file output from PCGR. 
+#' @return A tibble with: `fracIndels`, `predicted_class`, `tmb_estimate`, `n_tmb`. #' #' @examples #' \dontrun{ -#' x <- "/path/to/pcgr.snvs_indels.tiers.tsv" -#' d <- PcgrTiersFile$new(x) -#' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +#' pcgr_json_read(x) #' } #' @export -PcgrTiersFile <- R6::R6Class( - "PcgrTiersFile", - inherit = File, - public = list( - #' @description - #' Reads the `pcgr.snvs_indels.tiers.tsv` file output from PCGR. - #' - #' @return List of tibbles. - read = function() { - x <- self$path - ct <- readr::cols( - CHROM = "c", POS = "i", REF = "c", ALT = "c", GENOMIC_CHANGE = "c", - GENOME_VERSION = "c", VCF_SAMPLE_ID = "c", VARIANT_CLASS = "c", - SYMBOL = "c", GENE_NAME = "c", CCDS = "c", CANONICAL = "c", - ENTREZ_ID = "d", UNIPROT_ID = "c", ENSEMBL_TRANSCRIPT_ID = "c", - ENSEMBL_GENE_ID = "c", REFSEQ_MRNA = "c", ONCOSCORE = "d", - ONCOGENE = "l", TUMOR_SUPPRESSOR = "l", ONCOGENE_EVIDENCE = "c", - TUMOR_SUPPRESSOR_EVIDENCE = "c", DISGENET_CUI = "c", - DISGENET_TERMS = "c", CONSEQUENCE = "c", PROTEIN_CHANGE = "c", - PROTEIN_DOMAIN = "c", CODING_STATUS = "c", EXONIC_STATUS = "c", - CDS_CHANGE = "c", HGVSp = "c", HGVSc = "c", EFFECT_PREDICTIONS = "c", - MUTATION_HOTSPOT = "c", MUTATION_HOTSPOT_TRANSCRIPT = "c", - MUTATION_HOTSPOT_CANCERTYPE = "c", PUTATIVE_DRIVER_MUTATION = "l", - CHASMPLUS_DRIVER = "c", CHASMPLUS_TTYPE = "c", VEP_ALL_CSQ = "c", - DBSNPRSID = "c", COSMIC_MUTATION_ID = "c", TCGA_PANCANCER_COUNT = "d", - TCGA_FREQUENCY = "c", ICGC_PCAWG_OCCURRENCE = "c", - CHEMBL_COMPOUND_ID = "c", CHEMBL_COMPOUND_TERMS = "c", - SIMPLEREPEATS_HIT = "l", WINMASKER_HIT = "l", OPENTARGETS_RANK = "d", - CLINVAR = "c", CLINVAR_CLNSIG = "c", GLOBAL_AF_GNOMAD = "d", - GLOBAL_AF_1KG = "d", CALL_CONFIDENCE = "l", DP_TUMOR = "d", - AF_TUMOR = "d", DP_CONTROL = "l", AF_CONTROL = "l", TIER = "c", - TIER_DESCRIPTION = "c" - ) - readr::read_tsv(x, col_types = ct) - }, - - #' 
@description - #' Writes a tidy version of the `pcgr.snvs_indels.tiers.tsv` file output from PCGR. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - prefix2 <- glue("{prefix}_tiers") - write_dracarys(obj = d, prefix = prefix2, out_format = out_format) - } - ) -) +pcgr_json_read <- function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) +} diff --git a/R/sash.R b/R/sash.R index 345fc2b..9fc2166 100644 --- a/R/sash.R +++ b/R/sash.R @@ -99,22 +99,8 @@ Wf_sash <- R6::R6Class( #' @description Read `pcgr.json.gz` file. #' @param x Path to file. 
read_pcgr_json = function(x) { - j <- read_jsongz_jsonlite(x) - tmb <- - j[["content"]][["tmb"]][["variant_statistic"]] %||% - j[["content"]][["tmb"]][["v_stat"]] %||% - list(tmb_estimate = NA, n_tmb = NA) - tmb <- purrr::flatten(tmb) |> - tibble::as_tibble_row() |> - dplyr::select("tmb_estimate", "n_tmb") - msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] - # handle nulls - msi <- msi %||% list(fracIndels = NA, predicted_class = NA) - msi <- purrr::flatten(msi) |> - tibble::as_tibble_row() |> - dplyr::select("fracIndels", "predicted_class") - metrics <- dplyr::bind_cols(msi, tmb) - return(metrics) + dat <- pcgr_json_read(x) + tibble::tibble(name = "pcgrjson", data = list(dat)) }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. diff --git a/R/umccrise.R b/R/umccrise.R index efcd5fe..c097a8f 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -101,22 +101,8 @@ Wf_umccrise <- R6::R6Class( #' @description Read `pcgr.json.gz` file. #' @param x Path to file. read_pcgr_json = function(x) { - j <- read_jsongz_jsonlite(x) - tmb <- - j[["content"]][["tmb"]][["variant_statistic"]] %||% - j[["content"]][["tmb"]][["v_stat"]] %||% - list(tmb_estimate = NA, n_tmb = NA) - tmb <- purrr::flatten(tmb) |> - tibble::as_tibble_row() |> - dplyr::select("tmb_estimate", "n_tmb") - msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] - # handle nulls - msi <- msi %||% list(fracIndels = NA, predicted_class = NA) - msi <- purrr::flatten(msi) |> - tibble::as_tibble_row() |> - dplyr::select("fracIndels", "predicted_class") - metrics <- dplyr::bind_cols(msi, tmb) - return(metrics) + dat <- pcgr_json_read(x) + tibble::tibble(name = "pcgrjson", data = list(dat)) }, #' @description Read `chord.tsv.gz` cancer report file. #' @param x Path to file. 
diff --git a/man/PcgrJsonFile.Rd b/man/PcgrJsonFile.Rd deleted file mode 100644 index a554687..0000000 --- a/man/PcgrJsonFile.Rd +++ /dev/null @@ -1,93 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pcgr.R -\name{PcgrJsonFile} -\alias{PcgrJsonFile} -\title{PcgrJson R6 Class} -\description{ -Contains methods for reading and displaying contents of the -\code{pcgr.json.gz} file output from PCGR. -} -\examples{ -\dontrun{ -x <- "/path/to/pcgr.json.gz" -d <- PcgrJsonFile$new(x) -d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") -} -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{PcgrJsonFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-PcgrJsonFile-read}{\code{PcgrJsonFile$read()}} -\item \href{#method-PcgrJsonFile-write}{\code{PcgrJsonFile$write()}} -\item \href{#method-PcgrJsonFile-clone}{\code{PcgrJsonFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrJsonFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{pcgr.json.gz} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrJsonFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -List of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrJsonFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{pcgr.json.gz} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrJsonFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrJsonFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrJsonFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/PcgrTiersFile.Rd b/man/PcgrTiersFile.Rd deleted file mode 100644 index a0dc272..0000000 --- a/man/PcgrTiersFile.Rd +++ /dev/null @@ -1,93 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pcgr.R -\name{PcgrTiersFile} -\alias{PcgrTiersFile} -\title{PcgrTiersFile R6 Class} -\description{ -Contains methods for reading and displaying contents of the -\code{pcgr.snvs_indels.tiers.tsv} file output from PCGR. -} -\examples{ -\dontrun{ -x <- "/path/to/pcgr.snvs_indels.tiers.tsv" -d <- PcgrTiersFile$new(x) -d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") -} -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{PcgrTiersFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-PcgrTiersFile-read}{\code{PcgrTiersFile$read()}} -\item \href{#method-PcgrTiersFile-write}{\code{PcgrTiersFile$write()}} -\item \href{#method-PcgrTiersFile-clone}{\code{PcgrTiersFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrTiersFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{pcgr.snvs_indels.tiers.tsv} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrTiersFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -List of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrTiersFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{pcgr.snvs_indels.tiers.tsv} file output from PCGR. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrTiersFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PcgrTiersFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{PcgrTiersFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/pcgr_json_read.Rd b/man/pcgr_json_read.Rd new file mode 100644 index 0000000..edd1436 --- /dev/null +++ b/man/pcgr_json_read.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pcgr.R +\name{pcgr_json_read} +\alias{pcgr_json_read} +\title{PCGR JSON Read} +\usage{ +pcgr_json_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +A tibble with: \code{fracIndels}, \code{predicted_class}, \code{tmb_estimate}, \code{n_tmb}. +} +\description{ +PCGR JSON Read +} +\examples{ +\dontrun{ +pcgr_json_read(x) +} +} From cae0e6e5933e7087bffa55ec181b9620b66d64a0 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 18:47:26 +1100 Subject: [PATCH 02/32] dragen_subprefix tests --- R/dragen.R | 26 +++++++++++++++--- man/dragen_subprefix.Rd | 27 +++++++++++++++++++ .../test-roxytest-testexamples-dragen.R | 17 ++++++++++++ 3 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 man/dragen_subprefix.Rd create mode 100644 tests/testthat/test-roxytest-testexamples-dragen.R diff --git a/R/dragen.R b/R/dragen.R index 2e6cbbd..d481e82 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -853,13 +853,33 @@ PloidyEstimationMetricsFile <- R6::R6Class( ) ) +#' DRAGEN File Subprefix +#' +#' Extracts a file subprefix for better table naming. +#' +#' @param x File name. +#' @param suffix Suffix to remove. +#' +#' @return Clean string. 
+#' +#' @examples +#' x1 <- "L2401290.exon_contig_mean_cov.csv" +#' x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" +#' x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" +#' (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) +#' (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) +#' (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) +#' @testexamples +#' expect_equal(s1, "exon") +#' expect_equal(s2, "tmb") +#' expect_equal(s3, "bar") dragen_subprefix <- function(x, suffix) { # L2401290.exon_contig_mean_cov.csv -> exon # L2401290.target_bed_contig_mean_cov.csv -> target_bed # L2401290.tmb_contig_mean_cov.csv -> tmb # L2401290.wgs_contig_mean_cov.csv -> wgs + # capture the substring between the first dot and the next dot. bname <- basename(x) - s1 <- tools::file_path_sans_ext(bname) - s2 <- sub(".*\\.(.*)", "\\1", s1) - sub(suffix, "", s2) + s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov + sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" } diff --git a/man/dragen_subprefix.Rd b/man/dragen_subprefix.Rd new file mode 100644 index 0000000..6a25ce2 --- /dev/null +++ b/man/dragen_subprefix.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_subprefix} +\alias{dragen_subprefix} +\title{DRAGEN File Subprefix} +\usage{ +dragen_subprefix(x, suffix) +} +\arguments{ +\item{x}{File name.} + +\item{suffix}{Suffix to remove.} +} +\value{ +Clean string. +} +\description{ +Extracts a file subprefix for better table naming. 
+} +\examples{ +x1 <- "L2401290.exon_contig_mean_cov.csv" +x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" +x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" +(s1 <- dragen_subprefix(x1, "_contig_mean_cov")) +(s2 <- dragen_subprefix(x2, "_contig_mean_cov")) +(s3 <- dragen_subprefix(x3, "_contig_mean_cov")) +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R new file mode 100644 index 0000000..7305361 --- /dev/null +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -0,0 +1,17 @@ +# Generated by roxytest: do not edit by hand! + +# File R/dragen.R: @testexamples + +test_that("Function dragen_subprefix() @ L876", { + + x1 <- "L2401290.exon_contig_mean_cov.csv" + x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" + x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" + (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) + (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) + (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) + expect_equal(s1, "exon") + expect_equal(s2, "tmb") + expect_equal(s3, "bar") +}) + From ab4680c1261f6aaaa0c98ba780477ff3530d362e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 19:21:51 +1100 Subject: [PATCH 03/32] sigs: use single parser --- R/sash.R | 68 +++++++++++++++++++++--------------------- man/Wf_sash.Rd | 80 +++----------------------------------------------- 2 files changed, 37 insertions(+), 111 deletions(-) diff --git a/R/sash.R b/R/sash.R index 9fc2166..19f677e 100644 --- a/R/sash.R +++ b/R/sash.R @@ -7,11 +7,11 @@ #' \dontrun{ #' #' #---- Local ----# -#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" +#' p2 <- "202408309698c304/L2300777_L2300776" #' p <- normalizePath(file.path(p1, p2)) -#' SubjectID <- "SBJ05571" -#' SampleID_tumor <- "MDX240307" +#' SubjectID <- "SBJ03324" +#' SampleID_tumor <- "PRJ230432" #' prefix <- 
glue("{SubjectID}__{SampleID_tumor}") #' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) #' s1$list_files(max_files = 20) @@ -69,10 +69,10 @@ Wf_sash <- R6::R6Class( glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json" ) |> @@ -100,13 +100,14 @@ Wf_sash <- R6::R6Class( #' @param x Path to file. read_pcgr_json = function(x) { dat <- pcgr_json_read(x) - tibble::tibble(name = "pcgrjson", data = list(dat)) + tibble::tibble(name = "pcgrjson", data = list(dat[])) }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. read_hrd_dragen = function(x) { ct <- readr::cols(.default = "d", Sample = "c") - read_tsvgz(x, col_types = ct) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = "hrddragen", data = list(dat[])) }, #' @description Read `chord.tsv.gz` cancer report hrd file. #' @param x Path to file. 
@@ -118,7 +119,8 @@ Wf_sash <- R6::R6Class( p_BRCA1 = "d", p_BRCA2 = "d" ) - read_tsvgz(x, col_types = ct) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = "hrdchord", data = list(dat[])) }, #' @description Read `hrdetect.tsv.gz` cancer report hrd file. #' @param x Path to file. @@ -127,43 +129,26 @@ Wf_sash <- R6::R6Class( .default = "d", sample = "c" ) - read_tsvgz(x, col_types = ct) |> + dat <- read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) + tibble::tibble(name = "hrdhrdetect", data = list(dat[])) }, #' @description Read signature cancer report file. #' @param x Path to file. read_sigstsv = function(x) { + suffix <- private$sigs_suffix(x) ct <- readr::cols( .default = "d", Signature = "c" ) - read_tsvgz(x, col_types = ct) - }, - #' @description Read `snv_2015.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2015 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `snv_2020.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2020 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `dbs.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_dbs = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `indel.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_indel = function(x) { - self$read_sigstsv(x) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = glue("sigs_{suffix}"), data = list(dat[])) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. 
read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) - d |> + dat <- d |> dplyr::select("variable", "value") |> tidyr::pivot_wider(names_from = "variable", values_from = "value") |> dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> @@ -187,8 +172,21 @@ Wf_sash <- R6::R6Class( wgd_hmf = "WGD", "hypermutated" ) + tibble::tibble(name = glue("qcsum"), data = list(dat[])) } - ) # end public + ), # end public + private = list( + sigs_suffix = function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + ) ) #' sash Download Tidy and Write diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 1caa51a..f236de8 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -10,11 +10,11 @@ Reads and writes tidy versions of files from the \code{sash} workflow \dontrun{ #---- Local ----# -p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -p2 <- "202408270b93455e/L2401308_L2401307" +p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" +p2 <- "202408309698c304/L2300777_L2300776" p <- normalizePath(file.path(p1, p2)) -SubjectID <- "SBJ05571" -SampleID_tumor <- "MDX240307" +SubjectID <- "SBJ03324" +SampleID_tumor <- "PRJ230432" prefix <- glue("{SubjectID}__{SampleID_tumor}") s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) s1$list_files(max_files = 20) @@ -72,10 +72,6 @@ d_write <- s1$write( \item \href{#method-Wf_sash-read_hrd_chord}{\code{Wf_sash$read_hrd_chord()}} \item \href{#method-Wf_sash-read_hrd_hrdetect}{\code{Wf_sash$read_hrd_hrdetect()}} \item \href{#method-Wf_sash-read_sigstsv}{\code{Wf_sash$read_sigstsv()}} -\item \href{#method-Wf_sash-read_sigs_snv2015}{\code{Wf_sash$read_sigs_snv2015()}} -\item \href{#method-Wf_sash-read_sigs_snv2020}{\code{Wf_sash$read_sigs_snv2020()}} -\item 
\href{#method-Wf_sash-read_sigs_dbs}{\code{Wf_sash$read_sigs_dbs()}} -\item \href{#method-Wf_sash-read_sigs_indel}{\code{Wf_sash$read_sigs_indel()}} \item \href{#method-Wf_sash-read_qcsum}{\code{Wf_sash$read_qcsum()}} \item \href{#method-Wf_sash-clone}{\code{Wf_sash$clone()}} } @@ -207,74 +203,6 @@ Read signature cancer report file. \if{html}{\out{
}}\preformatted{Wf_sash$read_sigstsv(x)}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_snv2015}{}}} -\subsection{Method \code{read_sigs_snv2015()}}{ -Read \code{snv_2015.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_snv2015(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_snv2020}{}}} -\subsection{Method \code{read_sigs_snv2020()}}{ -Read \code{snv_2020.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_snv2020(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_dbs}{}}} -\subsection{Method \code{read_sigs_dbs()}}{ -Read \code{dbs.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_dbs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_indel}{}}} -\subsection{Method \code{read_sigs_indel()}}{ -Read \code{indel.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_indel(x)}\if{html}{\out{
}} -} - \subsection{Arguments}{ \if{html}{\out{
}} \describe{ From 1ea69787ba13e6dfa25b53c12eff06a226c1c4b8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 19 Oct 2024 23:55:59 +1100 Subject: [PATCH 04/32] sigs: use single parser --- R/umccrise.R | 58 ++++++++++++++++++------------------- man/Wf_umccrise.Rd | 72 ---------------------------------------------- 2 files changed, 28 insertions(+), 102 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index c097a8f..e372b75 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -70,10 +70,10 @@ Wf_umccrise <- R6::R6Class( ~regex, ~fun, glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpairmultiqc", glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgr_json" @@ -114,7 +114,8 @@ Wf_umccrise <- R6::R6Class( p_BRCA1 = "d", p_BRCA2 = "d" ) - read_tsvgz(x, col_types = ct) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = "hrdchord", data = list(dat[])) }, #' @description Read `hrdetect.tsv.gz` cancer report file. #' @param x Path to file. 
@@ -123,43 +124,26 @@ Wf_umccrise <- R6::R6Class( .default = "d", sample = "c" ) - read_tsvgz(x, col_types = ct) |> + dat <- read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) + tibble::tibble(name = "hrdhrdetect", data = list(dat[])) }, #' @description Read signature cancer report file. #' @param x Path to file. read_sigstsv = function(x) { + suffix <- private$sigs_suffix(x) ct <- readr::cols( .default = "d", Signature = "c" ) - read_tsvgz(x, col_types = ct) - }, - #' @description Read `snv_2015.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2015 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `snv_2020.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_snv2020 = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `dbs.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_dbs = function(x) { - self$read_sigstsv(x) - }, - #' @description Read `indel.tsv.gz` sigs cancer report file. - #' @param x Path to file. - read_sigs_indel = function(x) { - self$read_sigstsv(x) + dat <- read_tsvgz(x, col_types = ct) + tibble::tibble(name = glue("sigs_{suffix}"), data = list(dat[])) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) - d |> + dat <- d |> dplyr::select("variable", "value") |> tidyr::pivot_wider(names_from = "variable", values_from = "value") |> dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> @@ -188,6 +172,7 @@ Wf_umccrise <- R6::R6Class( wgd_hmf = "WGD", "hypermutated", "bpi_enabled" ) + tibble::tibble(name = glue("qcsum"), data = list(dat[])) }, #' @description Read multiqc_conpair.txt file. #' @param x Path to file. 
@@ -216,12 +201,25 @@ Wf_umccrise <- R6::R6Class( } d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) assertthat::assert_that(all(colnames(d1) == cnames$old)) - d1 |> + dat <- d1 |> dplyr::filter(!.data$Sample %in% um_ref_samples) |> dplyr::relocate("contamination", .after = "Sample") |> rlang::set_names(cnames$new) + tibble::tibble(name = glue("conpair"), data = list(dat[])) } - ) # end public + ), # end public + private = list( + sigs_suffix = function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + ) ) #' umccrise Download Tidy and Write diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 131a62e..76a6b1e 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -73,10 +73,6 @@ d_write <- um2$write( \item \href{#method-Wf_umccrise-read_hrd_chord}{\code{Wf_umccrise$read_hrd_chord()}} \item \href{#method-Wf_umccrise-read_hrd_hrdetect}{\code{Wf_umccrise$read_hrd_hrdetect()}} \item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} -\item \href{#method-Wf_umccrise-read_sigs_snv2015}{\code{Wf_umccrise$read_sigs_snv2015()}} -\item \href{#method-Wf_umccrise-read_sigs_snv2020}{\code{Wf_umccrise$read_sigs_snv2020()}} -\item \href{#method-Wf_umccrise-read_sigs_dbs}{\code{Wf_umccrise$read_sigs_dbs()}} -\item \href{#method-Wf_umccrise-read_sigs_indel}{\code{Wf_umccrise$read_sigs_indel()}} \item \href{#method-Wf_umccrise-read_qcsum}{\code{Wf_umccrise$read_qcsum()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} @@ -192,74 +188,6 @@ Read signature cancer report file. \if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_snv2015}{}}} -\subsection{Method \code{read_sigs_snv2015()}}{ -Read \code{snv_2015.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_snv2015(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_snv2020}{}}} -\subsection{Method \code{read_sigs_snv2020()}}{ -Read \code{snv_2020.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_snv2020(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_dbs}{}}} -\subsection{Method \code{read_sigs_dbs()}}{ -Read \code{dbs.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_dbs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Path to file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_indel}{}}} -\subsection{Method \code{read_sigs_indel()}}{ -Read \code{indel.tsv.gz} sigs cancer report file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_indel(x)}\if{html}{\out{
}} -} - \subsection{Arguments}{ \if{html}{\out{
}} \describe{ From 4297519d3d1fe8a85d52ea680a2f02d239f21818 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 20 Oct 2024 00:00:53 +1100 Subject: [PATCH 05/32] make dragen_subprefix private --- R/dragen.R | 31 ------------------- R/tso_dragen.R | 23 +++++++++----- man/dragen_subprefix.Rd | 27 ---------------- .../test-roxytest-testexamples-dragen.R | 17 ---------- 4 files changed, 16 insertions(+), 82 deletions(-) delete mode 100644 man/dragen_subprefix.Rd delete mode 100644 tests/testthat/test-roxytest-testexamples-dragen.R diff --git a/R/dragen.R b/R/dragen.R index d481e82..b198cb8 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -852,34 +852,3 @@ PloidyEstimationMetricsFile <- R6::R6Class( } ) ) - -#' DRAGEN File Subprefix -#' -#' Extracts a file subprefix for better table naming. -#' -#' @param x File name. -#' @param suffix Suffix to remove. -#' -#' @return Clean string. -#' -#' @examples -#' x1 <- "L2401290.exon_contig_mean_cov.csv" -#' x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" -#' x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" -#' (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) -#' (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) -#' (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) -#' @testexamples -#' expect_equal(s1, "exon") -#' expect_equal(s2, "tmb") -#' expect_equal(s3, "bar") -dragen_subprefix <- function(x, suffix) { - # L2401290.exon_contig_mean_cov.csv -> exon - # L2401290.target_bed_contig_mean_cov.csv -> target_bed - # L2401290.tmb_contig_mean_cov.csv -> tmb - # L2401290.wgs_contig_mean_cov.csv -> wgs - # capture the substring between the first dot and the next dot. - bname <- basename(x) - s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov - sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" -} diff --git a/R/tso_dragen.R b/R/tso_dragen.R index dc35091..6df67d9 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -117,7 +117,7 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. #' @param keep_alt Keep ALT contigs. 
read_contigMeanCov = function(x, keep_alt = FALSE) { - subprefix <- dragen_subprefix(x, "_contig_mean_cov") + subprefix <- private$dragen_subprefix(x, "_contig_mean_cov") dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> dplyr::filter( if (!keep_alt) { @@ -131,14 +131,14 @@ Wf_dragen <- R6::R6Class( #' @description Read `coverage_metrics.csv` file. #' @param x Path to file. read_coverageMetrics = function(x) { - subprefix <- dragen_subprefix(x, "_coverage_metrics") + subprefix <- private$dragen_subprefix(x, "_coverage_metrics") dat <- dragen_coverage_metrics_read(x) tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) }, #' @description Read `fine_hist.csv` file. #' @param x Path to file. read_fineHist = function(x) { - subprefix <- dragen_subprefix(x, "_fine_hist") + subprefix <- private$dragen_subprefix(x, "_fine_hist") d <- readr::read_csv(x, col_types = "cd") assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) # there's a max Depth of 2000+, so convert to numeric for easier plotting @@ -175,7 +175,7 @@ Wf_dragen <- R6::R6Class( #' @description Read `hist.csv` (not `fine_hist.csv`!) file. #' @param x Path to file. read_hist = function(x) { - subprefix <- dragen_subprefix(x, "_hist") + subprefix <- private$dragen_subprefix(x, "_hist") d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") dat <- d |> dplyr::mutate( @@ -197,7 +197,9 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. 
read_timeMetrics = function(x) { cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") - ct <- readr::cols(.default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d") + ct <- readr::cols( + .default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d" + ) d <- readr::read_csv(x, col_names = cn, col_types = ct) assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) assertthat::assert_that(inherits(d$time_hrs, "hms")) @@ -215,7 +217,7 @@ Wf_dragen <- R6::R6Class( #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. #' @param x Path to file. read_vcMetrics = function(x) { - subprefix <- dragen_subprefix(x, "_metrics") + subprefix <- private$dragen_subprefix(x, "_metrics") dat <- dragen_vc_metrics_read(x) tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) }, @@ -268,5 +270,12 @@ Wf_dragen <- R6::R6Class( dplyr::rename(Chromosome = "#Chromosome") tibble::tibble(name = "msidiffs", data = list(dat[])) } - ) # end public + ), # end public + private = list( + dragen_subprefix = function(x, suffix) { + bname <- basename(x) + s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov + sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" + } + ) ) # end Wf_dragen diff --git a/man/dragen_subprefix.Rd b/man/dragen_subprefix.Rd deleted file mode 100644 index 6a25ce2..0000000 --- a/man/dragen_subprefix.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{dragen_subprefix} -\alias{dragen_subprefix} -\title{DRAGEN File Subprefix} -\usage{ -dragen_subprefix(x, suffix) -} -\arguments{ -\item{x}{File name.} - -\item{suffix}{Suffix to remove.} -} -\value{ -Clean string. -} -\description{ -Extracts a file subprefix for better table naming. 
-} -\examples{ -x1 <- "L2401290.exon_contig_mean_cov.csv" -x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" -x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" -(s1 <- dragen_subprefix(x1, "_contig_mean_cov")) -(s2 <- dragen_subprefix(x2, "_contig_mean_cov")) -(s3 <- dragen_subprefix(x3, "_contig_mean_cov")) -} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R deleted file mode 100644 index 7305361..0000000 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ /dev/null @@ -1,17 +0,0 @@ -# Generated by roxytest: do not edit by hand! - -# File R/dragen.R: @testexamples - -test_that("Function dragen_subprefix() @ L876", { - - x1 <- "L2401290.exon_contig_mean_cov.csv" - x2 <- "L2401290.tmb_contig_mean_cov.csv.gz" - x3 <- "foo.bar.exon_contig_mean_cov.csv.gz" - (s1 <- dragen_subprefix(x1, "_contig_mean_cov")) - (s2 <- dragen_subprefix(x2, "_contig_mean_cov")) - (s3 <- dragen_subprefix(x3, "_contig_mean_cov")) - expect_equal(s1, "exon") - expect_equal(s2, "tmb") - expect_equal(s3, "bar") -}) - From cb5c41436450b4bacdc46e82740437e4a512fcec Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 20 Oct 2024 00:12:03 +1100 Subject: [PATCH 06/32] sigs: refactor --- R/Wf.R | 3 ++- R/sash.R | 38 ++++++++++++++--------------- R/umccrise.R | 57 ++++++++++++++++++++++--------------------- man/Wf_sash.Rd | 60 +++++++++++++++++++++++----------------------- man/Wf_umccrise.Rd | 60 +++++++++++++++++++++++----------------------- 5 files changed, 108 insertions(+), 110 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 9d35956..089d575 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -199,7 +199,8 @@ Wf <- R6::R6Class( #' @param x Tibble with `localpath` to file and the function `type` to parse it. tidy_files = function(x) { # awesomeness - tidy_files(x, envir = self) + tidy_files(x, envir = self) |> + dplyr::arrange(.data$name) }, #' @description Write tidy data. #' @param x Tibble with tidy `data` list-column. 
diff --git a/R/sash.R b/R/sash.R index 19f677e..725d36c 100644 --- a/R/sash.R +++ b/R/sash.R @@ -98,20 +98,20 @@ Wf_sash <- R6::R6Class( }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgr_json = function(x) { + read_pcgrJson = function(x) { dat <- pcgr_json_read(x) tibble::tibble(name = "pcgrjson", data = list(dat[])) }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrd_dragen = function(x) { + read_hrdDragen = function(x) { ct <- readr::cols(.default = "d", Sample = "c") dat <- read_tsvgz(x, col_types = ct) tibble::tibble(name = "hrddragen", data = list(dat[])) }, #' @description Read `chord.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrd_chord = function(x) { + read_hrdChord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -124,7 +124,7 @@ Wf_sash <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrd_hrdetect = function(x) { + read_hrdHrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -135,8 +135,18 @@ Wf_sash <- R6::R6Class( }, #' @description Read signature cancer report file. #' @param x Path to file. - read_sigstsv = function(x) { - suffix <- private$sigs_suffix(x) + read_sigsTsv = function(x) { + .sigsSuffix <- function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + suffix <- .sigsSuffix(x) ct <- readr::cols( .default = "d", Signature = "c" @@ -146,7 +156,7 @@ Wf_sash <- R6::R6Class( }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. 
- read_qcsum = function(x) { + read_qcSum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) dat <- d |> dplyr::select("variable", "value") |> @@ -174,19 +184,7 @@ Wf_sash <- R6::R6Class( ) tibble::tibble(name = glue("qcsum"), data = list(dat[])) } - ), # end public - private = list( - sigs_suffix = function(x) { - x <- basename(x) - dplyr::case_when( - grepl("-dbs", x) ~ "dbs", - grepl("-indel", x) ~ "ind", - grepl("-snv_2015", x) ~ "snv2015", - grepl("-snv_2020", x) ~ "snv2020", - .default = "" - ) - } - ) + ) # end public ) #' sash Download Tidy and Write diff --git a/R/umccrise.R b/R/umccrise.R index e372b75..0c0c296 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -68,15 +68,15 @@ Wf_umccrise <- R6::R6Class( crep <- "cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", - glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", - glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpairmultiqc", - glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgr_json" + glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", + glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", + glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", + glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), 
"conpair", + glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -100,13 +100,13 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgr_json = function(x) { + read_pcgrJson = function(x) { dat <- pcgr_json_read(x) tibble::tibble(name = "pcgrjson", data = list(dat)) }, #' @description Read `chord.tsv.gz` cancer report file. #' @param x Path to file. - read_hrd_chord = function(x) { + read_hrdChord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -119,7 +119,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report file. #' @param x Path to file. - read_hrd_hrdetect = function(x) { + read_hrdHrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -130,8 +130,19 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read signature cancer report file. #' @param x Path to file. - read_sigstsv = function(x) { - suffix <- private$sigs_suffix(x) + read_sigsTsv = function(x) { + .sigsSuffix <- function(x) { + x <- basename(x) + dplyr::case_when( + grepl("-dbs", x) ~ "dbs", + grepl("-indel", x) ~ "ind", + grepl("-snv_2015", x) ~ "snv2015", + grepl("-snv_2020", x) ~ "snv2020", + .default = "" + ) + } + + suffix <- .sigsSuffix(x) ct <- readr::cols( .default = "d", Signature = "c" @@ -141,7 +152,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. - read_qcsum = function(x) { + read_qcSum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) dat <- d |> dplyr::select("variable", "value") |> @@ -176,7 +187,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read multiqc_conpair.txt file. #' @param x Path to file. 
- read_conpairmultiqc = function(x) { + read_conpair = function(x) { um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) cnames <- list( @@ -207,19 +218,7 @@ Wf_umccrise <- R6::R6Class( rlang::set_names(cnames$new) tibble::tibble(name = glue("conpair"), data = list(dat[])) } - ), # end public - private = list( - sigs_suffix = function(x) { - x <- basename(x) - dplyr::case_when( - grepl("-dbs", x) ~ "dbs", - grepl("-indel", x) ~ "ind", - grepl("-snv_2015", x) ~ "snv2015", - grepl("-snv_2020", x) ~ "snv2020", - .default = "" - ) - } - ) + ) # end public ) #' umccrise Download Tidy and Write diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index f236de8..1b7c81f 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -67,12 +67,12 @@ d_write <- s1$write( \itemize{ \item \href{#method-Wf_sash-new}{\code{Wf_sash$new()}} \item \href{#method-Wf_sash-print}{\code{Wf_sash$print()}} -\item \href{#method-Wf_sash-read_pcgr_json}{\code{Wf_sash$read_pcgr_json()}} -\item \href{#method-Wf_sash-read_hrd_dragen}{\code{Wf_sash$read_hrd_dragen()}} -\item \href{#method-Wf_sash-read_hrd_chord}{\code{Wf_sash$read_hrd_chord()}} -\item \href{#method-Wf_sash-read_hrd_hrdetect}{\code{Wf_sash$read_hrd_hrdetect()}} -\item \href{#method-Wf_sash-read_sigstsv}{\code{Wf_sash$read_sigstsv()}} -\item \href{#method-Wf_sash-read_qcsum}{\code{Wf_sash$read_qcsum()}} +\item \href{#method-Wf_sash-read_pcgrJson}{\code{Wf_sash$read_pcgrJson()}} +\item \href{#method-Wf_sash-read_hrdDragen}{\code{Wf_sash$read_hrdDragen()}} +\item \href{#method-Wf_sash-read_hrdChord}{\code{Wf_sash$read_hrdChord()}} +\item \href{#method-Wf_sash-read_hrdHrdetect}{\code{Wf_sash$read_hrdHrdetect()}} +\item \href{#method-Wf_sash-read_sigsTsv}{\code{Wf_sash$read_sigsTsv()}} +\item \href{#method-Wf_sash-read_qcSum}{\code{Wf_sash$read_qcSum()}} \item \href{#method-Wf_sash-clone}{\code{Wf_sash$clone()}} } } @@ -127,12 +127,12 @@ Print 
details about the Workflow. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgr_json}{}}} -\subsection{Method \code{read_pcgr_json()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgrJson}{}}} +\subsection{Method \code{read_pcgrJson()}}{ Read \code{pcgr.json.gz} file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgr_json(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgrJson(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -144,12 +144,12 @@ Read \code{pcgr.json.gz} file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_dragen}{}}} -\subsection{Method \code{read_hrd_dragen()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdDragen}{}}} +\subsection{Method \code{read_hrdDragen()}}{ Read \code{dragen.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_dragen(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdDragen(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -161,12 +161,12 @@ Read \code{dragen.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_chord}{}}} -\subsection{Method \code{read_hrd_chord()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdChord}{}}} +\subsection{Method \code{read_hrdChord()}}{ Read \code{chord.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_chord(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdChord(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -178,12 +178,12 @@ Read \code{chord.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_hrdetect}{}}} -\subsection{Method \code{read_hrd_hrdetect()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdHrdetect}{}}} +\subsection{Method \code{read_hrdHrdetect()}}{ Read \code{hrdetect.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_hrdetect(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdHrdetect(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -195,12 +195,12 @@ Read \code{hrdetect.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigstsv}{}}} -\subsection{Method \code{read_sigstsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigsTsv}{}}} +\subsection{Method \code{read_sigsTsv()}}{ Read signature cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigstsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigsTsv(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -212,12 +212,12 @@ Read signature cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcsum}{}}} -\subsection{Method \code{read_qcsum()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcSum}{}}} +\subsection{Method \code{read_qcSum()}}{ Read \code{qc_summary.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_qcsum(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_qcSum(x)}\if{html}{\out{
}} } \subsection{Arguments}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 76a6b1e..f9e92e2 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -69,12 +69,12 @@ d_write <- um2$write( \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} \item \href{#method-Wf_umccrise-print}{\code{Wf_umccrise$print()}} -\item \href{#method-Wf_umccrise-read_pcgr_json}{\code{Wf_umccrise$read_pcgr_json()}} -\item \href{#method-Wf_umccrise-read_hrd_chord}{\code{Wf_umccrise$read_hrd_chord()}} -\item \href{#method-Wf_umccrise-read_hrd_hrdetect}{\code{Wf_umccrise$read_hrd_hrdetect()}} -\item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} -\item \href{#method-Wf_umccrise-read_qcsum}{\code{Wf_umccrise$read_qcsum()}} -\item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} +\item \href{#method-Wf_umccrise-read_pcgrJson}{\code{Wf_umccrise$read_pcgrJson()}} +\item \href{#method-Wf_umccrise-read_hrdChord}{\code{Wf_umccrise$read_hrdChord()}} +\item \href{#method-Wf_umccrise-read_hrdHrdetect}{\code{Wf_umccrise$read_hrdHrdetect()}} +\item \href{#method-Wf_umccrise-read_sigsTsv}{\code{Wf_umccrise$read_sigsTsv()}} +\item \href{#method-Wf_umccrise-read_qcSum}{\code{Wf_umccrise$read_qcSum()}} +\item \href{#method-Wf_umccrise-read_conpair}{\code{Wf_umccrise$read_conpair()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } } @@ -129,12 +129,12 @@ Print details about the Workflow. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgr_json}{}}} -\subsection{Method \code{read_pcgr_json()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgrJson}{}}} +\subsection{Method \code{read_pcgrJson()}}{ Read \code{pcgr.json.gz} file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgr_json(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgrJson(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -146,12 +146,12 @@ Read \code{pcgr.json.gz} file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrd_chord}{}}} -\subsection{Method \code{read_hrd_chord()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdChord}{}}} +\subsection{Method \code{read_hrdChord()}}{ Read \code{chord.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrd_chord(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdChord(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -163,12 +163,12 @@ Read \code{chord.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrd_hrdetect}{}}} -\subsection{Method \code{read_hrd_hrdetect()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdHrdetect}{}}} +\subsection{Method \code{read_hrdHrdetect()}}{ Read \code{hrdetect.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrd_hrdetect(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdHrdetect(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -180,12 +180,12 @@ Read \code{hrdetect.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} -\subsection{Method \code{read_sigstsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsTsv}{}}} +\subsection{Method \code{read_sigsTsv()}}{ Read signature cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsTsv(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -197,12 +197,12 @@ Read signature cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsum}{}}} -\subsection{Method \code{read_qcsum()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcSum}{}}} +\subsection{Method \code{read_qcSum()}}{ Read \code{qc_summary.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsum(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcSum(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -214,12 +214,12 @@ Read \code{qc_summary.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpairmultiqc}{}}} -\subsection{Method \code{read_conpairmultiqc()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpair}{}}} +\subsection{Method \code{read_conpair()}}{ Read multiqc_conpair.txt file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_conpairmultiqc(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_conpair(x)}\if{html}{\out{
}} } \subsection{Arguments}{ From eafd6cbf7b726e60ad669cdc6160e3454147b605 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 20 Oct 2024 23:57:48 +1100 Subject: [PATCH 07/32] umccrise: use full path for regexes --- R/umccrise.R | 63 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index 0c0c296..dc4b5d9 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -25,14 +25,18 @@ #' ) #' #' #---- GDS ----# -#' SubjectID <- "SBJ03043" -#' SampleID_tumor <- "PRJ230004" +#' SubjectID <- "SBJ03606" +#' SampleID_tumor <- "PRJ230726" +#' SampleID_normal <- "PRJ230725" #' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' p1_gds <- "gds://production/analysis_data" -#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um2 <- Wf_umccrise$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) #' um2$list_files(max_files = 8) #' um2$list_files_filter_relevant(ica_token = token, max_files = 500) #' d <- um2$download_files( @@ -53,36 +57,51 @@ Wf_umccrise <- R6::R6Class( "Wf_umccrise", inherit = Wf, public = list( - #' @field SubjectID The SubjectID of the sample (needed for path lookup). - #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + #' @field SubjectID The SubjectID of the sample. + #' @field SampleID_tumor The SampleID of the tumor sample. + #' @field SampleID_normal The SampleID of the normal sample. SubjectID = NULL, SampleID_tumor = NULL, + SampleID_normal = NULL, #' @description Create a new Wf_umccrise object. 
#' @param path Path to directory with raw workflow results (from GDS, S3, or
#' local filesystem).
-    #' @param SubjectID The SubjectID of the sample (needed for path lookup).
-    #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup).
-    initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) {
+    #' @param SubjectID The SubjectID of the sample.
+    #' @param SampleID_tumor The SampleID of the tumor sample.
+    #' @param SampleID_normal The SampleID of the normal sample.
+    initialize = function(path = NULL, SubjectID = NULL,
+                          SampleID_tumor = NULL, SampleID_normal = NULL) {
      wname <- "umccrise"
      pref <- glue("{SubjectID}__{SampleID_tumor}")
+      pref_norm <- glue("{SubjectID}__{SampleID_normal}")
      crep <- "cancer_report_tables"
+      smallv <- "small_variants"
      regexes <- tibble::tribble(
        ~regex, ~fun,
-        glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord",
-        glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect",
-        glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv",
-        glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum",
-        glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpair",
-        glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson"
+        glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord",
+        glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv",
+        glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum",
+        
glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpair", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson", + glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.somatic\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY" ) |> - dplyr::mutate(fun = paste0("read_", .data$fun)) - + dplyr::mutate( + fun = paste0("read_", .data$fun), + fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor + self$SampleID_normal <- SampleID_normal }, #' @description Print details about the Workflow. #' @param ... (ignored). 
@@ -93,7 +112,8 @@ Wf_umccrise <- R6::R6Class( "wname", self$wname, "filesystem", self$filesystem, "SubjectID", self$SubjectID, - "SampleID_tumor", self$SampleID_tumor + "SampleID_tumor", self$SampleID_tumor, + "SampleID_normal", self$SampleID_normal ) print(res) invisible(self) @@ -141,7 +161,6 @@ Wf_umccrise <- R6::R6Class( .default = "" ) } - suffix <- .sigsSuffix(x) ct <- readr::cols( .default = "d", From d08942dcbe114aace92e02a50c0d8cc027752800 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 22 Oct 2024 18:13:38 +1100 Subject: [PATCH 08/32] Wf: add active regex bindings + private fields --- R/Wf.R | 78 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 089d575..6fe882d 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -61,16 +61,26 @@ #' @export Wf <- R6::R6Class( "Wf", + private = list( + .path = NULL, + .wname = NULL, + .regexes = NULL, + .filesystem = NULL + ), + active = list( + regexes = function(value) { + if (missing(value)) { + private$.regexes + } else { + assertthat::assert_that( + tibble::is_tibble(value), + all(c("regex", "fun") %in% colnames(value)) + ) + private$.regexes <- value + } + } + ), public = list( - #' @field path Path to directory with raw workflow results (from GDS, S3, or - #' local filesystem). - #' @field wname Name of workflow (e.g. umccrise, sash). - #' @field filesystem Filesystem of `path` (gds/s3/local). - #' @field regexes Tibble with file `regex` and `fun`ction to parse it. - path = NULL, - wname = NULL, - filesystem = NULL, - regexes = NULL, #' @description Create a new Workflow object. #' @param path Path to directory with raw workflow results. #' @param wname Name of workflow. 
@@ -94,23 +104,28 @@ Wf <- R6::R6Class( ) subwnames <- c("dragen") assertthat::assert_that(wname %in% c(wnames, subwnames)) - self$path <- sub("/$", "", path) # remove potential trailing slash - self$wname <- wname - self$filesystem <- dplyr::case_when( + private$.path <- sub("/$", "", path) # remove potential trailing slash + private$.wname <- wname + private$.filesystem <- dplyr::case_when( grepl("^gds://", path) ~ "gds", grepl("^s3://", path) ~ "s3", .default = "local" ) - self$regexes <- regexes + assertthat::assert_that( + tibble::is_tibble(regexes), + all(c("regex", "fun") %in% colnames(regexes)) + ) + private$.regexes <- regexes }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "nregexes", as.character(nrow(private$.regexes)) ) print(res) invisible(self) @@ -120,13 +135,13 @@ Wf <- R6::R6Class( #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to `gds_list_files_dir` function. - list_files = function(path = self$path, max_files = 1000, + list_files = function(path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - if (self$filesystem == "gds") { + if (private$.filesystem == "gds") { d <- gds_list_files_dir( gdsdir = path, token = ica_token, page_size = max_files, ... ) - } else if (self$filesystem == "s3") { + } else if (private$.filesystem == "s3") { d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { d <- local_list_files_dir(localdir = path, max_files = max_files) @@ -139,15 +154,15 @@ Wf <- R6::R6Class( #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... 
Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(path = self$path, max_files = 1000, + list_files_filter_relevant = function(path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - regexes <- self$regexes + regexes <- private$.regexes assertthat::assert_that(!is.null(regexes)) - if (self$filesystem == "gds") { + if (private$.filesystem == "gds") { d <- gds_list_files_filter_relevant( gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... ) - } else if (self$filesystem == "s3") { + } else if (private$.filesystem == "s3") { d <- s3_list_files_filter_relevant( s3dir = path, regexes = regexes, max_objects = max_files, ... ) @@ -167,28 +182,27 @@ Wf <- R6::R6Class( #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). 
- download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + download_files = function(path = private$.path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, recursive = NULL) { - # TODO: add envvar checker - regexes <- self$regexes + regexes <- private$.regexes assertthat::assert_that(!is.null(regexes)) - if (self$filesystem == "gds") { + if (private$.filesystem == "gds") { d <- dr_gds_download( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, page_size = max_files, dryrun = dryrun, recursive = recursive ) if (!dryrun) { - self$filesystem <- "local" - self$path <- outdir + private$.filesystem <- "local" + private$.path <- outdir } - } else if (self$filesystem == "s3") { + } else if (private$.filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, max_objects = max_files, dryrun = dryrun ) if (!dryrun) { - self$filesystem <- "local" - self$path <- outdir + private$.filesystem <- "local" + private$.path <- outdir } } else { d <- self$list_files_filter_relevant(regexes = regexes, max_files = max_files) From 483313ed0b0ec8d54045b87519a45a7b695ebfe4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 22 Oct 2024 18:44:45 +1100 Subject: [PATCH 09/32] update man/ --- man/Wf.Rd | 20 +++----- man/Wf_sash.Rd | 77 +++++++++++++++++++++++------- man/Wf_sash_download_tidy_write.Rd | 10 +++- man/Wf_umccrise.Rd | 31 ++++++++---- 4 files changed, 98 insertions(+), 40 deletions(-) diff --git a/man/Wf.Rd b/man/Wf.Rd index b2c99f3..7e98ebf 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -68,17 +68,11 @@ d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryr } } -\section{Public fields}{ -\if{html}{\out{
}} +\section{Active bindings}{ +\if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or -local filesystem).} - -\item{\code{wname}}{Name of workflow (e.g. umccrise, sash).} - -\item{\code{filesystem}}{Filesystem of \code{path} (gds/s3/local).} - -\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} +\item{\code{regexes}}{Get/Set regexes. Tibble with file \code{regex} and \code{fun}ction +to parse it.} } \if{html}{\out{
}} } @@ -140,7 +134,7 @@ Print details about the Workflow. List all files under given path. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files( - path = self$path, + path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -168,7 +162,7 @@ List all files under given path. List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( - path = self$path, + path = private$.path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -197,7 +191,7 @@ the \code{s3_list_files_filter_relevant} function.} Download files from GDS/S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$download_files( - path = self$path, + path = private$.path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 1b7c81f..1ae526f 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -10,13 +10,27 @@ Reads and writes tidy versions of files from the \code{sash} workflow \dontrun{ #---- Local ----# -p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" -p2 <- "202408309698c304/L2300777_L2300776" +p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +p2 <- "20240830ece6b0b7/L2201449_L2201450" p <- normalizePath(file.path(p1, p2)) -SubjectID <- "SBJ03324" -SampleID_tumor <- "PRJ230432" -prefix <- glue("{SubjectID}__{SampleID_tumor}") -s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +SubjectID <- "SBJ02862" +SampleID_tumor <- "PRJ222112" +SampleID_normal <- "PRJ222114" +prefix <- glue("{SubjectID}_{SampleID_tumor}") +s1 <- Wf_sash$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) +#-- test regexes active binding +counts1 <- glue( + "{p}/{prefix}/smlv_somatic/report/", + "{SampleID_tumor}\\\\.somatic\\\\.variant_counts_process\\\\.json$" +) +regexes1 <- tibble::tribble( + ~regex, ~fun, + counts1, "read_smlvSomCounts" +) +s1$regexes <- regexes1 s1$list_files(max_files = 20) s1$list_files_filter_relevant(max_files = 300) d <- s1$download_files(max_files = 1000, dryrun = F) @@ -29,13 +43,17 @@ d_write <- s1$write( ) #---- S3 ----# -p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -p2 <- "202408270b93455e/L2401308_L2401307" +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +p2 <- "20240830ece6b0b7/L2201449_L2201450" p <- file.path(p1, p2) -SubjectID <- "SBJ05571" -SampleID_tumor <- "MDX240307" +SubjectID <- "SBJ02862" +SampleID_tumor <- "PRJ222112" +SampleID_normal <- "PRJ222114" prefix <- 
glue("{SubjectID}__{SampleID_tumor}") -s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +s1 <- Wf_sash$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) s1$list_files(max_files = 20) s1$list_files_filter_relevant() outdir <- sub("s3:/", "~/s3", p) @@ -56,9 +74,11 @@ d_write <- s1$write( \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} +\item{\code{SubjectID}}{The SubjectID of the sample.} -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} + +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
}} } @@ -67,6 +87,7 @@ d_write <- s1$write( \itemize{ \item \href{#method-Wf_sash-new}{\code{Wf_sash$new()}} \item \href{#method-Wf_sash-print}{\code{Wf_sash$print()}} +\item \href{#method-Wf_sash-read_smlvSomCounts}{\code{Wf_sash$read_smlvSomCounts()}} \item \href{#method-Wf_sash-read_pcgrJson}{\code{Wf_sash$read_pcgrJson()}} \item \href{#method-Wf_sash-read_hrdDragen}{\code{Wf_sash$read_hrdDragen()}} \item \href{#method-Wf_sash-read_hrdChord}{\code{Wf_sash$read_hrdChord()}} @@ -93,7 +114,12 @@ d_write <- s1$write( \subsection{Method \code{new()}}{ Create a new Wf_sash object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$new( + path = NULL, + SubjectID = NULL, + SampleID_tumor = NULL, + SampleID_normal = NULL +)}\if{html}{\out{
}}
}

\subsection{Arguments}{
@@ -102,9 +128,11 @@ Create a new Wf_sash object.
\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or
local filesystem).}

-\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).}
+\item{\code{SubjectID}}{The SubjectID of the sample.}
+
+\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.}

-\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).}
+\item{\code{SampleID_normal}}{The SampleID of the normal sample.}
}
\if{html}{\out{
}} } @@ -127,6 +155,23 @@ Print details about the Workflow. } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_smlvSomCounts}{}}} +\subsection{Method \code{read_smlvSomCounts()}}{ +Read \code{somatic.variant_counts_process.json} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_smlvSomCounts(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgrJson}{}}} \subsection{Method \code{read_pcgrJson()}}{ diff --git a/man/Wf_sash_download_tidy_write.Rd b/man/Wf_sash_download_tidy_write.Rd index a806fdd..ca0c538 100644 --- a/man/Wf_sash_download_tidy_write.Rd +++ b/man/Wf_sash_download_tidy_write.Rd @@ -8,10 +8,12 @@ Wf_sash_download_tidy_write( path, SubjectID, SampleID_tumor, + SampleID_normal, outdir, format = "rds", max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + regexes = NULL, dryrun = FALSE ) } @@ -19,9 +21,11 @@ Wf_sash_download_tidy_write( \item{path}{Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} +\item{SubjectID}{The SubjectID of the sample.} -\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} +\item{SampleID_tumor}{The SampleID of the tumor sample.} + +\item{SampleID_normal}{The SampleID of the normal sample.} \item{outdir}{Path to output directory.} @@ -31,6 +35,8 @@ local filesystem).} \item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} +\item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it.} + \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} } diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index f9e92e2..d19c3f6 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -28,14 +28,18 @@ d_write <- um1$write( ) #---- GDS ----# -SubjectID <- "SBJ03043" -SampleID_tumor <- "PRJ230004" +SubjectID <- "SBJ03606" +SampleID_tumor <- "PRJ230726" +SampleID_normal <- "PRJ230725" prefix <- glue("{SubjectID}__{SampleID_tumor}") p1_gds <- "gds://production/analysis_data" -p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- 
Sys.getenv("ICA_ACCESS_TOKEN") -um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um2 <- Wf_umccrise$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) um2$list_files(max_files = 8) um2$list_files_filter_relevant(ica_token = token, max_files = 500) d <- um2$download_files( @@ -58,9 +62,11 @@ d_write <- um2$write( \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} +\item{\code{SubjectID}}{The SubjectID of the sample.} -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} + +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
}} } @@ -95,7 +101,12 @@ d_write <- um2$write( \subsection{Method \code{new()}}{ Create a new Wf_umccrise object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$new( + path = NULL, + SubjectID = NULL, + SampleID_tumor = NULL, + SampleID_normal = NULL +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -104,9 +115,11 @@ Create a new Wf_umccrise object. \item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} +\item{\code{SubjectID}}{The SubjectID of the sample.} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
}} } From a30b4ff86ef259546a38e9dc819e9ed9f23a543e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 22 Oct 2024 18:45:58 +1100 Subject: [PATCH 10/32] sash: update regexes --- R/Wf.R | 2 + R/sash.R | 118 ++++++++++++++++++++++++++++++++++++--------------- R/umccrise.R | 4 +- 3 files changed, 87 insertions(+), 37 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 6fe882d..d6539c2 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -68,6 +68,8 @@ Wf <- R6::R6Class( .filesystem = NULL ), active = list( + #' @field regexes Get/Set regexes. Tibble with file `regex` and `fun`ction + #' to parse it. regexes = function(value) { if (missing(value)) { private$.regexes diff --git a/R/sash.R b/R/sash.R index 725d36c..c000da6 100644 --- a/R/sash.R +++ b/R/sash.R @@ -7,13 +7,27 @@ #' \dontrun{ #' #' #---- Local ----# -#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ03324/sash" -#' p2 <- "202408309698c304/L2300777_L2300776" +#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +#' p2 <- "20240830ece6b0b7/L2201449_L2201450" #' p <- normalizePath(file.path(p1, p2)) -#' SubjectID <- "SBJ03324" -#' SampleID_tumor <- "PRJ230432" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") -#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' SubjectID <- "SBJ02862" +#' SampleID_tumor <- "PRJ222112" +#' SampleID_normal <- "PRJ222114" +#' prefix <- glue("{SubjectID}_{SampleID_tumor}") +#' s1 <- Wf_sash$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) +#' #-- test regexes active binding +#' counts1 <- glue( +#' "{p}/{prefix}/smlv_somatic/report/", +#' "{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$" +#' ) +#' regexes1 <- tibble::tribble( +#' ~regex, ~fun, +#' counts1, "read_smlvSomCounts" +#' ) +#' s1$regexes <- regexes1 #' s1$list_files(max_files = 20) #' s1$list_files_filter_relevant(max_files = 300) #' d <- s1$download_files(max_files = 1000, dryrun = F) @@ 
-26,13 +40,17 @@ #' ) #' #' #---- S3 ----# -#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" -#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ02862/sash" +#' p2 <- "20240830ece6b0b7/L2201449_L2201450" #' p <- file.path(p1, p2) -#' SubjectID <- "SBJ05571" -#' SampleID_tumor <- "MDX240307" +#' SubjectID <- "SBJ02862" +#' SampleID_tumor <- "PRJ222112" +#' SampleID_normal <- "PRJ222114" #' prefix <- glue("{SubjectID}__{SampleID_tumor}") -#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' s1 <- Wf_sash$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) #' s1$list_files(max_files = 20) #' s1$list_files_filter_relevant() #' outdir <- sub("s3:/", "~/s3", p) @@ -51,51 +69,74 @@ Wf_sash <- R6::R6Class( "Wf_sash", inherit = Wf, public = list( - #' @field SubjectID The SubjectID of the sample (needed for path lookup). - #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + #' @field SubjectID The SubjectID of the sample. + #' @field SampleID_tumor The SampleID of the tumor sample. + #' @field SampleID_normal The SampleID of the normal sample. SubjectID = NULL, SampleID_tumor = NULL, + SampleID_normal = NULL, #' @description Create a new Wf_sash object. #' @param path Path to directory with raw workflow results (from GDS, S3, or #' local filesystem). - #' @param SubjectID The SubjectID of the sample (needed for path lookup). - #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). - initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { + #' @param SubjectID The SubjectID of the sample. + #' @param SampleID_tumor The SampleID of the tumor sample. + #' @param SampleID_normal The SampleID of the tumor sample. 
+ initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL, + SampleID_normal = NULL) { wname <- "sash" pref <- glue("{SubjectID}_{SampleID_tumor}") crep <- "cancer_report/cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", - glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", - glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen", - glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigstsv", - glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", - glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json" + glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", + glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", + glue("{path}/{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrdDragen", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", + glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", + glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgrJson", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + 
glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi"), "DOWNLOAD_ONLY", ) |> - dplyr::mutate(fun = paste0("read_", .data$fun)) + dplyr::mutate( + fun = paste0("read_", .data$fun), + fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor + self$SampleID_normal <- SampleID_normal }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "nregexes", as.character(nrow(private$.regexes)), "SubjectID", self$SubjectID, - "SampleID_tumor", self$SampleID_tumor + "SampleID_tumor", self$SampleID_tumor, + "SampleID_normal", self$SampleID_normal ) print(res) invisible(self) }, + #' @description Read `somatic.variant_counts_process.json` file. + #' @param x Path to file. + read_smlvSomCounts = function(x) { + dat <- jsonlite::read_json(x) |> + tibble::as_tibble_row() + tibble::tibble(name = "smlvsomcounts", data = list(dat[])) + }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. read_pcgrJson = function(x) { @@ -193,14 +234,17 @@ Wf_sash <- R6::R6Class( #' #' @param path Path to directory with raw workflow results (from GDS, S3, or #' local filesystem). -#' @param SubjectID The SubjectID of the sample (needed for path lookup). -#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param SubjectID The SubjectID of the sample. 
+#' @param SampleID_tumor The SampleID of the tumor sample. +#' @param SampleID_normal The SampleID of the normal sample. #' @param outdir Path to output directory. #' @param format Format of output files. #' @param max_files Max number of files to list. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). +#' @param regexes Tibble with file `regex` and `fun`ction to parse it. +#' #' @return List where each element is a tidy tibble of a sash file. #' #' @examples @@ -218,13 +262,17 @@ Wf_sash <- R6::R6Class( #' ) #' } #' @export -Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, +Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, SampleID_normal, outdir, format = "rds", max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - dryrun = FALSE) { + regexes = NULL, dryrun = FALSE) { s <- Wf_sash$new( - path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + path = path, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) + if (!is.null(regexes)) { + s$regexes <- regexes + } d_dl <- s$download_files( outdir = outdir, ica_token = ica_token, max_files = max_files, dryrun = dryrun diff --git a/R/umccrise.R b/R/umccrise.R index dc4b5d9..79aeb12 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -68,7 +68,7 @@ Wf_umccrise <- R6::R6Class( #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. - #' @field SampleID_normal The SampleID of the normal sample. + #' @param SampleID_normal The SampleID of the normal sample. 
initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL, SampleID_normal = NULL) { wname <- "umccrise" @@ -90,7 +90,7 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.somatic\\.tsv$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY" ) |> From ace1028461d205dd1dfae3ae0ce058e5790082cf Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 24 Oct 2024 09:20:05 +1100 Subject: [PATCH 11/32] fix regexes --- R/sash.R | 11 ++++++----- R/umccrise.R | 23 +++++++++++------------ man/Wf_sash.Rd | 5 ++--- man/Wf_sash_download_tidy_write.Rd | 3 ++- man/Wf_umccrise.Rd | 11 +++++------ 5 files changed, 26 insertions(+), 27 deletions(-) diff --git a/R/sash.R b/R/sash.R index c000da6..c73fb06 100644 --- a/R/sash.R +++ b/R/sash.R @@ -46,7 +46,6 @@ #' SubjectID <- "SBJ02862" #' SampleID_tumor <- "PRJ222112" #' SampleID_normal <- "PRJ222114" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' s1 <- Wf_sash$new( #' path = p, SubjectID = SubjectID, #' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal @@ -100,9 +99,9 @@ Wf_sash <- R6::R6Class( glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgrJson", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - 
glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi"), "DOWNLOAD_ONLY", + # glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", ) |> dplyr::mutate( fun = paste0("read_", .data$fun), @@ -243,7 +242,9 @@ Wf_sash <- R6::R6Class( #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param regexes Tibble with file `regex` and `fun`ction to parse it. +#' @param regexes Tibble with file `regex` and `fun`ction to parse it. Use only +#' if you want to override the default regexes used for this workflow. +#' #' #' @return List where each element is a tidy tibble of a sash file. 
#' diff --git a/R/umccrise.R b/R/umccrise.R index 79aeb12..1ffb223 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -25,12 +25,11 @@ #' ) #' #' #---- GDS ----# -#' SubjectID <- "SBJ03606" -#' SampleID_tumor <- "PRJ230726" -#' SampleID_normal <- "PRJ230725" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' SubjectID <- "SBJ04662" +#' SampleID_tumor <- "PRJ240647" +#' SampleID_normal <- "PRJ240646" #' p1_gds <- "gds://production/analysis_data" -#' p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") +#' p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' um2 <- Wf_umccrise$new( @@ -38,7 +37,7 @@ #' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal #' ) #' um2$list_files(max_files = 8) -#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 1000) #' d <- um2$download_files( #' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = F @@ -73,7 +72,6 @@ Wf_umccrise <- R6::R6Class( SampleID_tumor = NULL, SampleID_normal = NULL) { wname <- "umccrise" pref <- glue("{SubjectID}__{SampleID_tumor}") - pref_norm <- glue("{SubjectID}__{SampleID_normal}") crep <- "cancer_report_tables" smallv <- "small_variants" regexes <- tibble::tribble( @@ -91,8 +89,8 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref_norm}-germline\\.predispose_genes\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY" + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), 
"DOWNLOAD_ONLY", + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", ) |> dplyr::mutate( fun = paste0("read_", .data$fun), @@ -108,9 +106,10 @@ Wf_umccrise <- R6::R6Class( print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "nregexes", as.character(nrow(private$.regexes)), "SubjectID", self$SubjectID, "SampleID_tumor", self$SampleID_tumor, "SampleID_normal", self$SampleID_normal diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 1ae526f..638c69e 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -27,8 +27,8 @@ counts1 <- glue( "{SampleID_tumor}\\\\.somatic\\\\.variant_counts_process\\\\.json$" ) regexes1 <- tibble::tribble( - ~regex, ~fun, - counts1, "read_smlvSomCounts" + ~regex, ~fun, + counts1, "read_smlvSomCounts" ) s1$regexes <- regexes1 s1$list_files(max_files = 20) @@ -49,7 +49,6 @@ p <- file.path(p1, p2) SubjectID <- "SBJ02862" SampleID_tumor <- "PRJ222112" SampleID_normal <- "PRJ222114" -prefix <- glue("{SubjectID}__{SampleID_tumor}") s1 <- Wf_sash$new( path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal diff --git a/man/Wf_sash_download_tidy_write.Rd b/man/Wf_sash_download_tidy_write.Rd index ca0c538..2a3260e 100644 --- a/man/Wf_sash_download_tidy_write.Rd +++ b/man/Wf_sash_download_tidy_write.Rd @@ -35,7 +35,8 @@ local filesystem).} \item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it.} +\item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it. 
Use only +if you want to override the default regexes used for this workflow.} \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index d19c3f6..a43d7ad 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -28,12 +28,11 @@ d_write <- um1$write( ) #---- GDS ----# -SubjectID <- "SBJ03606" -SampleID_tumor <- "PRJ230726" -SampleID_normal <- "PRJ230725" -prefix <- glue("{SubjectID}__{SampleID_tumor}") +SubjectID <- "SBJ04662" +SampleID_tumor <- "PRJ240647" +SampleID_normal <- "PRJ240646" p1_gds <- "gds://production/analysis_data" -p <- file.path(p1_gds, "SBJ03606/umccrise/20240829d11e13b0/L2300828__L2300827") +p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- Sys.getenv("ICA_ACCESS_TOKEN") um2 <- Wf_umccrise$new( @@ -41,7 +40,7 @@ um2 <- Wf_umccrise$new( SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) um2$list_files(max_files = 8) -um2$list_files_filter_relevant(ica_token = token, max_files = 500) +um2$list_files_filter_relevant(ica_token = token, max_files = 1000) d <- um2$download_files( outdir = outdir, ica_token = token, max_files = 1000, dryrun = F From d9e0672ffd4b3b0e1d0bdece66e532436f907ac5 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 24 Oct 2024 15:07:37 +1100 Subject: [PATCH 12/32] DOWNLOAD_ONLY tidy: just return input path --- R/Wf.R | 5 +++++ R/tidy.R | 10 ++++++++-- man/Wf.Rd | 18 ++++++++++++++++++ man/Wf_dragen.Rd | 3 ++- man/Wf_sash.Rd | 3 ++- man/Wf_tso_ctdna_tumor_only.Rd | 3 ++- man/Wf_tso_ctdna_tumor_only_v2.Rd | 3 ++- man/Wf_umccrise.Rd | 3 ++- man/tidy_files.Rd | 6 +++++- 9 files changed, 46 insertions(+), 8 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index d6539c2..84d4644 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -175,6 +175,11 @@ Wf <- R6::R6Class( } d }, + #' @description For DOWNLOAD_ONLY files, just return the input path. 
+ #' @param x Path with raw results. + DOWNLOAD_ONLY = function(x) { + tibble::tibble(name = glue("DOWNLOAD_ONLY"), data = list(tibble::tibble(input_path = x))) + }, #' @description Download files from GDS/S3 to local filesystem. #' @param path Path with raw results. #' @param outdir Path to output directory. diff --git a/R/tidy.R b/R/tidy.R index a8c4780..1130066 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -14,11 +14,17 @@ #' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" #' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" #' p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz") +#' p_dl <- file.path( +#' p1, "L2101566__L2101565/SBJ01155__PRJ211091/small_variants", +#' "SBJ01155__PRJ211091-somatic-PASS.vcf.gz" +#' ) #' fun <- function(x) { #' d <- readr::read_tsv(x) #' tibble::tibble(name = "table1", data = list(d[])) #' } -#' x <- tibble::tibble(type = "fun", localpath = p) +#' x <- tibble::tibble( +#' type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) +#' ) #' tidy_files(x) #' } #' @@ -27,7 +33,7 @@ tidy_files <- function(x, envir = parent.frame()) { assertthat::assert_that(is.data.frame(x)) assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) x |> - dplyr::filter(.data$type != "DOWNLOAD_ONLY") |> + # dplyr::filter(.data$type != "DOWNLOAD_ONLY") |> dplyr::rowwise() |> dplyr::mutate( data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) diff --git a/man/Wf.Rd b/man/Wf.Rd index 7e98ebf..5dff941 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -83,6 +83,7 @@ to parse it.} \item \href{#method-Wf-print}{\code{Wf$print()}} \item \href{#method-Wf-list_files}{\code{Wf$list_files()}} \item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} +\item \href{#method-Wf-DOWNLOAD_ONLY}{\code{Wf$DOWNLOAD_ONLY()}} \item \href{#method-Wf-download_files}{\code{Wf$download_files()}} \item \href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}} 
\item \href{#method-Wf-write}{\code{Wf$write()}} @@ -185,6 +186,23 @@ the \code{s3_list_files_filter_relevant} function.} } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-DOWNLOAD_ONLY}{}}} +\subsection{Method \code{DOWNLOAD_ONLY()}}{ +For DOWNLOAD_ONLY files, just return the input path. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$DOWNLOAD_ONLY(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path with raw results.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} \subsection{Method \code{download_files()}}{ diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 141e855..66b91c6 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -65,8 +65,9 @@ d_write <- t1$write( } } \if{html}{\out{ -
Inherited methods +
Inherited methods
    +
  • dracarys::Wf$DOWNLOAD_ONLY()
  • dracarys::Wf$download_files()
  • dracarys::Wf$list_files()
  • dracarys::Wf$list_files_filter_relevant()
  • diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 638c69e..d3a8f66 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -97,8 +97,9 @@ d_write <- s1$write( } } \if{html}{\out{ -
    Inherited methods +
    Inherited methods
      +
    • dracarys::Wf$DOWNLOAD_ONLY()
    • dracarys::Wf$download_files()
    • dracarys::Wf$list_files()
    • dracarys::Wf$list_files_filter_relevant()
    • diff --git a/man/Wf_tso_ctdna_tumor_only.Rd b/man/Wf_tso_ctdna_tumor_only.Rd index aa37b47..a58245e 100644 --- a/man/Wf_tso_ctdna_tumor_only.Rd +++ b/man/Wf_tso_ctdna_tumor_only.Rd @@ -82,8 +82,9 @@ d_write <- t2$write( } } \if{html}{\out{ -
      Inherited methods +
      Inherited methods
        +
      • dracarys::Wf$DOWNLOAD_ONLY()
      • dracarys::Wf$download_files()
      • dracarys::Wf$list_files()
      • dracarys::Wf$list_files_filter_relevant()
      • diff --git a/man/Wf_tso_ctdna_tumor_only_v2.Rd b/man/Wf_tso_ctdna_tumor_only_v2.Rd index fb84a29..148aca0 100644 --- a/man/Wf_tso_ctdna_tumor_only_v2.Rd +++ b/man/Wf_tso_ctdna_tumor_only_v2.Rd @@ -91,8 +91,9 @@ d_write <- t2$write( } } \if{html}{\out{ -
        Inherited methods +
        Inherited methods
          +
        • dracarys::Wf$DOWNLOAD_ONLY()
        • dracarys::Wf$download_files()
        • dracarys::Wf$list_files()
        • dracarys::Wf$list_files_filter_relevant()
        • diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index a43d7ad..7b93d21 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -84,8 +84,9 @@ d_write <- um2$write( } } \if{html}{\out{ -
          Inherited methods +
          Inherited methods
            +
          • dracarys::Wf$DOWNLOAD_ONLY()
          • dracarys::Wf$download_files()
          • dracarys::Wf$list_files()
          • dracarys::Wf$list_files_filter_relevant()
          • diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index caff8d2..75f3625 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -25,11 +25,15 @@ Tidies files into a tibble with parsed data. p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz") +p_dl <- file.path( + p1, "L2101566__L2101565/SBJ01155__PRJ211091/small_variants", + "SBJ01155__PRJ211091-somatic-PASS.vcf.gz") fun <- function(x) { d <- readr::read_tsv(x) tibble::tibble(name = "table1", data = list(d[])) } -x <- tibble::tibble(type = "fun", localpath = p) +x <- tibble::tibble( + type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl)) tidy_files(x) } From ee0ec621e94db6411019cfc61a4cac4d7eca49ca Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 24 Oct 2024 16:12:46 +1100 Subject: [PATCH 13/32] better handling of DOWNLOAD_ONLY --- R/Wf.R | 12 ++++++-- R/tidy.R | 1 - R/umccrise.R | 38 ++++++++++++++++---------- man/Wf_umccrise.Rd | 13 +++++---- man/Wf_umccrise_download_tidy_write.Rd | 21 ++++++++------ man/tidy_files.Rd | 6 ++-- 6 files changed, 58 insertions(+), 33 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 84d4644..e18d908 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -238,8 +238,16 @@ Wf <- R6::R6Class( d_write <- x |> dplyr::rowwise() |> dplyr::mutate( - p = glue("{prefix}_{.data$name}"), - out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)) + p = ifelse( + .data$name != "DOWNLOAD_ONLY", + as.character(glue("{prefix}_{.data$name}")), + as.character(.data$data |> dplyr::pull("input_path")) + ), + out = ifelse( + .data$name != "DOWNLOAD_ONLY", + list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)), + list(.data$data) + ) ) |> dplyr::ungroup() |> dplyr::select("name", "data", prefix = "p") diff --git a/R/tidy.R b/R/tidy.R index 1130066..9ca0496 100644 --- 
a/R/tidy.R +++ b/R/tidy.R @@ -33,7 +33,6 @@ tidy_files <- function(x, envir = parent.frame()) { assertthat::assert_that(is.data.frame(x)) assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) x |> - # dplyr::filter(.data$type != "DOWNLOAD_ONLY") |> dplyr::rowwise() |> dplyr::mutate( data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) diff --git a/R/umccrise.R b/R/umccrise.R index 1ffb223..743c58d 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -7,12 +7,15 @@ #' \dontrun{ #' #' #---- LOCAL ----# -#' SubjectID <- "SBJ03043" -#' SampleID_tumor <- "PRJ230004" -#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' SubjectID <- "SBJ04662" +#' SampleID_tumor <- "PRJ240647" +#' SampleID_normal <- "PRJ240646" #' p1_local <- "~/icav1/g/production/analysis_data" -#' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") -#' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' p <- file.path(normalizePath(p1_local), "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") +#' um1 <- Wf_umccrise$new( +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +#' ) #' um1$list_files(max_files = 10) #' um1$list_files_filter_relevant() #' d <- um1$download_files(max_files = 1000, dryrun = F) @@ -245,37 +248,42 @@ Wf_umccrise <- R6::R6Class( #' #' @param path Path to directory with raw workflow results (from GDS, S3, or #' local filesystem). -#' @param SubjectID The SubjectID of the sample (needed for path lookup). -#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param SubjectID The SubjectID of the sample. +#' @param SampleID_tumor The SampleID of the tumor sample. +#' @param SampleID_normal The SampleID of the normal sample. #' @param outdir Path to output directory. #' @param format Format of output files. #' @param max_files Max number of files to list. 
#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @return List where each element is a tidy tibble of a umccrise file. +#' @return Tibble of tidy data as list-cols. #' #' @examples #' \dontrun{ -#' SubjectID <- "SBJ03043" -#' SampleID_tumor <- "PRJ230004" -#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") -#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +#' SubjectID <- "SBJ04662" +#' SampleID_tumor <- "PRJ240647" +#' SampleID_normal <- "PRJ240646" +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' d <- Wf_umccrise_download_tidy_write( -#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' path = p, SubjectID = SubjectID, +#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal, #' outdir = outdir, #' dryrun = F #' ) #' } #' @export -Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor, +Wf_umccrise_download_tidy_write <- function(path, SubjectID, + SampleID_tumor, SampleID_normal, outdir, format = "rds", max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), dryrun = FALSE) { um <- Wf_umccrise$new( - path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + path = path, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) d_dl <- um$download_files( outdir = outdir, ica_token = ica_token, diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 7b93d21..de10726 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -10,12 +10,15 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow \dontrun{ #---- LOCAL ----# -SubjectID <- "SBJ03043" -SampleID_tumor <- "PRJ230004" -prefix <- 
glue("{SubjectID}__{SampleID_tumor}") +SubjectID <- "SBJ04662" +SampleID_tumor <- "PRJ240647" +SampleID_normal <- "PRJ240646" p1_local <- "~/icav1/g/production/analysis_data" -p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") -um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +p <- file.path(normalizePath(p1_local), "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") +um1 <- Wf_umccrise$new( + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal +) um1$list_files(max_files = 10) um1$list_files_filter_relevant() d <- um1$download_files(max_files = 1000, dryrun = F) diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd index 05ca0bf..99eeacc 100644 --- a/man/Wf_umccrise_download_tidy_write.Rd +++ b/man/Wf_umccrise_download_tidy_write.Rd @@ -8,6 +8,7 @@ Wf_umccrise_download_tidy_write( path, SubjectID, SampleID_tumor, + SampleID_normal, outdir, format = "rds", max_files = 1000, @@ -19,9 +20,11 @@ Wf_umccrise_download_tidy_write( \item{path}{Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} +\item{SubjectID}{The SubjectID of the sample.} -\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} +\item{SampleID_tumor}{The SampleID of the tumor sample.} + +\item{SampleID_normal}{The SampleID of the normal sample.} \item{outdir}{Path to output directory.} @@ -35,21 +38,23 @@ local filesystem).} download them).} } \value{ -List where each element is a tidy tibble of a umccrise file. +Tibble of tidy data as list-cols. } \description{ Downloads files from the \code{umccrise} workflow and writes them in a tidy format. 
} \examples{ \dontrun{ -SubjectID <- "SBJ03043" -SampleID_tumor <- "PRJ230004" -p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") -p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +SubjectID <- "SBJ04662" +SampleID_tumor <- "PRJ240647" +SampleID_normal <- "PRJ240646" +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- Sys.getenv("ICA_ACCESS_TOKEN") d <- Wf_umccrise_download_tidy_write( - path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, + path = p, SubjectID = SubjectID, + SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal, outdir = outdir, dryrun = F ) diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index 75f3625..c13cffd 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -27,13 +27,15 @@ p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz") p_dl <- file.path( p1, "L2101566__L2101565/SBJ01155__PRJ211091/small_variants", - "SBJ01155__PRJ211091-somatic-PASS.vcf.gz") + "SBJ01155__PRJ211091-somatic-PASS.vcf.gz" +) fun <- function(x) { d <- readr::read_tsv(x) tibble::tibble(name = "table1", data = list(d[])) } x <- tibble::tibble( - type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl)) + type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) +) tidy_files(x) } From a283e13fdfdd07507657712d1a185fa8ce4e4b80 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 26 Oct 2024 18:42:29 +1100 Subject: [PATCH 14/32] better handling of DOWNLOAD_ONLY --- R/Wf.R | 11 +++++++---- R/tidy.R | 21 +++++++++++++++++++-- R/umccrise.R | 36 ++++++++++++++++-------------------- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index e18d908..ed69d88 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -177,8 +177,11 @@ Wf <- R6::R6Class( }, #' @description For DOWNLOAD_ONLY files, just return 
the input path. #' @param x Path with raw results. - DOWNLOAD_ONLY = function(x) { - tibble::tibble(name = glue("DOWNLOAD_ONLY"), data = list(tibble::tibble(input_path = x))) + DOWNLOAD_ONLY = function(x, suffix = "") { + tibble::tibble( + name = glue("DOWNLOAD_ONLY{suffix}"), + data = list(tibble::tibble(input_path = x)) + ) }, #' @description Download files from GDS/S3 to local filesystem. #' @param path Path with raw results. @@ -239,12 +242,12 @@ Wf <- R6::R6Class( dplyr::rowwise() |> dplyr::mutate( p = ifelse( - .data$name != "DOWNLOAD_ONLY", + !grepl("DOWNLOAD_ONLY", .data$name), as.character(glue("{prefix}_{.data$name}")), as.character(.data$data |> dplyr::pull("input_path")) ), out = ifelse( - .data$name != "DOWNLOAD_ONLY", + !grepl("DOWNLOAD_ONLY", .data$name), list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)), list(.data$data) ) diff --git a/R/tidy.R b/R/tidy.R index 9ca0496..cb5e2f0 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -23,7 +23,7 @@ #' tibble::tibble(name = "table1", data = list(d[])) #' } #' x <- tibble::tibble( -#' type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) +#' type = c("fun", "DOWNLOAD_ONLY_foobar"), localpath = c(p, p_dl) #' ) #' tidy_files(x) #' } @@ -32,10 +32,27 @@ tidy_files <- function(x, envir = parent.frame()) { assertthat::assert_that(is.data.frame(x)) assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) + # if there's a DOWNLOAD_ONLY_suffix, extract that suffix and call + # the DOWNLOAD_ONLY function + extract_download_suffix <- function(s) { + sub("DOWNLOAD_ONLY(.*)", "\\1", s) + } x |> dplyr::rowwise() |> dplyr::mutate( - data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) + data = ifelse( + !grepl("DOWNLOAD_ONLY", .data$type), + list( + dr_func_eval( + f = .data$type, v = .data$type, envir = envir + )(.data$localpath) + ), + list( + dr_func_eval( + f = "DOWNLOAD_ONLY", v = "DOWNLOAD_ONLY", envir = envir + )(.data$localpath, 
extract_download_suffix(.data$type)) + ) + ) ) |> dplyr::ungroup() |> dplyr::select("data") |> diff --git a/R/umccrise.R b/R/umccrise.R index 743c58d..4b6ed45 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -79,26 +79,22 @@ Wf_umccrise <- R6::R6Class( smallv <- "small_variants" regexes <- tibble::tribble( ~regex, ~fun, - glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", - glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", - glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpair", - glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgrJson", - glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - ) |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - ) + glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "read_hrdChord", + glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "read_hrdHrdetect", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), 
"read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", + glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "read_conpair", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "read_pcgrJson", + glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smallvpassvcf", + glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smallvpassvcfi", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", + glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi" + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor From e773307dc8e9142ed22e8ebbe1e0bd2c9a444c3e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 1 Nov 2024 23:57:20 +1100 Subject: [PATCH 15/32] fix regexes --- R/sash.R | 29 +++++++++++++++-------------- R/umccrise.R | 2 +- man/Wf.Rd | 2 +- man/tidy_files.Rd | 2 +- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/R/sash.R b/R/sash.R index c73fb06..f1d94b6 100644 --- a/R/sash.R +++ b/R/sash.R @@ -87,21 +87,22 @@ Wf_sash <- R6::R6Class( crep <- "cancer_report/cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrdChord", - glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrdHrdetect", - glue("{path}/{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrdDragen", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), 
"sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigsTsv", - glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcSum", - glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgrJson", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "read_hrdChord", + glue("{path}/{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "read_hrdHrdetect", + glue("{path}/{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "read_hrdDragen", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", + glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", + glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "read_pcgrJson", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-pcgrvcf", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), 
"DOWNLOAD_ONLY-pcgrvcfi", # glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY", - glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", + glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi", ) |> dplyr::mutate( fun = paste0("read_", .data$fun), diff --git a/R/umccrise.R b/R/umccrise.R index 4b6ed45..ba7b745 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -87,11 +87,11 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "read_conpair", + glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "read_pcgrJson", glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smallvpassvcf", glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smallvpassvcfi", - glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi" ) diff --git a/man/Wf.Rd b/man/Wf.Rd index 5dff941..f8e623d 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -191,7 +191,7 @@ the 
\code{s3_list_files_filter_relevant} function.} \subsection{Method \code{DOWNLOAD_ONLY()}}{ For DOWNLOAD_ONLY files, just return the input path. \subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{Wf$DOWNLOAD_ONLY(x)}\if{html}{\out{
            }} +\if{html}{\out{
            }}\preformatted{Wf$DOWNLOAD_ONLY(x, suffix = "")}\if{html}{\out{
            }} } \subsection{Arguments}{ diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index c13cffd..3770191 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -34,7 +34,7 @@ fun <- function(x) { tibble::tibble(name = "table1", data = list(d[])) } x <- tibble::tibble( - type = c("fun", "DOWNLOAD_ONLY"), localpath = c(p, p_dl) + type = c("fun", "DOWNLOAD_ONLY_foobar"), localpath = c(p, p_dl) ) tidy_files(x) } From 1e569e60109b875387a577058f91d973a6bac6d3 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 5 Nov 2024 23:58:21 +1100 Subject: [PATCH 16/32] add smlv filt vcf --- R/fs_s3.R | 2 +- R/sash.R | 11 ++++------- R/umccrise.R | 9 +++++---- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/R/fs_s3.R b/R/fs_s3.R index f2e3226..4609eac 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -32,7 +32,7 @@ s3_list_files_dir <- function(s3dir, max_objects = 1000) { purrr::map(\(x) tibble::tibble( Key = x[["Key"]], Size = x[["Size"]], - lastmodified = x[["LastModified"]] + lastmodified = as.character(x[["LastModified"]]) )) |> dplyr::bind_rows() |> dplyr::mutate( diff --git a/R/sash.R b/R/sash.R index f1d94b6..91260b4 100644 --- a/R/sash.R +++ b/R/sash.R @@ -96,19 +96,16 @@ Wf_sash <- R6::R6Class( glue("{path}/{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "read_sigsTsv", glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", glue("{path}/{pref}/purple/{SampleID_tumor}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/{pref}/smlv_somatic/filter/{SampleID_tumor}\\.pass\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smlvfiltvcf", + glue("{path}/{pref}/smlv_somatic/filter/{SampleID_tumor}\\.pass\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smlvfiltvcfi", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "read_pcgrJson", - glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", 
glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-pcgrvcf", glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-pcgrvcfi", + glue("{path}/{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", # glue("{path}/{pref}/smlv_somatic/report/{SampleID_tumor}\\.somatic\\.variant_counts_process\\.json$"), "smlvSomCounts", glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", glue("{path}/{pref}/smlv_germline/report/cpsr/{SampleID_normal}\\.cpsr\\.grch38\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi", - ) |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - ) - + ) super$initialize(path = path, wname = wname, regexes = regexes) self$SubjectID <- SubjectID self$SampleID_tumor <- SampleID_tumor diff --git a/R/umccrise.R b/R/umccrise.R index ba7b745..21894d3 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -76,7 +76,6 @@ Wf_umccrise <- R6::R6Class( wname <- "umccrise" pref <- glue("{SubjectID}__{SampleID_tumor}") crep <- "cancer_report_tables" - smallv <- "small_variants" regexes <- tibble::tribble( ~regex, ~fun, glue("{path}/{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "read_hrdChord", @@ -88,10 +87,12 @@ Wf_umccrise <- R6::R6Class( glue("{path}/{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "read_qcSum", glue("{path}/{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "read_conpair", glue("{path}/{pref}/purple/{pref}\\.purple\\.cnv\\.gene\\.tsv$"), "DOWNLOAD_ONLY-purplegene", + glue("{path}/{pref}/small_variants/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smlvfiltvcf", + glue("{path}/{pref}/small_variants/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smlvfiltvcfi", 
glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "read_pcgrJson", - glue("{path}/{pref}/{smallv}/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz$"), "DOWNLOAD_ONLY-smallvpassvcf", - glue("{path}/{pref}/{smallv}/{pref}-somatic-PASS\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-smallvpassvcfi", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.pass\\.vcf\\.gz$"), "DOWNLOAD_ONLY-pcgrvcf", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.pass\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-pcgrvcfi", + glue("{path}/work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$"), "DOWNLOAD_ONLY-pcgrtiers", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz$"), "DOWNLOAD_ONLY-cpsrvcf", glue("{path}/work/{pref}/cpsr/{pref}-normal\\.cpsr\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-cpsrvcfi" ) From 66b8e8243964e90d12483b61713f555d9d80729c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Nov 2024 19:20:35 +1100 Subject: [PATCH 17/32] dragen swf: remove trailing slash --- R/Wf.R | 4 +++- R/fs_icav1.R | 7 ++++++ R/fs_s3.R | 7 ++++++ R/tso_dragen.R | 6 ++--- R/tsov2.R | 40 ++++++++++++++----------------- man/Wf.Rd | 2 ++ man/Wf_tso_ctdna_tumor_only_v2.Rd | 2 +- 7 files changed, 41 insertions(+), 27 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index ed69d88..b6ff9c3 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -106,7 +106,8 @@ Wf <- R6::R6Class( ) subwnames <- c("dragen") assertthat::assert_that(wname %in% c(wnames, subwnames)) - private$.path <- sub("/$", "", path) # remove potential trailing slash + path <- sub("/$", "", path) # remove potential trailing slash + private$.path <- path private$.wname <- wname private$.filesystem <- dplyr::case_when( grepl("^gds://", path) ~ "gds", @@ -177,6 +178,7 @@ Wf <- R6::R6Class( }, #' @description For DOWNLOAD_ONLY files, just return the input path. #' @param x Path with raw results. + #' @param suffix Suffix. 
DOWNLOAD_ONLY = function(x, suffix = "") { tibble::tibble( name = glue("DOWNLOAD_ONLY{suffix}"), diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 56c6ad4..5aef0bf 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -179,6 +179,12 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN no_recurse = FALSE, page_token = NULL, recursive = recursive ) + msg <- glue( + "GDS input path is: {gdsdir}", + "\nNo relevant files found under there.", + "\nPlease check that path with `ica files list`, and try to adjust page size." + ) + assertthat::assert_that(nrow(d) > 0, msg = msg) d <- d |> dplyr::mutate( gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), @@ -206,6 +212,7 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN ), localpath = normalizePath(.data$localpath) ) |> + dplyr::ungroup() |> dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") return(res) } else { diff --git a/R/fs_s3.R b/R/fs_s3.R index 4609eac..a8a4da9 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -132,6 +132,12 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, s3dir = s3dir, pattern = NULL, regexes = regexes, max_objects = max_objects, presign = FALSE ) + msg <- glue( + "S3 input path is: {s3dir}", + "\nNo relevant files found under there.", + "\nPlease check that path with `aws s3 ls`, and try to adjust page size." 
+ ) + assertthat::assert_that(nrow(d) > 0, msg = msg) d <- d |> dplyr::mutate( s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), @@ -163,6 +169,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, ), localpath = normalizePath(.data$localpath) ) |> + dplyr::ungroup() |> dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") return(res) } else { diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 6df67d9..897c45e 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -86,9 +86,9 @@ Wf_dragen <- R6::R6Class( print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, "prefix", self$prefix ) print(res) diff --git a/R/tsov2.R b/R/tsov2.R index 1a08dd9..fbfe581 100644 --- a/R/tsov2.R +++ b/R/tsov2.R @@ -47,7 +47,7 @@ #' d <- t2$download_files( #' outdir = outdir, #' max_files = 500, -#' dryrun = F +#' dryrun = FALSE #' ) #' d_tidy <- t2$tidy_files(d) #' d_write <- t2$write( @@ -76,29 +76,25 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( res <- glue("Results/{pref}") li <- "Logs_Intermediates" dc <- glue("{li}/DragenCaller/{pref}") + path <- sub("/$", "", path) # remove potential trailing slash self$dragenObj <- Wf_dragen$new(path = file.path(path, dc), prefix = glue("{dc}/{prefix}")) # Results - reg1 <- tibble::tribble( + regexes <- tibble::tribble( ~regex, ~fun, - glue("{res}/{pref}\\.cnv\\.vcf\\.gz$"), "cnv", - glue("{res}/{pref}\\.cnv\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{res}/{pref}\\.exon_cov_report\\.tsv$"), "cvgrepe", - glue("{res}/{pref}\\.gene_cov_report\\.tsv$"), "cvgrepg", - glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz$"), "hardfilt", - glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", + glue("{res}/{pref}\\.cnv\\.vcf\\.gz$"), "read_cnv", + glue("{res}/{pref}\\.cnv\\.vcf\\.gz\\.tbi$"), 
"DOWNLOAD_ONLY-cnvvcfi", + glue("{res}/{pref}\\.exon_cov_report\\.tsv$"), "read_cvgrepe", + glue("{res}/{pref}\\.gene_cov_report\\.tsv$"), "read_cvgrepg", + glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz$"), "read_hardfilt", + glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY-hardfiltvcfi", # glue("{res}/{pref}\\.microsat_output\\.json$"), "msi", # in DragenCaller - glue("{res}/{pref}\\.tmb.trace\\.tsv$"), "tmbt", - glue("{res}/{pref}_CombinedVariantOutput\\.tsv$"), "cvo", - glue("{res}/{pref}_Fusions\\.csv$"), "fus", - glue("{res}/{pref}_MetricsOutput\\.tsv$"), "DOWNLOAD_ONLY", - # glue("{res}/{pref}_SmallVariants_Annotated\\.json\\.gz$"), "DOWNLOAD_ONLY", - glue("{li}/SampleAnalysisResults/{pref}_SampleAnalysisResults\\.json$"), "sar" + glue("{res}/{pref}\\.tmb.trace\\.tsv$"), "read_tmbt", + glue("{res}/{pref}_CombinedVariantOutput\\.tsv$"), "read_cvo", + glue("{res}/{pref}_Fusions\\.csv$"), "read_fus", + glue("{res}/{pref}_MetricsOutput\\.tsv$"), "DOWNLOAD_ONLY-metricsoutput", + # glue("{res}/{pref}_SmallVariants_Annotated\\.json\\.gz$"), "DOWNLOAD_ONLY-smallvannjson", + glue("{li}/SampleAnalysisResults/{pref}_SampleAnalysisResults\\.json$"), "read_sar" ) - regexes <- reg1 |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - ) super$initialize(path = path, wname = wname, regexes = regexes) self$prefix <- prefix }, @@ -107,9 +103,9 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( print = function(...) { res <- tibble::tribble( ~var, ~value, - "path", self$path, - "wname", self$wname, - "filesystem", self$filesystem, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, "prefix", self$prefix ) print(res) diff --git a/man/Wf.Rd b/man/Wf.Rd index f8e623d..d6d30c7 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -198,6 +198,8 @@ For DOWNLOAD_ONLY files, just return the input path. \if{html}{\out{
            }} \describe{ \item{\code{x}}{Path with raw results.} + +\item{\code{suffix}}{Suffix.} } \if{html}{\out{
            }} } diff --git a/man/Wf_tso_ctdna_tumor_only_v2.Rd b/man/Wf_tso_ctdna_tumor_only_v2.Rd index 148aca0..a8513e4 100644 --- a/man/Wf_tso_ctdna_tumor_only_v2.Rd +++ b/man/Wf_tso_ctdna_tumor_only_v2.Rd @@ -50,7 +50,7 @@ t2$list_files_filter_relevant(max_files = 500) d <- t2$download_files( outdir = outdir, max_files = 500, - dryrun = F + dryrun = FALSE ) d_tidy <- t2$tidy_files(d) d_write <- t2$write( From 6e1a4f99aca0dd6254207f7ee48b31641fddedc8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 9 Nov 2024 13:27:27 +1100 Subject: [PATCH 18/32] normalize outputdir --- R/fs_icav1.R | 6 +++--- R/fs_s3.R | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 5aef0bf..803aa7d 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -188,9 +188,9 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN d <- d |> dplyr::mutate( gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), - gdspath_minus_gdsdir_outdir = fs::dir_create( - file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) - ), + gdspath_minus_gdsdir_outdir = file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) |> + fs::dir_create() |> + normalizePath(), localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), gdspath = .data$path ) |> diff --git a/R/fs_s3.R b/R/fs_s3.R index a8a4da9..d67a109 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -141,9 +141,9 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, d <- d |> dplyr::mutate( s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), - s3path_minus_s3dir_outdir = fs::dir_create( - file.path(outdir, dirname(.data$s3path_minus_s3dir)) - ), + s3path_minus_s3dir_outdir = file.path(outdir, dirname(.data$s3path_minus_s3dir)) |> + fs::dir_create() |> + normalizePath(), localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname), s3path = .data$path ) |> From c3746ba33dce945c311058c240c6e159c5855e3b Mon Sep 17 00:00:00 2001 From: 
pdiakumis Date: Sat, 9 Nov 2024 15:15:45 +1100 Subject: [PATCH 19/32] dragen: fix cnv/mapping metrics for wgs tn --- R/dragen.R | 39 +++++++++++++++++++++++++++------------ man/Wf_dragen.Rd | 19 ++++++++++++++++++- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index b198cb8..907f776 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -209,7 +209,8 @@ dragen_gc_metrics_read <- function(x) { #' @export dragen_cnv_metrics_read <- function(x) { d0 <- readr::read_lines(x) - assertthat::assert_that(grepl("SEX GENOTYPER", d0[1])) + # first row is sometimes SEX GENOTYPER, others CNV SUMMARY + assertthat::assert_that(grepl("CNV SUMMARY", d0[2])) abbrev_nm <- c( "Bases in reference genome" = "bases_in_ref_genome", "Average alignment coverage over genome" = "cov_alignment_avg_over_genome", @@ -227,7 +228,10 @@ dragen_cnv_metrics_read <- function(x) { "Number of amplifications" = "n_amp", "Number of deletions" = "n_del", "Number of passing amplifications" = "n_amp_pass", - "Number of passing deletions" = "n_del_pass" + "Number of passing deletions" = "n_del_pass", + "Estimated tumor purity" = "purity_tumor", + "Diploid coverage" = "cov_diploid", + "Overall ploidy" = "ploidy_overall" ) d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> @@ -236,10 +240,10 @@ dragen_cnv_metrics_read <- function(x) { names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start" ) + # in cttso sexgt <- d1 |> dplyr::filter(.data$category == "SEX GENOTYPER") |> dplyr::select(sexgt = "count", sexgt_pct = "pct") - d2 <- d1 |> dplyr::filter(!.data$category == "SEX GENOTYPER") |> dplyr::mutate( @@ -258,8 +262,10 @@ dragen_cnv_metrics_read <- function(x) { ) |> dplyr::select("var", "value") |> tidyr::pivot_wider(names_from = "var", values_from = "value") - res <- dplyr::bind_cols(sexgt, d2) - return(res) + if (nrow(sexgt) == 0) { + return(d2) + } + dplyr::bind_cols(sexgt, d2) } #' Read DRAGEN SV Metrics @@ -485,6 +491,7 @@ 
dragen_vc_metrics_read <- function(x) { dragen_mapping_metrics_read <- function(x) { abbrev_nm <- c( "Total input reads" = "reads_tot_input", + "Total reads removed by downsampling" = "reads_removed_downsamp", "Number of duplicate marked reads" = "reads_num_dupmarked", "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed", "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq", @@ -499,6 +506,7 @@ dragen_mapping_metrics_read <- function(x) { "Unmapped reads" = "reads_unmapped", "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt", "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj", + "Adjustment of reads matching exclude contigs" = "reads_match_excl_contigs", "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton", "Paired reads (itself & mate mapped)" = "reads_paired", "Properly paired reads" = "reads_paired_proper", @@ -514,6 +522,7 @@ dragen_mapping_metrics_read <- function(x) { "Reads with indel R1" = "reads_indel_r1", "Reads with indel R2" = "reads_indel_r2", "Total bases" = "bases_tot", + "Total bases removed by downsampling" = "bases_removed_downsamp", "Total bases R1" = "bases_tot_r1", "Total bases R2" = "bases_tot_r2", "Mapped bases" = "bases_mapped", @@ -555,17 +564,21 @@ dragen_mapping_metrics_read <- function(x) { "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj", "Reads with splice junction" = "reads_splicejunc", "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", - "Filtered rRNA reads" = "reads_rrna_filtered" + "Filtered rRNA reads" = "reads_rrna_filtered", + "Mitochondrial reads excluded" = "reads_mito_excl" ) d0 <- readr::read_lines(x) assertthat::assert_that(grepl("MAPPING/ALIGNING", d0[1])) - # split by RG and non-RG - # tidy + # File is separated into two sections, the SUMMARY and the PER RG. 
+ # Based on what I've seen so far, we can have single samples (where + # the first column just has MAPPING/ALIGNING) or TUMOR/NORMAL samples (where + # the first column will have a TUMOR or NORMAL prefix). + reg1 <- paste0("MAPPING/ALIGNING ", c("SUMMARY", "PER RG"), collapse = "|") d <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", - names = c("category", "RG", "var", "count", "pct"), + names = c("dragen_sample", "RG", "var", "count", "pct"), delim = ",", too_few = "align_start" ) |> dplyr::mutate( @@ -573,9 +586,11 @@ dragen_mapping_metrics_read <- function(x) { count = as.numeric(.data$count), pct = as.numeric(.data$pct), var = dplyr::recode(.data$var, !!!abbrev_nm), - RG = dplyr::if_else(.data$RG == "", "Total", .data$RG) + RG = dplyr::if_else(.data$RG == "", "Total", .data$RG), + dragen_sample = sub(reg1, "", .data$dragen_sample) |> trimws(), + dragen_sample = dplyr::if_else(.data$dragen_sample == "", "SINGLE", .data$dragen_sample) ) |> - dplyr::select("RG", "var", "count", "pct") + dplyr::select("dragen_sample", "RG", "var", "count", "pct") dirty_names_cleaned(unique(d$var), abbrev_nm, x) # pivot d |> @@ -584,7 +599,7 @@ dragen_mapping_metrics_read <- function(x) { name = dplyr::if_else(.data$name == "count", "", "_pct"), var = glue("{.data$var}{.data$name}") ) |> - dplyr::select("RG", "var", "value") |> + dplyr::select("dragen_sample", "RG", "var", "value") |> dplyr::filter(!is.na(.data$value)) |> tidyr::pivot_wider(names_from = "var", values_from = "value") } diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 66b91c6..f3465e6 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -19,7 +19,24 @@ p <- file.path( d1 <- Wf_dragen$new(path = p, prefix = prefix) d1$list_files(max_files = 100) d1$list_files_filter_relevant(max_files = 300) -d <- d1$download_files(max_files = 100, dryrun = F) +d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +d_tidy <- d1$tidy_files(d) +d_write <- t1$write( + 
d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = prefix, + format = "tsv" +) +#---- GDS ----# +prefix <- "PRJ222358" +p <- file.path("gds://production/analysis_data/SBJ03001/wgs_tumor_normal", + "20241108fc293a38/L2201805_L2201797_dragen_somatic" +) +outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case +d1 <- Wf_dragen$new(path = p, prefix = prefix) +d1$list_files(max_files = 100) +d1$list_files_filter_relevant(max_files = 300) +d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) d_tidy <- d1$tidy_files(d) d_write <- t1$write( d_tidy, From 4d833f4f0ae32b04ebe565cf9ed64c3f90e0136c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 9 Nov 2024 16:25:05 +1100 Subject: [PATCH 20/32] dragen: grab tn suffix for cov/hist metrics --- R/tso_dragen.R | 88 +++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 897c45e..5dfad15 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -16,7 +16,25 @@ #' d1 <- Wf_dragen$new(path = p, prefix = prefix) #' d1$list_files(max_files = 100) #' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, dryrun = F) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +#' d_tidy <- d1$tidy_files(d) +#' d_write <- t1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' #---- GDS ----# +#' prefix <- "PRJ222358" +#' p <- file.path( +#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", +#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" +#' ) +#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) #' d_tidy <- d1$tidy_files(d) #' d_write <- 
t1$write( #' d_tidy, @@ -39,44 +57,40 @@ Wf_dragen <- R6::R6Class( initialize = function(path = NULL, prefix = NULL) { wname <- "dragen" pref <- prefix - reg1 <- tibble::tribble( + tn1 <- "(|_tumor|_normal)" + regexes <- tibble::tribble( ~regex, ~fun, - glue("{pref}\\-replay\\.json$"), "replay", - glue("{pref}\\.cnv_metrics.csv$"), "cnvMetrics", - glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.exon_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.wgs_coverage_metrics\\.csv$"), "coverageMetrics", - glue("{pref}\\.exon_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.target_bed_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.tmb_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.wgs_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.exon_hist\\.csv$"), "hist", - glue("{pref}\\.target_bed_hist\\.csv$"), "hist", - glue("{pref}\\.tmb_hist\\.csv$"), "hist", - glue("{pref}\\.wgs_hist\\.csv$"), "hist", - glue("{pref}\\.fastqc_metrics\\.csv$"), "fastqcMetrics", - glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", - glue("{pref}\\.gc_metrics\\.csv$"), "gcMetrics", - glue("{pref}\\.gvcf_metrics\\.csv$"), "vcMetrics", - glue("{pref}\\.mapping_metrics\\.csv$"), "mappingMetrics", - glue("{pref}\\.microsat_diffs\\.txt$"), "msiDiffs", - glue("{pref}\\.microsat_output\\.json$"), "msi", - glue("{pref}\\.sv_metrics\\.csv$"), "svMetrics", - glue("{pref}\\.time_metrics\\.csv$"), "timeMetrics", - glue("{pref}\\.trimmer_metrics\\.csv$"), "trimmerMetrics", - glue("{pref}\\.umi_metrics\\.csv$"), "umiMetrics", - glue("{pref}\\.vc_metrics\\.csv$"), "vcMetrics" + glue("{pref}\\-replay\\.json$"), 
"read_replay", + glue("{pref}\\.cnv_metrics.csv$"), "read_cnvMetrics", + glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.wgs_contig_mean_cov{tn1}\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.exon_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.wgs_coverage_metrics{tn1}\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.exon_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.target_bed_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.tmb_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.wgs_fine_hist{tn1}\\.csv$"), "read_fineHist", + glue("{pref}\\.exon_hist\\.csv$"), "read_hist", + glue("{pref}\\.target_bed_hist\\.csv$"), "read_hist", + glue("{pref}\\.tmb_hist\\.csv$"), "read_hist", + glue("{pref}\\.wgs_hist{tn1}\\.csv$"), "read_hist", + glue("{pref}\\.fastqc_metrics\\.csv$"), "read_fastqcMetrics", + glue("{pref}\\.fragment_length_hist\\.csv$"), "read_fragmentLengthHist", + glue("{pref}\\.gc_metrics\\.csv$"), "read_gcMetrics", + glue("{pref}\\.gvcf_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.mapping_metrics\\.csv$"), "read_mappingMetrics", + glue("{pref}\\.microsat_diffs\\.txt$"), "read_msiDiffs", + glue("{pref}\\.microsat_output\\.json$"), "read_msi", + glue("{pref}\\.sv_metrics\\.csv$"), "read_svMetrics", + glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", + glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", + glue("{pref}\\.umi_metrics\\.csv$"), "read_umiMetrics", + glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics" ) - regexes <- reg1 |> - dplyr::mutate( - fun = paste0("read_", .data$fun), - fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) - 
) super$initialize(path = path, wname = wname, regexes = regexes) self$prefix <- prefix From 7ec18e57d98d0b1b9b0f5b7966898d310fde6da6 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 10 Nov 2024 23:58:15 +1100 Subject: [PATCH 21/32] dragen: add ploidy estimation metrics --- NAMESPACE | 1 - R/dragen.R | 116 ++++++------------- man/PloidyEstimationMetricsFile.Rd | 100 ---------------- man/Wf_dragen.Rd | 3 +- man/dragen_ploidy_estimation_metrics_read.Rd | 17 +++ 5 files changed, 52 insertions(+), 185 deletions(-) delete mode 100644 man/PloidyEstimationMetricsFile.Rd create mode 100644 man/dragen_ploidy_estimation_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 5270441..12a33de 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ export(BclconvertReports) export(BclconvertReports375) export(File) export(MultiqcFile) -export(PloidyEstimationMetricsFile) export(Wf) export(Wf_dragen) export(Wf_sash) diff --git a/R/dragen.R b/R/dragen.R index 907f776..ce091f8 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -781,89 +781,39 @@ dragen_contig_mean_coverage_plot <- function(d, top_alt_n = 15) { ggplot2::facet_wrap(ggplot2::vars(.data$panel), nrow = 2, scales = "free") } -#' PloidyEstimationMetricsFile R6 Class +#' Read DRAGEN Ploidy Estimation Metrics #' -#' @description -#' Contains methods for reading contents of -#' the `ploidy_estimation_metrics.csv` file output from DRAGEN. -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.ploidy_estimation_metrics.csv.gz", package = "dracarys") -#' pem <- PloidyEstimationMetricsFile$new(x) -#' d <- pem$read() # or read(pem) -#' pem$write(d, out_dir = tempdir(), prefix = "seqc_ploidy", out_format = "tsv") +#' Reads the `ploidy_estimation_metrics.csv` file generated by DRAGEN. +#' @param x Path to file. #' -#' @export -PloidyEstimationMetricsFile <- R6::R6Class( - "PloidyEstimationMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `ploidy_estimation_metrics.csv` file output from DRAGEN. 
- #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) - abbrev_nm <- c( - "Autosomal median coverage" = "cov_auto_median", - "X median coverage" = "cov_x_median", - "Y median coverage" = "cov_y_median", - "1 median / Autosomal median" = "cov_1_div_auto_median", - "2 median / Autosomal median" = "cov_2_div_auto_median", - "3 median / Autosomal median" = "cov_3_div_auto_median", - "4 median / Autosomal median" = "cov_4_div_auto_median", - "5 median / Autosomal median" = "cov_5_div_auto_median", - "6 median / Autosomal median" = "cov_6_div_auto_median", - "7 median / Autosomal median" = "cov_7_div_auto_median", - "8 median / Autosomal median" = "cov_8_div_auto_median", - "9 median / Autosomal median" = "cov_9_div_auto_median", - "10 median / Autosomal median" = "cov_10_div_auto_median", - "11 median / Autosomal median" = "cov_11_div_auto_median", - "12 median / Autosomal median" = "cov_12_div_auto_median", - "13 median / Autosomal median" = "cov_13_div_auto_median", - "14 median / Autosomal median" = "cov_14_div_auto_median", - "15 median / Autosomal median" = "cov_15_div_auto_median", - "16 median / Autosomal median" = "cov_16_div_auto_median", - "17 median / Autosomal median" = "cov_17_div_auto_median", - "18 median / Autosomal median" = "cov_18_div_auto_median", - "19 median / Autosomal median" = "cov_19_div_auto_median", - "20 median / Autosomal median" = "cov_20_div_auto_median", - "21 median / Autosomal median" = "cov_21_div_auto_median", - "22 median / Autosomal median" = "cov_22_div_auto_median", - "X median / Autosomal median" = "cov_x_div_auto_median", - "Y median / Autosomal median" = "cov_y_div_auto_median", - "Ploidy estimation" = "ploidy_est" - ) - - d <- raw |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim("value", names = c("dummy1", "dummy2", "var", "value"), 
delim = ",") |> - dplyr::select("var", "value") |> - dplyr::mutate( - var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - # now convert all except 'Ploidy estimation' to numeric - cols1 <- colnames(d)[colnames(d) != "ploidy_est"] - d |> - dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) - }, - #' @description - #' Writes a tidy version of the `ploidy_estimation_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } +#' @return Tibble with metrics. 
+dragen_ploidy_estimation_metrics_read <- function(x) { + raw <- readr::read_lines(x) + assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) + fun1 <- function(x) { + setNames( + as.character(glue("cov_{x}_div_auto_median")), + as.character(glue("{x} median / Autosomal median")) + ) + } + abbrev_nm <- c( + "Autosomal median coverage" = "cov_auto_median", + "X median coverage" = "cov_x_median", + "Y median coverage" = "cov_y_median", + "Ploidy estimation" = "ploidy_est", + fun1(c(1:22, "X", "Y")) ) -) + + d <- raw |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim("value", names = c("dummy1", "dummy2", "var", "value"), delim = ",") |> + dplyr::select("var", "value") |> + dplyr::mutate( + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") + # now convert all except 'Ploidy estimation' to numeric + cols1 <- colnames(d)[colnames(d) != "ploidy_est"] + d |> + dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) +} diff --git a/man/PloidyEstimationMetricsFile.Rd b/man/PloidyEstimationMetricsFile.Rd deleted file mode 100644 index 7208e81..0000000 --- a/man/PloidyEstimationMetricsFile.Rd +++ /dev/null @@ -1,100 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{PloidyEstimationMetricsFile} -\alias{PloidyEstimationMetricsFile} -\title{PloidyEstimationMetricsFile R6 Class} -\description{ -Contains methods for reading contents of -the \code{ploidy_estimation_metrics.csv} file output from DRAGEN. 
-} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.ploidy_estimation_metrics.csv.gz", package = "dracarys") -pem <- PloidyEstimationMetricsFile$new(x) -d <- pem$read() # or read(pem) -pem$write(d, out_dir = tempdir(), prefix = "seqc_ploidy", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{PloidyEstimationMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-PloidyEstimationMetricsFile-read}{\code{PloidyEstimationMetricsFile$read()}} -\item \href{#method-PloidyEstimationMetricsFile-write}{\code{PloidyEstimationMetricsFile$write()}} -\item \href{#method-PloidyEstimationMetricsFile-clone}{\code{PloidyEstimationMetricsFile$clone()}} -} -} -\if{html}{\out{ -
            Inherited methods - -
            -}} -\if{html}{\out{
            }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PloidyEstimationMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{ploidy_estimation_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{PloidyEstimationMetricsFile$read()}\if{html}{\out{
            }} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
            }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PloidyEstimationMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{ploidy_estimation_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{PloidyEstimationMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
            }} -} - -\subsection{Arguments}{ -\if{html}{\out{
            }} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
            }} -} -} -\if{html}{\out{
            }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-PloidyEstimationMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
            }}\preformatted{PloidyEstimationMetricsFile$clone(deep = FALSE)}\if{html}{\out{
            }} -} - -\subsection{Arguments}{ -\if{html}{\out{
            }} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
            }} -} -} -} diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index f3465e6..c11ab54 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -29,7 +29,8 @@ d_write <- t1$write( ) #---- GDS ----# prefix <- "PRJ222358" -p <- file.path("gds://production/analysis_data/SBJ03001/wgs_tumor_normal", +p <- file.path( + "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", "20241108fc293a38/L2201805_L2201797_dragen_somatic" ) outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case diff --git a/man/dragen_ploidy_estimation_metrics_read.Rd b/man/dragen_ploidy_estimation_metrics_read.Rd new file mode 100644 index 0000000..468566f --- /dev/null +++ b/man/dragen_ploidy_estimation_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_ploidy_estimation_metrics_read} +\alias{dragen_ploidy_estimation_metrics_read} +\title{Read DRAGEN Ploidy Estimation Metrics} +\usage{ +dragen_ploidy_estimation_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{ploidy_estimation_metrics.csv} file generated by DRAGEN. 
+} From d2b76c8888a3a3e7c308bb95e3b620355ab4801b Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 11 Nov 2024 00:20:46 +1100 Subject: [PATCH 22/32] dragen: add ploidy estimation metrics --- R/dragen.R | 14 +++++++++----- R/tso_dragen.R | 9 ++++++++- man/Wf_dragen.Rd | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index ce091f8..ea482bf 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -792,18 +792,21 @@ dragen_ploidy_estimation_metrics_read <- function(x) { assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) fun1 <- function(x) { setNames( - as.character(glue("cov_{x}_div_auto_median")), + as.character(glue("cov_{tolower(x)}_div_auto_median")), as.character(glue("{x} median / Autosomal median")) ) } + fun2 <- function(x) { + setNames( + as.character(glue("cov_{tolower(x)}_median")), + as.character(glue("{x} median coverage")) + ) + } abbrev_nm <- c( - "Autosomal median coverage" = "cov_auto_median", - "X median coverage" = "cov_x_median", - "Y median coverage" = "cov_y_median", "Ploidy estimation" = "ploidy_est", + fun2(c("X", "Y", "Autosomal")), fun1(c(1:22, "X", "Y")) ) - d <- raw |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim("value", names = c("dummy1", "dummy2", "var", "value"), delim = ",") |> @@ -812,6 +815,7 @@ dragen_ploidy_estimation_metrics_read <- function(x) { var = dplyr::recode(.data$var, !!!abbrev_nm) ) |> tidyr::pivot_wider(names_from = "var", values_from = "value") + dirty_names_cleaned(unique(colnames(d)), abbrev_nm, x) # now convert all except 'Ploidy estimation' to numeric cols1 <- colnames(d)[colnames(d) != "ploidy_est"] d |> diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 5dfad15..d786296 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -89,7 +89,8 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", glue("{pref}\\.umi_metrics\\.csv$"), 
"read_umiMetrics", - glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics" + glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.ploidy_estimation_metrics\\.csv$"), "read_ploidyMetrics" ) super$initialize(path = path, wname = wname, regexes = regexes) @@ -271,6 +272,12 @@ Wf_dragen <- R6::R6Class( dat <- dragen_umi_metrics_read(x) dat }, + #' @description Read `ploidy_estimation_metrics.csv` file. + #' @param x Path to file. + read_ploidyMetrics = function(x) { + dat <- dragen_ploidy_estimation_metrics_read(x) + tibble::tibble(name = "ploidymetrics", data = list(dat)) + }, #' @description Read `microsat_output.json` file. #' @param x Path to file. read_msi = function(x) { diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index c11ab54..4d520c1 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -77,6 +77,7 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_fastqcMetrics}{\code{Wf_dragen$read_fastqcMetrics()}} \item \href{#method-Wf_dragen-read_gcMetrics}{\code{Wf_dragen$read_gcMetrics()}} \item \href{#method-Wf_dragen-read_umiMetrics}{\code{Wf_dragen$read_umiMetrics()}} +\item \href{#method-Wf_dragen-read_ploidyMetrics}{\code{Wf_dragen$read_ploidyMetrics()}} \item \href{#method-Wf_dragen-read_msi}{\code{Wf_dragen$read_msi()}} \item \href{#method-Wf_dragen-read_msiDiffs}{\code{Wf_dragen$read_msiDiffs()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} @@ -380,6 +381,23 @@ Read \code{umi_metrics.csv} file. \if{html}{\out{
            }}\preformatted{Wf_dragen$read_umiMetrics(x)}\if{html}{\out{
            }} } +\subsection{Arguments}{ +\if{html}{\out{
            }} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
            }} +} +} +\if{html}{\out{
            }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_ploidyMetrics}{}}} +\subsection{Method \code{read_ploidyMetrics()}}{ +Read \code{ploidy_estimation_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
            }}\preformatted{Wf_dragen$read_ploidyMetrics(x)}\if{html}{\out{
            }} +} + \subsection{Arguments}{ \if{html}{\out{
            }} \describe{ From 027bba67cd7b08474ab1b1fe01a478b173228f66 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 11 Nov 2024 14:52:57 +1100 Subject: [PATCH 23/32] fix coderabbit nits --- R/sash.R | 2 +- R/tso_dragen.R | 4 ++-- man/Wf_dragen.Rd | 4 ++-- man/Wf_sash.Rd | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/sash.R b/R/sash.R index 91260b4..883d659 100644 --- a/R/sash.R +++ b/R/sash.R @@ -79,7 +79,7 @@ Wf_sash <- R6::R6Class( #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. - #' @param SampleID_normal The SampleID of the tumor sample. + #' @param SampleID_normal The SampleID of the normal sample. initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL, SampleID_normal = NULL) { wname <- "sash" diff --git a/R/tso_dragen.R b/R/tso_dragen.R index d786296..d4f7411 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -18,7 +18,7 @@ #' d1$list_files_filter_relevant(max_files = 300) #' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) #' d_tidy <- d1$tidy_files(d) -#' d_write <- t1$write( +#' d_write <- d1$write( #' d_tidy, #' outdir = file.path(p, "dracarys_tidy"), #' prefix = prefix, @@ -36,7 +36,7 @@ #' d1$list_files_filter_relevant(max_files = 300) #' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) #' d_tidy <- d1$tidy_files(d) -#' d_write <- t1$write( +#' d_write <- d1$write( #' d_tidy, #' outdir = file.path(p, "dracarys_tidy"), #' prefix = prefix, diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 4d520c1..a3fc949 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -21,7 +21,7 @@ d1$list_files(max_files = 100) d1$list_files_filter_relevant(max_files = 300) d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) d_tidy <- d1$tidy_files(d) -d_write <- t1$write( +d_write <- d1$write( d_tidy, outdir = file.path(p, "dracarys_tidy"), prefix = prefix, @@ -39,7 +39,7 @@ 
d1$list_files(max_files = 100) d1$list_files_filter_relevant(max_files = 300) d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) d_tidy <- d1$tidy_files(d) -d_write <- t1$write( +d_write <- d1$write( d_tidy, outdir = file.path(p, "dracarys_tidy"), prefix = prefix, diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index d3a8f66..23b54d0 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -132,7 +132,7 @@ local filesystem).} \item{\code{SampleID_tumor}}{The SampleID of the tumor sample.} -\item{\code{SampleID_normal}}{The SampleID of the tumor sample.} +\item{\code{SampleID_normal}}{The SampleID of the normal sample.} } \if{html}{\out{
            }} } From 2e15dd5c4d0f22cbb26b5d849c26a5dacc8eaa18 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Nov 2024 19:41:20 +1100 Subject: [PATCH 24/32] s3_file_presignedurl: increase expiry to 7 days (604800 secs) --- R/fs_s3.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fs_s3.R b/R/fs_s3.R index d67a109..ed32b0f 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -196,7 +196,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, #' } #' #' @export -s3_file_presignedurl <- function(client, s3path, expiry_seconds = 3600) { +s3_file_presignedurl <- function(client, s3path, expiry_seconds = 604800) { bucket <- sub("s3://(.*?)/.*", "\\1", s3path) prefix <- sub("s3://(.*?)/(.*)", "\\2", s3path) client$generate_presigned_url( From 6ba304218ea3231e58c47edbd71aabd0f6a0b72a Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 13 Nov 2024 22:06:28 +1100 Subject: [PATCH 25/32] data-raw/portal_meta.R moved to rportal --- data-raw/portal_meta.R | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 data-raw/portal_meta.R diff --git a/data-raw/portal_meta.R b/data-raw/portal_meta.R deleted file mode 100644 index 56982f6..0000000 --- a/data-raw/portal_meta.R +++ /dev/null @@ -1,31 +0,0 @@ -# portal workflow meta subset -require(dracarys) -require(here) -require(dplyr) -require(purrr) -require(readr) - - -wfs <- c( - "bcl_convert", "rnasum", "tso_ctdna_tumor_only", - "umccrise", "wgs_alignment_qc", "wgs_tumor_normal", "wts_tumor_only", - "wts_alignment_qc", - "oncoanalyser_wgs", "oncoanalyser_wgts_existing_both", - "oncoanalyser_wts", "sash", "star_alignment" -) - -account <- "stg" -get_top_succeeded <- function(wf, num_row = 10, num_top = 4) { - dracarys::portal_meta_read(params = glue::glue("&type_name={wf}"), account = account, rows = num_row) |> - dplyr::filter(.data$end_status == "Succeeded") |> - dplyr::slice_head(n = num_top) -} - -# get top 10 rows, then get top 4 successful runs -d <- 
wfs |> - purrr::map(\(x) get_top_succeeded(x, 10, 4)) |> - dplyr::bind_rows() -# leave dates as character -d |> - readr::write_csv(here::here("inst/extdata/portal_meta_top4.csv")) -# date_fmt <- "%Y-%m-%dT%H:%M:%S" From cbae470632773f4958c9a2392f53e22e134c8ca4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 24 Nov 2024 22:46:53 +1100 Subject: [PATCH 26/32] add dtw_Wf_dragen --- NAMESPACE | 1 + R/dragen.R | 357 +++++++++++++++++++++++++++++++++++++++++++ R/tso_dragen.R | 302 ------------------------------------ man/Wf_dragen.Rd | 2 +- man/dtw_Wf_dragen.Rd | 56 +++++++ 5 files changed, 415 insertions(+), 303 deletions(-) create mode 100644 man/dtw_Wf_dragen.Rd diff --git a/NAMESPACE b/NAMESPACE index 12a33de..0463659 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,7 @@ export(dragen_sv_metrics_read) export(dragen_trimmer_metrics_read) export(dragen_umi_metrics_read) export(dragen_vc_metrics_read) +export(dtw_Wf_dragen) export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) export(empty_tbl) diff --git a/R/dragen.R b/R/dragen.R index ea482bf..9e80b62 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -821,3 +821,360 @@ dragen_ploidy_estimation_metrics_read <- function(x) { d |> dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) } + +#' Wf_dragen Download Tidy and Write +#' +#' Downloads files from the `dragen` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (S3 or local filesystem). +#' @param prefix The LibraryID prefix of the sample. +#' @param outdir Path to output directory with raw files. +#' @param outdir_tidy Path to output directory with tidy files. +#' @param format Format of output files. +#' @param max_files Max number of files to list. +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @return Tibble of tidy tibbles. 
+#' +#' @examples +#' \dontrun{ +#' #---- Local ----# +#' +#' #---- S3 ----# +#' path <- file.path( +#' "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", +#' "analysis/wgts-qc/20241123ffa837c4/L2401621_dragen_alignment" +#' ) +#' prefix <- "L2401621" +#' outdir <- sub("s3:/", "~/s3", path) +#' dragen_tidy <- dtw_Wf_dragen( +#' path = path, prefix = prefix, outdir = outdir, +#' format = "tsv", +#' dryrun = F +#' ) +#' } +#' @export +dtw_Wf_dragen <- function(path, prefix, outdir, + outdir_tidy = file.path(outdir, "dracarys_tidy"), + format = "rds", + max_files = 1000, + dryrun = FALSE) { + obj <- Wf_dragen$new(path = path, prefix = prefix) + d_dl <- obj$download_files( + outdir = outdir, max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- obj$tidy_files(d_dl) + d_write <- obj$write( + d_tidy, + outdir = outdir_tidy, + prefix = prefix, + format = format + ) + return(d_write) + } + return(d_dl) +} + +#' Wf_dragen R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `dragen` workflow. 
+#' +#' @examples +#' \dontrun{ +#' +#' #---- Local ----# +#' prefix <- "L2401290" +#' p <- file.path( +#' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", +#' "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", +#' prefix +#' ) +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +#' d_tidy <- d1$tidy_files(d) +#' d_write <- d1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' #---- GDS ----# +#' prefix <- "PRJ222358" +#' p <- file.path( +#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", +#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" +#' ) +#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) +#' d_tidy <- d1$tidy_files(d) +#' d_write <- d1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' } +#' @export +Wf_dragen <- R6::R6Class( + "Wf_dragen", + inherit = Wf, + public = list( + #' @field prefix The LibraryID prefix of the sample (needed for path lookup). + prefix = NULL, + #' @description Create a new Wf_dragen object. + #' @param path Path to directory with raw workflow results (from S3 or + #' local filesystem). + #' @param prefix The LibraryID prefix of the sample (needed for path lookup). 
+ initialize = function(path = NULL, prefix = NULL) { + wname <- "dragen" + pref <- prefix + tn1 <- "(|_tumor|_normal)" + regexes <- tibble::tribble( + ~regex, ~fun, + glue("{pref}\\-replay\\.json$"), "read_replay", + glue("{pref}\\.cnv_metrics.csv$"), "read_cnvMetrics", + glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.wgs_contig_mean_cov{tn1}\\.csv$"), "read_contigMeanCov", + glue("{pref}\\.exon_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.wgs_coverage_metrics{tn1}\\.csv$"), "read_coverageMetrics", + glue("{pref}\\.exon_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.target_bed_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.tmb_fine_hist\\.csv$"), "read_fineHist", + glue("{pref}\\.wgs_fine_hist{tn1}\\.csv$"), "read_fineHist", + glue("{pref}\\.exon_hist\\.csv$"), "read_hist", + glue("{pref}\\.target_bed_hist\\.csv$"), "read_hist", + glue("{pref}\\.tmb_hist\\.csv$"), "read_hist", + glue("{pref}\\.wgs_hist{tn1}\\.csv$"), "read_hist", + glue("{pref}\\.fastqc_metrics\\.csv$"), "read_fastqcMetrics", + glue("{pref}\\.fragment_length_hist\\.csv$"), "read_fragmentLengthHist", + glue("{pref}\\.gc_metrics\\.csv$"), "read_gcMetrics", + glue("{pref}\\.gvcf_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.mapping_metrics\\.csv$"), "read_mappingMetrics", + glue("{pref}\\.microsat_diffs\\.txt$"), "read_msiDiffs", + glue("{pref}\\.microsat_output\\.json$"), "read_msi", + glue("{pref}\\.sv_metrics\\.csv$"), "read_svMetrics", + glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", + glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", + glue("{pref}\\.umi_metrics\\.csv$"), "read_umiMetrics", + 
glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics", + glue("{pref}\\.ploidy_estimation_metrics\\.csv$"), "read_ploidyMetrics" + ) + + super$initialize(path = path, wname = wname, regexes = regexes) + self$prefix <- prefix + }, + #' @description Print details about the Workflow. + #' @param ... (ignored). + print = function(...) { + res <- tibble::tribble( + ~var, ~value, + "path", private$.path, + "wname", private$.wname, + "filesystem", private$.filesystem, + "prefix", self$prefix + ) + print(res) + invisible(self) + }, + #' @description Read `replay.json` file. + #' @param x Path to file. + read_replay = function(x) { + res <- x |> + jsonlite::read_json(simplifyVector = TRUE) |> + purrr::map_if(is.data.frame, tibble::as_tibble) + req_elements <- c("command_line", "hash_table_build", "dragen_config", "system") + assertthat::assert_that(all(names(res) %in% req_elements)) + res[["system"]] <- res[["system"]] |> + tibble::as_tibble_row() + res[["hash_table_build"]] <- res[["hash_table_build"]] |> + tibble::as_tibble_row() + # we don't care if the columns are characters, no analysis likely to be done on dragen options + # (though never say never!) + res[["dragen_config"]] <- res[["dragen_config"]] |> + tidyr::pivot_wider(names_from = "name", values_from = "value") + dat <- dplyr::bind_cols(res) + tibble::tibble(name = "replay", data = list(dat)) + }, + #' @description Read `contig_mean_cov.csv` file. + #' @param x Path to file. + #' @param keep_alt Keep ALT contigs. + read_contigMeanCov = function(x, keep_alt = FALSE) { + subprefix <- private$dragen_subprefix(x, "_contig_mean_cov") + dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> + dplyr::filter( + if (!keep_alt) { + !grepl("chrM|MT|_|Autosomal|HLA-|EBV|GL|hs37d5", .data$chrom) + } else { + TRUE + } + ) + tibble::tibble(name = glue("contigmeancov_{subprefix}"), data = list(dat[])) + }, + #' @description Read `coverage_metrics.csv` file. + #' @param x Path to file. 
+ read_coverageMetrics = function(x) { + subprefix <- private$dragen_subprefix(x, "_coverage_metrics") + dat <- dragen_coverage_metrics_read(x) + tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) + }, + #' @description Read `fine_hist.csv` file. + #' @param x Path to file. + read_fineHist = function(x) { + subprefix <- private$dragen_subprefix(x, "_fine_hist") + d <- readr::read_csv(x, col_types = "cd") + assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) + # there's a max Depth of 2000+, so convert to numeric for easier plotting + dat <- d |> + dplyr::mutate( + Depth = ifelse(grepl("+", .data$Depth), sub("(\\d*)\\+", "\\1", .data$Depth), .data$Depth), + Depth = as.integer(.data$Depth) + ) |> + dplyr::select(depth = "Depth", n_loci = "Overall") + tibble::tibble(name = glue("finehist_{subprefix}"), data = list(dat)) + }, + #' @description Read `fragment_length_hist.csv` file. + #' @param x Path to file. + read_fragmentLengthHist = function(x) { + d <- readr::read_lines(x) + assertthat::assert_that(grepl("#Sample", d[1])) + dat <- d |> + tibble::enframe(name = "name", value = "value") |> + dplyr::filter(!grepl("#Sample: |FragmentLength,Count", .data$value)) |> + tidyr::separate_wider_delim(cols = "value", names = c("fragmentLength", "count"), delim = ",") |> + dplyr::mutate( + count = as.numeric(.data$count), + fragmentLength = as.numeric(.data$fragmentLength) + ) |> + dplyr::select("fragmentLength", "count") + tibble::tibble(name = "fraglen", data = list(dat)) + }, + #' @description Read `mapping_metrics.csv` file. + #' @param x Path to file. + read_mappingMetrics = function(x) { + dat <- dragen_mapping_metrics_read(x) + tibble::tibble(name = "mapmetrics", data = list(dat)) + }, + #' @description Read `hist.csv` (not `fine_hist.csv`!) file. + #' @param x Path to file. 
+ read_hist = function(x) { + subprefix <- private$dragen_subprefix(x, "_hist") + d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") + dat <- d |> + dplyr::mutate( + var = sub("PCT of bases in .* with coverage ", "", .data$var), + var = gsub("\\[|\\]|\\(|\\)", "", .data$var), + var = gsub("x", "", .data$var), + var = gsub("inf", "Inf", .data$var) + ) |> + tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> + dplyr::mutate( + start = as.numeric(.data$start), + end = as.numeric(.data$end), + pct = round(.data$pct, 2), + cumsum = cumsum(.data$pct) + ) + tibble::tibble(name = glue("hist_{subprefix}"), data = list(dat)) + }, + #' @description Read `time_metrics.csv` file. + #' @param x Path to file. + read_timeMetrics = function(x) { + cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") + ct <- readr::cols( + .default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d" + ) + d <- readr::read_csv(x, col_names = cn, col_types = ct) + assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) + assertthat::assert_that(inherits(d$time_hrs, "hms")) + dat <- d |> + dplyr::mutate( + Step = tools::toTitleCase(sub("Time ", "", .data$Step)), + Step = gsub(" |/", "", .data$Step), + Time = substr(.data$time_hrs, 1, 5) + ) |> + dplyr::select("Step", "Time") |> + tidyr::pivot_wider(names_from = "Step", values_from = "Time") |> + dplyr::relocate("TotalRuntime") + tibble::tibble(name = "timemetrics", data = list(dat)) + }, + #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. + #' @param x Path to file. + read_vcMetrics = function(x) { + subprefix <- private$dragen_subprefix(x, "_metrics") + dat <- dragen_vc_metrics_read(x) + tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) + }, + #' @description Read `trimmer_metrics.csv` file. + #' @param x Path to file. 
+ read_trimmerMetrics = function(x) { + dat <- dragen_trimmer_metrics_read(x) + tibble::tibble(name = "trimmermetrics", data = list(dat[])) + }, + #' @description Read `sv_metrics.csv` file. + #' @param x Path to file. + read_svMetrics = function(x) { + dat <- dragen_sv_metrics_read(x) + tibble::tibble(name = "svmetrics", data = list(dat[])) + }, + #' @description Read `cnv_metrics.csv` file. + #' @param x Path to file. + read_cnvMetrics = function(x) { + dat <- dragen_cnv_metrics_read(x) + tibble::tibble(name = "cnvmetrics", data = list(dat[])) + }, + #' @description Read `fastqc_metrics.csv` file. + #' @param x Path to file. + read_fastqcMetrics = function(x) { + dat <- dragen_fastqc_metrics_read(x) + dat + }, + #' @description Read `gc_metrics.csv` file. + #' @param x Path to file. + read_gcMetrics = function(x) { + dat <- dragen_gc_metrics_read(x) + dat + }, + #' @description Read `umi_metrics.csv` file. + #' @param x Path to file. + read_umiMetrics = function(x) { + dat <- dragen_umi_metrics_read(x) + dat + }, + #' @description Read `ploidy_estimation_metrics.csv` file. + #' @param x Path to file. + read_ploidyMetrics = function(x) { + dat <- dragen_ploidy_estimation_metrics_read(x) + tibble::tibble(name = "ploidymetrics", data = list(dat)) + }, + #' @description Read `microsat_output.json` file. + #' @param x Path to file. + read_msi = function(x) { + dat <- tso_msi_read(x) + tibble::tibble(name = "msi", data = list(dat[])) + }, + #' @description Read `microsat_diffs.txt` file. + #' @param x Path to file. 
+ read_msiDiffs = function(x) { + dat <- readr::read_tsv(x, col_types = "cdccddc") |> + dplyr::rename(Chromosome = "#Chromosome") + tibble::tibble(name = "msidiffs", data = list(dat[])) + } + ), # end public + private = list( + dragen_subprefix = function(x, suffix) { + bname <- basename(x) + s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov + sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" + } + ) +) # end Wf_dragen diff --git a/R/tso_dragen.R b/R/tso_dragen.R index d4f7411..e69de29 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -1,302 +0,0 @@ -#' Wf_dragen R6 Class -#' -#' @description -#' Reads and writes tidy versions of files from the `dragen` workflow. -#' -#' @examples -#' \dontrun{ -#' -#' #---- Local ----# -#' prefix <- "L2401290" -#' p <- file.path( -#' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", -#' "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", -#' prefix -#' ) -#' d1 <- Wf_dragen$new(path = p, prefix = prefix) -#' d1$list_files(max_files = 100) -#' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) -#' d_tidy <- d1$tidy_files(d) -#' d_write <- d1$write( -#' d_tidy, -#' outdir = file.path(p, "dracarys_tidy"), -#' prefix = prefix, -#' format = "tsv" -#' ) -#' #---- GDS ----# -#' prefix <- "PRJ222358" -#' p <- file.path( -#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", -#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" -#' ) -#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case -#' d1 <- Wf_dragen$new(path = p, prefix = prefix) -#' d1$list_files(max_files = 100) -#' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) -#' d_tidy <- d1$tidy_files(d) -#' d_write <- d1$write( -#' d_tidy, -#' outdir = file.path(p, "dracarys_tidy"), -#' prefix = prefix, -#' format = "tsv" -#' ) -#' } 
-#' @export -Wf_dragen <- R6::R6Class( - "Wf_dragen", - inherit = Wf, - public = list( - #' @field prefix The LibraryID prefix of the sample (needed for path lookup). - prefix = NULL, - #' @description Create a new Wf_dragen object. - #' @param path Path to directory with raw workflow results (from S3 or - #' local filesystem). - #' @param prefix The LibraryID prefix of the sample (needed for path lookup). - initialize = function(path = NULL, prefix = NULL) { - wname <- "dragen" - pref <- prefix - tn1 <- "(|_tumor|_normal)" - regexes <- tibble::tribble( - ~regex, ~fun, - glue("{pref}\\-replay\\.json$"), "read_replay", - glue("{pref}\\.cnv_metrics.csv$"), "read_cnvMetrics", - glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.wgs_contig_mean_cov{tn1}\\.csv$"), "read_contigMeanCov", - glue("{pref}\\.exon_coverage_metrics\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.wgs_coverage_metrics{tn1}\\.csv$"), "read_coverageMetrics", - glue("{pref}\\.exon_fine_hist\\.csv$"), "read_fineHist", - glue("{pref}\\.target_bed_fine_hist\\.csv$"), "read_fineHist", - glue("{pref}\\.tmb_fine_hist\\.csv$"), "read_fineHist", - glue("{pref}\\.wgs_fine_hist{tn1}\\.csv$"), "read_fineHist", - glue("{pref}\\.exon_hist\\.csv$"), "read_hist", - glue("{pref}\\.target_bed_hist\\.csv$"), "read_hist", - glue("{pref}\\.tmb_hist\\.csv$"), "read_hist", - glue("{pref}\\.wgs_hist{tn1}\\.csv$"), "read_hist", - glue("{pref}\\.fastqc_metrics\\.csv$"), "read_fastqcMetrics", - glue("{pref}\\.fragment_length_hist\\.csv$"), "read_fragmentLengthHist", - glue("{pref}\\.gc_metrics\\.csv$"), "read_gcMetrics", - glue("{pref}\\.gvcf_metrics\\.csv$"), "read_vcMetrics", - 
glue("{pref}\\.mapping_metrics\\.csv$"), "read_mappingMetrics", - glue("{pref}\\.microsat_diffs\\.txt$"), "read_msiDiffs", - glue("{pref}\\.microsat_output\\.json$"), "read_msi", - glue("{pref}\\.sv_metrics\\.csv$"), "read_svMetrics", - glue("{pref}\\.time_metrics\\.csv$"), "read_timeMetrics", - glue("{pref}\\.trimmer_metrics\\.csv$"), "read_trimmerMetrics", - glue("{pref}\\.umi_metrics\\.csv$"), "read_umiMetrics", - glue("{pref}\\.vc_metrics\\.csv$"), "read_vcMetrics", - glue("{pref}\\.ploidy_estimation_metrics\\.csv$"), "read_ploidyMetrics" - ) - - super$initialize(path = path, wname = wname, regexes = regexes) - self$prefix <- prefix - }, - #' @description Print details about the Workflow. - #' @param ... (ignored). - print = function(...) { - res <- tibble::tribble( - ~var, ~value, - "path", private$.path, - "wname", private$.wname, - "filesystem", private$.filesystem, - "prefix", self$prefix - ) - print(res) - invisible(self) - }, - #' @description Read `replay.json` file. - #' @param x Path to file. - read_replay = function(x) { - res <- x |> - jsonlite::read_json(simplifyVector = TRUE) |> - purrr::map_if(is.data.frame, tibble::as_tibble) - req_elements <- c("command_line", "hash_table_build", "dragen_config", "system") - assertthat::assert_that(all(names(res) %in% req_elements)) - res[["system"]] <- res[["system"]] |> - tibble::as_tibble_row() - res[["hash_table_build"]] <- res[["hash_table_build"]] |> - tibble::as_tibble_row() - # we don't care if the columns are characters, no analysis likely to be done on dragen options - # (though never say never!) - res[["dragen_config"]] <- res[["dragen_config"]] |> - tidyr::pivot_wider(names_from = "name", values_from = "value") - dat <- dplyr::bind_cols(res) - tibble::tibble(name = "replay", data = list(dat)) - }, - #' @description Read `contig_mean_cov.csv` file. - #' @param x Path to file. - #' @param keep_alt Keep ALT contigs. 
- read_contigMeanCov = function(x, keep_alt = FALSE) { - subprefix <- private$dragen_subprefix(x, "_contig_mean_cov") - dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> - dplyr::filter( - if (!keep_alt) { - !grepl("chrM|MT|_|Autosomal|HLA-|EBV|GL|hs37d5", .data$chrom) - } else { - TRUE - } - ) - tibble::tibble(name = glue("contigmeancov_{subprefix}"), data = list(dat[])) - }, - #' @description Read `coverage_metrics.csv` file. - #' @param x Path to file. - read_coverageMetrics = function(x) { - subprefix <- private$dragen_subprefix(x, "_coverage_metrics") - dat <- dragen_coverage_metrics_read(x) - tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) - }, - #' @description Read `fine_hist.csv` file. - #' @param x Path to file. - read_fineHist = function(x) { - subprefix <- private$dragen_subprefix(x, "_fine_hist") - d <- readr::read_csv(x, col_types = "cd") - assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) - # there's a max Depth of 2000+, so convert to numeric for easier plotting - dat <- d |> - dplyr::mutate( - Depth = ifelse(grepl("+", .data$Depth), sub("(\\d*)\\+", "\\1", .data$Depth), .data$Depth), - Depth = as.integer(.data$Depth) - ) |> - dplyr::select(depth = "Depth", n_loci = "Overall") - tibble::tibble(name = glue("finehist_{subprefix}"), data = list(dat)) - }, - #' @description Read `fragment_length_hist.csv` file. - #' @param x Path to file. 
- read_fragmentLengthHist = function(x) { - d <- readr::read_lines(x) - assertthat::assert_that(grepl("#Sample", d[1])) - dat <- d |> - tibble::enframe(name = "name", value = "value") |> - dplyr::filter(!grepl("#Sample: |FragmentLength,Count", .data$value)) |> - tidyr::separate_wider_delim(cols = "value", names = c("fragmentLength", "count"), delim = ",") |> - dplyr::mutate( - count = as.numeric(.data$count), - fragmentLength = as.numeric(.data$fragmentLength) - ) |> - dplyr::select("fragmentLength", "count") - tibble::tibble(name = "fraglen", data = list(dat)) - }, - #' @description Read `mapping_metrics.csv` file. - #' @param x Path to file. - read_mappingMetrics = function(x) { - dat <- dragen_mapping_metrics_read(x) - tibble::tibble(name = "mapmetrics", data = list(dat)) - }, - #' @description Read `hist.csv` (not `fine_hist.csv`!) file. - #' @param x Path to file. - read_hist = function(x) { - subprefix <- private$dragen_subprefix(x, "_hist") - d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") - dat <- d |> - dplyr::mutate( - var = sub("PCT of bases in .* with coverage ", "", .data$var), - var = gsub("\\[|\\]|\\(|\\)", "", .data$var), - var = gsub("x", "", .data$var), - var = gsub("inf", "Inf", .data$var) - ) |> - tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate( - start = as.numeric(.data$start), - end = as.numeric(.data$end), - pct = round(.data$pct, 2), - cumsum = cumsum(.data$pct) - ) - tibble::tibble(name = glue("hist_{subprefix}"), data = list(dat)) - }, - #' @description Read `time_metrics.csv` file. - #' @param x Path to file. 
- read_timeMetrics = function(x) { - cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") - ct <- readr::cols( - .default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d" - ) - d <- readr::read_csv(x, col_names = cn, col_types = ct) - assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) - assertthat::assert_that(inherits(d$time_hrs, "hms")) - dat <- d |> - dplyr::mutate( - Step = tools::toTitleCase(sub("Time ", "", .data$Step)), - Step = gsub(" |/", "", .data$Step), - Time = substr(.data$time_hrs, 1, 5) - ) |> - dplyr::select("Step", "Time") |> - tidyr::pivot_wider(names_from = "Step", values_from = "Time") |> - dplyr::relocate("TotalRuntime") - tibble::tibble(name = "timemetrics", data = list(dat)) - }, - #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. - #' @param x Path to file. - read_vcMetrics = function(x) { - subprefix <- private$dragen_subprefix(x, "_metrics") - dat <- dragen_vc_metrics_read(x) - tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) - }, - #' @description Read `trimmer_metrics.csv` file. - #' @param x Path to file. - read_trimmerMetrics = function(x) { - dat <- dragen_trimmer_metrics_read(x) - tibble::tibble(name = "trimmermetrics", data = list(dat[])) - }, - #' @description Read `sv_metrics.csv` file. - #' @param x Path to file. - read_svMetrics = function(x) { - dat <- dragen_sv_metrics_read(x) - tibble::tibble(name = "svmetrics", data = list(dat[])) - }, - #' @description Read `cnv_metrics.csv` file. - #' @param x Path to file. - read_cnvMetrics = function(x) { - dat <- dragen_cnv_metrics_read(x) - tibble::tibble(name = "cnvmetrics", data = list(dat[])) - }, - #' @description Read `fastqc_metrics.csv` file. - #' @param x Path to file. - read_fastqcMetrics = function(x) { - dat <- dragen_fastqc_metrics_read(x) - dat - }, - #' @description Read `gc_metrics.csv` file. - #' @param x Path to file. 
- read_gcMetrics = function(x) { - dat <- dragen_gc_metrics_read(x) - dat - }, - #' @description Read `umi_metrics.csv` file. - #' @param x Path to file. - read_umiMetrics = function(x) { - dat <- dragen_umi_metrics_read(x) - dat - }, - #' @description Read `ploidy_estimation_metrics.csv` file. - #' @param x Path to file. - read_ploidyMetrics = function(x) { - dat <- dragen_ploidy_estimation_metrics_read(x) - tibble::tibble(name = "ploidymetrics", data = list(dat)) - }, - #' @description Read `microsat_output.json` file. - #' @param x Path to file. - read_msi = function(x) { - dat <- tso_msi_read(x) - tibble::tibble(name = "msi", data = list(dat[])) - }, - #' @description Read `microsat_diffs.txt` file. - #' @param x Path to file. - read_msiDiffs = function(x) { - dat <- readr::read_tsv(x, col_types = "cdccddc") |> - dplyr::rename(Chromosome = "#Chromosome") - tibble::tibble(name = "msidiffs", data = list(dat[])) - } - ), # end public - private = list( - dragen_subprefix = function(x, suffix) { - bname <- basename(x) - s1 <- sub("^.*\\.(.*?)\\..*$", "\\1", bname) # exon_contig_mean_cov - sub(suffix, "", s1) # sub("contig_mean_cov", "", s1) -> "exon" - } - ) -) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index a3fc949..eeeabf1 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tso_dragen.R +% Please edit documentation in R/dragen.R \name{Wf_dragen} \alias{Wf_dragen} \title{Wf_dragen R6 Class} diff --git a/man/dtw_Wf_dragen.Rd b/man/dtw_Wf_dragen.Rd new file mode 100644 index 0000000..e5ecf7a --- /dev/null +++ b/man/dtw_Wf_dragen.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dtw_Wf_dragen} +\alias{dtw_Wf_dragen} +\title{Wf_dragen Download Tidy and Write} +\usage{ +dtw_Wf_dragen( + path, + prefix, + outdir, + outdir_tidy = file.path(outdir, "dracarys_tidy"), + format = "rds", + 
max_files = 1000, + dryrun = FALSE +) +} +\arguments{ +\item{path}{Path to directory with raw workflow results (S3 or local filesystem).} + +\item{prefix}{The LibraryID prefix of the sample.} + +\item{outdir}{Path to output directory with raw files.} + +\item{outdir_tidy}{Path to output directory with tidy files.} + +\item{format}{Format of output files.} + +\item{max_files}{Max number of files to list.} + +\item{dryrun}{If TRUE, just list the files that will be downloaded (don't +download them).} +} +\value{ +Tibble of tidy tibbles. +} +\description{ +Downloads files from the \code{dragen} workflow and writes them in a tidy format. +} +\examples{ +\dontrun{ +#---- Local ----# + +#---- S3 ----# +path <- file.path( + "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", + "analysis/wgts-qc/20241123ffa837c4/L2401621_dragen_alignment" +) +prefix <- "L2401621" +outdir <- sub("s3:/", "~/s3", path) +dragen_tidy <- dtw_Wf_dragen( + path = path, prefix = prefix, outdir = outdir, + format = "tsv", + dryrun = F +) +} +} From 00c820e026581f0f29b496a4bc77e2711e9a7eb2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 24 Nov 2024 22:47:34 +1100 Subject: [PATCH 27/32] alignqc: refactor s3 dl_and_tidy script --- .../alignment_qc/dl_and_tidy.R | 139 +++++++++++------- man/s3_file_presignedurl.Rd | 2 +- 2 files changed, 87 insertions(+), 54 deletions(-) diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R index 9fbfa12..559290d 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R @@ -5,26 +5,38 @@ require(dracarys, include.only = "umccr_tidy") require(glue, include.only = "glue") require(here, include.only = "here") - require(rportal, include.only = c("portaldb_query_workflow")) + require(rportal, include.only = c("orca_workflow_list")) + require(stringr, include.only = "str_remove_all") + require(tidyr, include.only = 
"unnest") + require(fs, include.only = "dir_create") } -# make sure you have logged into AWS and ICA +# make sure you have logged into AWS c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION") |> rportal::envvar_defined() |> stopifnot() -icav1_token <- Sys.getenv("ICA_ACCESS_TOKEN") |> - dracarys::ica_token_validate() - -query_workflow_alignqc <- function(start_date) { - wfs <- c("wgs_alignment_qc", "wts_alignment_qc") |> - shQuote() |> - paste(collapse = ", ") - q1 <- glue( - "WHERE \"type_name\" IN ({wfs}) AND \"start\" > date(\'{start_date}\') ", - "ORDER BY \"start\" DESC;" - ) - rportal::portaldb_query_workflow(q1) -} +token <- rportal::orca_jwt() |> + rportal::jwt_validate() +dates <- c( + "2024-11-23", + "2024-11-24" +) |> + stringr::str_remove_all("-") |> + paste(collapse = "|") +wf0 <- rportal::orca_workflow_list(wf_name = "wgts-qc", token = token, page_size = 500) +# get pld +wf1 <- wf0 |> + filter(grepl(dates, .data$portalRunId)) |> + rowwise() |> + mutate(pld = list(rportal::orca_wfrid2payload(wfrid = .data$orcabusId, token = token))) |> + ungroup() +# tidy pld +wf2 <- wf1 |> + rowwise() |> + mutate(pld_tidy = list(rportal::pld_wgtsqc(.data$pld))) |> + ungroup() |> + select(workflowRunId = "orcabusId", portalRunId, currentStateTimestamp, pld_tidy) |> + tidyr::unnest(pld_tidy) query_limsrow_libids <- function(libids) { assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) @@ -34,61 +46,82 @@ query_limsrow_libids <- function(libids) { rportal::portaldb_query_limsrow(q1) } -# first read in the workflows table, extract metadata, then join with lims -start_date <- "2024-10-11" -p_raw <- query_workflow_alignqc(start_date) +lims0 <- query_limsrow_libids(wf2$libraryId) -wgs <- p_raw |> - rportal::meta_wgs_alignment_qc(status = "Succeeded") -wts <- p_raw |> - rportal::meta_wts_alignment_qc(status = "Succeeded") -p <- bind_rows(wgs, wts) -lims_raw <- query_limsrow_libids(p$LibraryID) - -lims <- lims_raw |> +lims1 <- lims0 |> 
tidyr::separate_wider_delim( library_id, delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" ) |> select( - subject_id, library_id, sample_id, sample_name, - external_subject_id, external_sample_id, - project_name, project_owner, phenotype, type, - source, assay, quality, workflow + individualId = "subject_id", + libraryId = "library_id", + sampleId = "sample_id", + sampleName = "sample_name", + subjectId = "external_subject_id", + externalSampleId = "external_sample_id", + projectName = "project_name", + projectOwner = "project_owner", + phenotype, type, source, assay, quality, workflow ) |> distinct() -d <- p |> - left_join(lims, by = c("SubjectID" = "subject_id", "LibraryID" = "library_id")) |> +wf_lims <- wf2 |> + left_join(lims1, by = "libraryId") |> select( - "SubjectID", "LibraryID", "SampleID", "lane", "phenotype", "type", "source", - "assay", "workflow", "external_subject_id", "project_name", "project_owner", - "start", "end", "portal_run_id", "gds_outdir_dragen", "fq1", "fq2" + "libraryId", "individualId", "sampleId", "sampleName", "subjectId", + "externalSampleId", "projectName", "projectOwner", + lane = "input_lane", + "phenotype", "sampleType", + date = "currentStateTimestamp", + "source", "assay", "quality", "workflow", + "portalRunId", "output_dragenAlignmentOutputUri", + "input_read1FileUri", "input_read2FileUri", ) |> - mutate(rownum = row_number()) - -tidy_script <- system.file("cli/dracarys.R", package = "dracarys") + mutate(rownum = row_number()) |> + relocate("rownum") +# set up progress bar for the dtw function +nticks <- nrow(wf_lims) +bar_width <- 50 +pb <- progress::progress_bar$new( + format = "[:bar] :current/:total (:percent) elapsed :elapsedfull eta :eta", + total = nticks, clear = FALSE, + show_after = 0, width = bar_width +) +# wrapping the dtw function to use the progress bar +fun1 <- function(path, prefix, outdir) { + pb$tick(0) + res <- dracarys::dtw_Wf_dragen( + path = path, prefix = prefix, + outdir = 
outdir, format = "rds", + max_files = 1000, + dryrun = FALSE + ) + pb$tick() + return(res) +} -meta <- d |> - relocate(rownum) |> +data_tidy <- wf_lims |> rowwise() |> mutate( - indir = gds_outdir_dragen, - outdir = file.path(sub("gds://", "", .data$indir)), - outdir = file.path(normalizePath("~/icav1/g"), .data$outdir), - # indir = file.path(outdir, "dracarys_gds_sync"), # for when debugging locally - cmd = system( - glue( - "echo ---{.data$rownum}--- && ", - "{tidy_script} tidy --in_dir {.data$indir} ", - "--out_dir {.data$outdir} --prefix {.data$SampleID} ", - "--token {icav1_token} ", - "--format rds" + indir = .data$output_dragenAlignmentOutputUri, + outdir = file.path(sub("s3://", "", .data$indir)), + outdir = file.path(normalizePath("~/s3"), .data$outdir) + # indir = file.path(outdir, "dracarys_s3_sync"), # for when debugging locally + ) |> + mutate( + data_tidy = list( + fun1( + path = .data$indir, + prefix = .data$libraryId, + outdir = .data$outdir ) ) ) |> ungroup() -meta |> - saveRDS(here(glue("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/{start_date}_wgts.rds"))) +outdir1 <- fs::dir_create("inst/rmd/umccr_workflows/alignment_qc/nogit/tidy_data_rds") +date1 <- "2024-11-24" +data_tidy |> + saveRDS(here(glue("{outdir1}/{date1}_wgts.rds"))) diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd index 79eb7d6..f7dddd2 100644 --- a/man/s3_file_presignedurl.Rd +++ b/man/s3_file_presignedurl.Rd @@ -4,7 +4,7 @@ \alias{s3_file_presignedurl} \title{S3 Generate Presigned URL} \usage{ -s3_file_presignedurl(client, s3path, expiry_seconds = 3600) +s3_file_presignedurl(client, s3path, expiry_seconds = 604800) } \arguments{ \item{client}{S3 client. 
Make sure you use \code{signature_version = "s3v4"} (see example).} From 814df0c44265c122b8369c2d87a1bc4d9418a0c5 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 24 Nov 2024 23:08:23 +1100 Subject: [PATCH 28/32] alignqc: summary.Rmd -> summary.qmd --- .../alignment_qc/summary.Rmd => reports/wgts-qc/summary.qmd} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename inst/{rmd/umccr_workflows/alignment_qc/summary.Rmd => reports/wgts-qc/summary.qmd} (100%) diff --git a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd b/inst/reports/wgts-qc/summary.qmd similarity index 100% rename from inst/rmd/umccr_workflows/alignment_qc/summary.Rmd rename to inst/reports/wgts-qc/summary.qmd From 592f3cd6af6e54b48d1ec4bd0f0fde2e24edd92f Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 25 Nov 2024 01:08:40 +1100 Subject: [PATCH 29/32] alignqc: refactor summary report --- inst/reports/wgts-qc/summary.qmd | 375 ++++++++++++++++--------------- 1 file changed, 191 insertions(+), 184 deletions(-) diff --git a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 95807e2..51b729b 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -1,36 +1,37 @@ --- -author: "University of Melbourne Centre for Cancer Research" -date: "`r Sys.time()`" -output: - html_document: - toc: true - theme: cosmo - rmdformats::material: - highlight: kate +title: "WGTS Alignment QC Summary" +author: "CCGCM - Genomics Platform Group" +date: now +date-format: "YYYY-MM-DD HH:mm Z" +execute: + echo: false +format: + html: + toc: false + toc-expand: 1 + toc-title: Contents + toc-location: body + highlight-style: github + number-sections: false + link-external-icon: true + link-external-newwindow: true + embed-resources: true + code-copy: true + code-link: true + code-fold: true + code-block-border-left: true + smooth-scroll: true + grid: + body-width: 1300px params: - title: "UMCCR Alignment QC Summary Report" - meta: !r 
here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-10-11_wgts.rds") -description: "UMCCR Alignment QC Summary Report" -title: "`r params$title`" + tidy_data: "~/projects/dracarys/inst/rmd/umccr_workflows/alignment_qc/nogit/tidy_data_rds/2024-11-24_wgts.rds" --- -```{r knitr_opts, include=F} -knitr::opts_chunk$set( - collapse = TRUE, echo = FALSE, - warning = FALSE, message = FALSE, - fig.width = 10, fig.height = 15 -) -``` - -```{css} -.main-container { - max-width: 1400px !important; - margin-left: auto; - margin-right: auto; -} -``` +```{r} +#| label: pkgs +#| message: false +#| warning: false -```{r load_pkgs} { require(dplyr) require(dracarys, include.only = "session_info_kable") @@ -49,92 +50,103 @@ knitr::opts_chunk$set( } ``` -```{r data_setup} -ggplot2::theme_set(ggplot2::theme_bw()) -meta <- params[["meta"]] |> +```{r} +#| label: data_import +tidy_data_path <- params[["tidy_data"]] +d0 <- tidy_data_path |> readr::read_rds() |> - mutate(topup_or_rerun = stringr::str_extract(fq1, "topup|rerun(2)?")) + slice(1:4) |> + mutate( + umccrId = glue("{.data$individualId}_{.data$libraryId}_{.data$lane}"), + umccrId = factor(.data$umccrId), + projectOwnerName = glue("{.data$projectOwner}_{.data$projectName}") + ) |> + select( + "umccrId", + "subjectId", + "libraryId", + "projectOwnerName", + "sampleType", + "phenotype", + "source", + "quality", + "assay", + "workflow", + "portalRunId", + "date", + "data_tidy" + ) +``` + +```{r} +#| label: data_setup +ggplot2::theme_set(ggplot2::theme_bw()) stopifnot(all(dir.exists(meta$outdir))) options(scipen = 999) # disable scientific notation options(width = 150) -filepaths <- function(indir, sampleid, suffix = "rds") { - tibble::tibble( - ftype = c( - paste0( - "FastqcMetricsFile_", - c( - "positional_base_content", "positional_base_mean_quality", - "positional_quality", "read_gc_content", "read_gc_content_quality", - "read_lengths", "read_mean_quality", "sequence_positions" - ) - ), - "FragmentLengthHistFile", - 
"MappingMetricsFile", - "PloidyEstimationMetricsFile", - "ReplayFile", - "TimeMetricsFile", - "TrimmerMetricsFile", - "WgsContigMeanCovFile", - "WgsCoverageMetricsFile", - "WgsFineHistFile", - "WgsHistFile" - ) - ) |> - mutate( - fpath = file.path(indir, glue("{sampleid}_{.data$ftype}.{suffix}")), - file_exists = file.exists(.data$fpath) - ) -} -dat <- meta |> - rowwise() |> - mutate( - fpaths = list(filepaths(indir = .data$outdir, sampleid = .data$SampleID)), - umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}"), - umccrid = if_else(is.na(.data$topup_or_rerun), .data$umccrid, glue("{umccrid}_{.data$topup_or_rerun}")) - ) |> - select("umccrid", "phenotype", "type", "source", "fpaths") |> - tidyr::unnest(fpaths) |> - filter(.data$file_exists) |> - rowwise() |> - mutate( - dat = list(readr::read_rds(.data$fpath)) +d_unnest <- d0 |> + select( + "umccrId", "libraryId", "subjectId", + type = "sampleType", "phenotype", "source", + "quality", "assay", "workflow", "projectOwnerName", "portalRunId", tidy = "data_tidy" ) |> - ungroup() - -eval <- dat |> - group_by(ftype) |> - count(file_exists, name = "nf") |> - mutate(eval = nf > 0) |> - select("ftype", "eval") |> - tibble::deframe() |> - as.list() - -# filetype-specific access -d <- dat |> - select("umccrid", "phenotype", "type", "source", "ftype", "dat") |> - tidyr::nest(data = c("umccrid", "phenotype", "type", "source", "dat")) + tidyr::unnest("tidy", names_sep = "_") +# tablename-specific access +# columns: tidy_name, data +# rows: 1 per tidy_table name +d_name <- d_unnest |> + tidyr::nest(.by = "tidy_name", .key = "data") + +## A tibble: 18 × 2 +# tidy_name data +# +# 1 contigmeancov_wgs +# 2 covmetrics_wgs +# 3 finehist_wgs +# 4 fqc_positionalBaseContent +# 5 fqc_positionalBaseMeanQuality +# 6 fqc_positionalQuality +# 7 fqc_readGCContent +# 8 fqc_readGCContentQuality +# 9 fqc_readLengths +# 10 fqc_readMeanQuality +# 11 fqc_sequencePositions +# 12 fraglen +# 13 hist_wgs +# 14 mapmetrics +# 15 
ploidymetrics +# 16 replay +# 17 timemetrics +# 18 trimmermetrics # sample-specific access -# d_samp <- dat |> -# select("umccrid", "phenotype", "type", "ftype", "dat") |> -# tidyr::nest(data = c("phenotype", "type", "ftype", "dat")) |> -# arrange(desc("umccrid")) +# columns: umccrId, data +# rows: 1 per umccrId +d_samp <- d_unnest |> + tidyr::nest(.by = "umccrId", .key = "data") |> + arrange(desc(.data$umccrId)) +## A tibble: 4 × 2 +# umccrId data +# +# 1 SBJ05890_L2401624 +# 2 SBJ05889_L2401644 +# 3 SBJ05888_L2401621 +# 4 SBJ05856_L2401572 + +dr_unnest <- function(x1, ...) { + d_name |> + dplyr::filter(.data$tidy_name == x1) |> + tidyr::unnest("data") |> + dplyr::mutate(nrows = purrr::map_int(.data$tidy_data, nrow)) |> + dplyr::filter(.data$nrows > 0) |> + dplyr::select(dplyr::everything(), -c("tidy_name", "nrows")) |> + dplyr::relocate("tidy_data", .after = dplyr::last_col()) |> + tidyr::unnest("tidy_data") +} ``` ```{r funcs} -dr_unnest <- function(x1) { - d |> - filter(.data$ftype == x1) |> - tidyr::unnest(data) |> - rowwise() |> - mutate(nrows = nrow(.data$dat)) |> - ungroup() |> - filter(nrows > 0) |> - tidyr::unnest(dat) |> - select("umccrid", "phenotype", "type", "source", everything(), -c("ftype", "nrows")) -} - dt_view <- function(x, caption = NULL, scroll_y = 10 + min(nrow(x) * 35, 570), ...) 
{ x |> DT::datatable( @@ -155,15 +167,12 @@ blank_lines <- function(n = 10) { cat(rep("  ", n), sep = "\n") } -get_sbj_url <- function(x, colour = NULL, account = "pro") { - assertthat::assert_that(account %in% c("pro", "stg", "dev")) - account <- ifelse(account == "pro", "", account) - sbj_url <- glue("https://portal{account}.umccr.org/subjects/{x}/overview") +get_lib_url <- function(lid, text, colour = NULL) { + url <- glue("https://orcaui.umccr.org/lab?tab=library&search={lid}") if (!is.null(colour)) { - return(glue("{x}")) + return(glue("{text}")) } - sbj_url <- glue("{x}") - sbj_url + return(glue("{text}")) } type_col <- list( @@ -175,17 +184,13 @@ type_col <- list( ## Sample Metadata ```{r meta} -meta |> - arrange(desc(SubjectID), type, LibraryID, lane) |> +d0 |> + arrange(desc(.data$umccrId), libraryId, sampleType) |> mutate( - SubjectID = get_sbj_url(.data$SubjectID), - durationMin = round(end - start) - ) |> - select( - SubjectID, type, LibraryID, lane, durationMin, topup_or_rerun, - everything(), - -c("rownum", "indir", "outdir", "cmd", "fq1", "fq2") + umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), + date_analysis_end = lubridate::ymd_hms(.data$date) ) |> + select(umccrId, type = "sampleType", everything(), -data_tidy) |> dt_view(escape = FALSE) |> DT::formatStyle( "type", @@ -200,33 +205,39 @@ meta |> ### Mapping ```{r mm, eval=eval$MappingMetricsFile} -d_map <- dr_unnest("MappingMetricsFile") |> - arrange(desc(umccrid), type) |> +d_map <- dr_unnest("mapmetrics") |> + arrange(desc(umccrId), type) |> + mutate( + umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), + RG = ifelse(.data$RG == "Total", "Total", "RG") + ) |> select( - umccrid, phenotype, type, - source, - tot = reads_tot_rg_dragen, - dup = reads_num_dupmarked_dragen, - `dup%` = reads_num_dupmarked_dragen_pct, - `map%` = reads_mapped_dragen_pct, - `unmap%` = reads_unmapped_dragen_pct, - `uniq%` = reads_num_uniq_dragen_pct, - `uniq_map%` = 
reads_num_uniq_mapped_dragen_pct, - `paired%` = reads_paired_dragen_pct, - `paired_proper%` = reads_paired_proper_dragen_pct, - `singleton%` = reads_singleton_dragen_pct, - `discordant%` = reads_discordant_dragen_pct, - `rrna_filt%` = reads_rrna_filtered_dragen_pct, - `splicejunc%` = reads_splicejunc_dragen_pct, - `mapq_0-10%` = reads_mapq_0_10_dragen_pct, - `mapq_10-20%` = reads_mapq_10_20_dragen_pct, - `mapq_20-30%` = reads_mapq_20_30_dragen_pct, - `mapq_30-40%` = reads_mapq_30_40_dragen_pct, - `mapq_40-Inf%` = reads_mapq_40_inf_dragen_pct, - read_len = read_len_dragen, - insert_len_med = insert_len_median_dragen, - insert_len_mean = insert_len_mean_dragen, - everything() + umccrId, subjectId, + phenotype, type, + source, quality, assay, workflow, projectOwnerName, portalRunId, RG, + tot_reads = reads_tot_rg, + dup_reads = reads_num_dupmarked, + `dup%` = reads_num_dupmarked_pct, + `map%` = reads_mapped_pct, + `unmap%` = reads_unmapped_pct, + `uniq%` = reads_num_uniq_pct, + `uniq_map%` = reads_num_uniq_mapped_pct, + `paired%` = reads_paired_pct, + `paired_proper%` = reads_paired_proper_pct, + `singleton%` = reads_singleton_pct, + `discordant%` = reads_discordant_pct, + `rrna_filt%` = reads_rrna_filtered_pct, + `splicejunc%` = reads_splicejunc_pct, + `mapq_0-10%` = reads_mapq_0_10_pct, + `mapq_10-20%` = reads_mapq_10_20_pct, + `mapq_20-30%` = reads_mapq_20_30_pct, + `mapq_30-40%` = reads_mapq_30_40_pct, + `mapq_40-Inf%` = reads_mapq_40_inf_pct, + read_len = read_len, + insert_len_med = insert_len_median, + insert_len_mean = insert_len_mean, + everything(), + -c("libraryId", "tidy_prefix", "dragen_sample") ) num_cols <- purrr::map_lgl(d_map, is.numeric) num_pct_cols <- grepl("%", names(d_map)) & num_cols @@ -239,14 +250,7 @@ conf <- list( pink_range = c(8, 20) ) d_map |> - left_join( - meta |> - mutate(umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")) |> - select(umccrid, assay, workflow, project_name, project_owner), - by = "umccrid" - ) |> - 
select(umccrid, phenotype, type, source, assay, workflow, project_name, project_owner, everything()) |> - dt_view(scroll_y = 1500) |> + dt_view(scroll_y = 1500, escape = FALSE) |> DT::formatCurrency(columns = names(d_map)[num_pct_cols], currency = "", digits = 1) |> DT::formatCurrency(columns = names(d_map)[no_numpct_cols], currency = "", digits = 0) |> DT::formatStyle( @@ -276,49 +280,52 @@ d_map |> - Ploidy metrics only for **WGS**. ```{r covm, eval=eval$WgsCoverageMetricsFile} -d_pl <- dr_unnest("PloidyEstimationMetricsFile") |> - arrange(desc(umccrid)) +d_pl <- dr_unnest("ploidymetrics") |> + arrange(desc(umccrId)) d_pl_metrics <- d_pl |> select( - umccrid, phenotype, type, source, - ploidy = ploidy_est_dragen, - cvg_auto_med_ploidy = cov_auto_median_dragen, - cvg_x_med_ploidy = cov_x_median_dragen, - cvg_y_med_ploidy = cov_y_median_dragen + umccrId, subjectId, + phenotype, type, + source, quality, assay, workflow, projectOwnerName, portalRunId, + ploidy = ploidy_est, + cvg_auto_med_ploidy = cov_autosomal_median, + cvg_x_med_ploidy = cov_x_median, + cvg_y_med_ploidy = cov_y_median ) # cov_genome_pct_* metrics are in the Hist data, so filter out here -d_cvg <- dr_unnest("WgsCoverageMetricsFile") |> - arrange(desc(umccrid)) |> - left_join(d_pl_metrics, by = c("umccrid", "phenotype", "type", "source")) |> +d_cvg <- dr_unnest("covmetrics_wgs") |> + arrange(desc(umccrId)) |> + left_join(d_pl_metrics, by = c( + "umccrId", "subjectId", "phenotype", "type", "source", + "quality", "assay", + "workflow", "projectOwnerName", "portalRunId" + )) |> + mutate(umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId)) |> select( - umccrid, phenotype, type, source, + "umccrId", "phenotype", "type", "source", + "quality", "assay", + "workflow", "projectOwnerName", "portalRunId", ploidy, - cvg_auto_avg = cov_avg_auto_over_genome_dragen, - cvg_auto_med = cov_median_auto_over_genome_dragen, - cvg_x_avg = cov_avg_x_over_genome_dragen, - cvg_y_avg = 
cov_avg_y_over_genome_dragen, - cvg_uniq = cov_alignment_avg_over_genome_dragen, - cvg_mito_avg = cov_avg_mt_over_genome_dragen, + cvg_auto_avg = cov_avg_auto_over_genome, + cvg_auto_med = cov_median_auto_over_genome, + cvg_x_avg = cov_avg_x_over_genome, + cvg_y_avg = cov_avg_y_over_genome, + cvg_uniq = cov_alignment_avg_over_genome, + cvg_mito_avg = cov_avg_mt_over_genome, cvg_auto_med_ploidy, cvg_x_med_ploidy, cvg_y_med_ploidy, - reads_aligned_dragen, - bases_aligned_dragen, - cvg_gt02 = cov_uniformity_over_genome_pct_gt02mean_dragen, - cvg_gt04 = cov_uniformity_over_genome_pct_gt04mean_dragen, + reads_aligned_tot, + bases_aligned_tot, + cvg_gt02 = cov_uniformity_pct_gt02mean_genome, + cvg_gt04 = cov_uniformity_pct_gt04mean_genome, , everything(), - -contains("cov_genome_pct_") + -contains("cov_pct_"), + -c("libraryId", "tidy_prefix") ) num_cols <- names(d_cvg)[purrr::map_lgl(d_cvg, is.numeric)] d_cvg |> - left_join( - meta |> - mutate(umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")) |> - select(umccrid, assay, workflow, project_name, project_owner), - by = "umccrid" - ) |> - select(umccrid, phenotype, type, source, assay, workflow, project_name, project_owner, everything()) |> - dt_view(scroll_y = 1500) |> + dt_view(scroll_y = 1500, escape = FALSE) |> DT::formatCurrency(columns = num_cols, currency = "", digits = 1) |> DT::formatStyle( "ploidy", @@ -337,7 +344,7 @@ d_cvg |> ### Trimmer ```{r trim, eval=eval$TrimmerMetricsFile} -d_tr <- dr_unnest("TrimmerMetricsFile") |> +d_tr <- dr_unnest("trimmermetrics") |> arrange(desc(umccrid)) |> select( umccrid, phenotype, type, source, From 661e01ee51535c44bad5288e46434337e4d2e065 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 1 Dec 2024 15:06:25 +1100 Subject: [PATCH 30/32] alignqc: refactor summary report --- inst/reports/wgts-qc/summary.qmd | 286 +++++++++--------- .../alignment_qc/dl_and_tidy.R | 4 +- 2 files changed, 143 insertions(+), 147 deletions(-) diff --git 
a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 51b729b..3601013 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -13,7 +13,7 @@ format: toc-location: body highlight-style: github number-sections: false - link-external-icon: true + link-external-icon: false link-external-newwindow: true embed-resources: true code-copy: true @@ -55,7 +55,6 @@ params: tidy_data_path <- params[["tidy_data"]] d0 <- tidy_data_path |> readr::read_rds() |> - slice(1:4) |> mutate( umccrId = glue("{.data$individualId}_{.data$libraryId}_{.data$lane}"), umccrId = factor(.data$umccrId), @@ -81,7 +80,6 @@ d0 <- tidy_data_path |> ```{r} #| label: data_setup ggplot2::theme_set(ggplot2::theme_bw()) -stopifnot(all(dir.exists(meta$outdir))) options(scipen = 999) # disable scientific notation options(width = 150) @@ -146,7 +144,8 @@ dr_unnest <- function(x1, ...) { } ``` -```{r funcs} +```{r} +#| label: funcs dt_view <- function(x, caption = NULL, scroll_y = 10 + min(nrow(x) * 35, 570), ...) { x |> DT::datatable( @@ -183,14 +182,15 @@ type_col <- list( ## Sample Metadata -```{r meta} +```{r} +#| label: meta d0 |> arrange(desc(.data$umccrId), libraryId, sampleType) |> mutate( umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), date_analysis_end = lubridate::ymd_hms(.data$date) ) |> - select(umccrId, type = "sampleType", everything(), -data_tidy) |> + select(umccrId, type = "sampleType", everything(), -data_tidy, -date) |> dt_view(escape = FALSE) |> DT::formatStyle( "type", @@ -204,7 +204,8 @@ d0 |> ### Mapping -```{r mm, eval=eval$MappingMetricsFile} +```{r} +#| label: mapmetrics d_map <- dr_unnest("mapmetrics") |> arrange(desc(umccrId), type) |> mutate( @@ -279,7 +280,8 @@ d_map |> - Ploidy metrics only for **WGS**. 
-```{r covm, eval=eval$WgsCoverageMetricsFile} +```{r} +#| label: covmetrics d_pl <- dr_unnest("ploidymetrics") |> arrange(desc(umccrId)) d_pl_metrics <- d_pl |> @@ -343,38 +345,45 @@ d_cvg |> ### Trimmer -```{r trim, eval=eval$TrimmerMetricsFile} +```{r} +#| label: trimmermetrics d_tr <- dr_unnest("trimmermetrics") |> - arrange(desc(umccrid)) |> + arrange(desc(umccrId), type) |> + mutate( + umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId) + ) |> select( - umccrid, phenotype, type, source, - reads_tot = reads_tot_input_dragen, - read_len_avg = read_len_avg_dragen, - `polygkmers3r1_remain%` = polygkmers3r1_remaining_dragen_pct, - `polygkmers3r2_remain%` = polygkmers3r2_remaining_dragen_pct, - `polyg_soft_trimmed_reads_unfilt_3r1%` = polyg_soft_trimmed_reads_unfilt_3r1_dragen_pct, - `polyg_soft_trimmed_reads_unfilt_3r2%` = polyg_soft_trimmed_reads_unfilt_3r2_dragen_pct, - `polyg_soft_trimmed_bases_unfilt_3r1%` = polyg_soft_trimmed_bases_unfilt_3r1_dragen_pct, - `polyg_soft_trimmed_bases_unfilt_3r2%` = polyg_soft_trimmed_bases_unfilt_3r2_dragen_pct, - polygkmers3r1_remaining = polygkmers3r1_remaining_dragen, - polygkmers3r2_remaining = polygkmers3r2_remaining_dragen, - polyg_soft_trimmed_reads_unfilt_3r1 = polyg_soft_trimmed_reads_unfilt_3r1_dragen, - polyg_soft_trimmed_reads_unfilt_3r2 = polyg_soft_trimmed_reads_unfilt_3r2_dragen, - polyg_soft_trimmed_bases_unfilt_3r1 = polyg_soft_trimmed_bases_unfilt_3r1_dragen, - polyg_soft_trimmed_bases_unfilt_3r2 = polyg_soft_trimmed_bases_unfilt_3r2_dragen, - bases_tot = bases_tot_dragen, - bases_r1 = bases_r1_dragen, - bases_r2 = bases_r2_dragen, - reads_trimmed_tot = reads_trimmed_tot_dragen, - `reads_trimmed_tot%` = reads_trimmed_tot_dragen_pct, - bases_trimmed_tot = bases_trimmed_tot_dragen, - `bases_trimmed_tot%` = bases_trimmed_tot_dragen_pct, - reads_tot_filt = reads_tot_filt_dragen, - `reads_tot_filt%` = reads_tot_filt_dragen_pct, - everything() + umccrId, subjectId, + phenotype, type, + source, 
quality, assay, workflow, projectOwnerName, portalRunId, + reads_tot = reads_tot_input, + read_len_avg = read_len_avg, + `polygkmers3r1_remain%` = polygkmers3r1_remaining_pct, + `polygkmers3r2_remain%` = polygkmers3r2_remaining_pct, + `polyg_soft_trimmed_reads_unfilt_3r1%` = polyg_soft_trimmed_reads_unfilt_3r1_pct, + `polyg_soft_trimmed_reads_unfilt_3r2%` = polyg_soft_trimmed_reads_unfilt_3r2_pct, + `polyg_soft_trimmed_bases_unfilt_3r1%` = polyg_soft_trimmed_bases_unfilt_3r1_pct, + `polyg_soft_trimmed_bases_unfilt_3r2%` = polyg_soft_trimmed_bases_unfilt_3r2_pct, + polygkmers3r1_remaining = polygkmers3r1_remaining, + polygkmers3r2_remaining = polygkmers3r2_remaining, + polyg_soft_trimmed_reads_unfilt_3r1 = polyg_soft_trimmed_reads_unfilt_3r1, + polyg_soft_trimmed_reads_unfilt_3r2 = polyg_soft_trimmed_reads_unfilt_3r2, + polyg_soft_trimmed_bases_unfilt_3r1 = polyg_soft_trimmed_bases_unfilt_3r1, + polyg_soft_trimmed_bases_unfilt_3r2 = polyg_soft_trimmed_bases_unfilt_3r2, + bases_tot = bases_tot, + bases_r1 = bases_r1, + bases_r2 = bases_r2, + reads_trimmed_tot = reads_trimmed_tot, + `reads_trimmed_tot%` = reads_trimmed_tot_pct, + bases_trimmed_tot = bases_trimmed_tot, + `bases_trimmed_tot%` = bases_trimmed_tot_pct, + reads_tot_filt = reads_tot_filt, + `reads_tot_filt%` = reads_tot_filt_pct, + everything(), + -c("libraryId", "tidy_prefix") ) d_tr |> - dt_view() |> + dt_view(escape = FALSE) |> DT::formatStyle( "type", color = DT::styleEqual( @@ -391,9 +400,11 @@ in the legend. 
### Read Mean Quality ('Per-Sequence Quality Scores') -```{r read_mean_qual, fig.height=10} -f1 <- dr_unnest("FastqcMetricsFile_read_mean_quality") |> - group_by(umccrid, mate) |> +```{r} +#| label: fqc_readMeanQuality +#| fig-height: 10 +f1 <- dr_unnest("fqc_readMeanQuality") |> + group_by(umccrId, mate) |> mutate( tot = sum(.data$value), prop = round(.data$value / .data$tot, 3), @@ -413,7 +424,7 @@ f1_plot <- ggplot() + fill = rep(fqc_colours1$col, length(unique(f1$type))), alpha = 0.7 ) + - geom_line(data = f1, aes(x = q, y = prop, colour = umccrid, linetype = mate), linewidth = 1, show.legend = FALSE) + + geom_line(data = f1, aes(x = q, y = prop, colour = umccrId, linetype = mate), linewidth = 1, show.legend = FALSE) + scale_y_continuous(labels = scales::label_comma()) + theme(panel.grid.major = element_blank()) + facet_wrap(~type, ncol = 1) + @@ -422,15 +433,17 @@ f1_plot <- ggplot() + subtitle = glue("Percentage of reads with average quality scores. Shows if\na subset of reads has poor quality.") ) -plotly::ggplotly(f1_plot) -# f1_plot +# plotly::ggplotly(f1_plot) +f1_plot ``` ### GC Content ('Per-Sequence GC Content') -```{r read_gc, fig.height=10} -gc_data <- dr_unnest("FastqcMetricsFile_read_gc_content") |> - group_by(umccrid, mate) |> +```{r} +#| label: fqc_readGCContent +#| fig-height: 10 +gc_data <- dr_unnest("fqc_readGCContent") |> + group_by(umccrId, mate) |> mutate( tot = sum(.data$value), prop = round(.data$value / .data$tot, 3), @@ -439,7 +452,7 @@ gc_data <- dr_unnest("FastqcMetricsFile_read_gc_content") |> ungroup() gc_data_plot <- gc_data |> - ggplot(aes(x = pct, y = prop, colour = umccrid)) + + ggplot(aes(x = pct, y = prop, colour = umccrId)) + geom_line(aes(linetype = mate), alpha = 0.4, linewidth = 1) + facet_wrap(~type, ncol = 1) + labs( @@ -448,14 +461,15 @@ gc_data_plot <- gc_data |> title = "Read GC Content", subtitle = glue("Total number of reads with each GC content\npercentile between 0% and 100%") ) -plotly::ggplotly(gc_data_plot) -# 
gc_data_plot +# plotly::ggplotly(gc_data_plot) +gc_data_plot ``` ### GC Content Quality ('GC Content Mean Quality Scores') -```{r read_gc_qual} -f1 <- dr_unnest("FastqcMetricsFile_read_gc_content_quality") |> +```{r} +#| label: fqc_readGCContentQuality +f1 <- dr_unnest("fqc_readGCContentQuality") |> filter(!is.na(.data$value)) fqc_colours2 <- tibble::tibble( start = c(0, 20, 28), @@ -469,28 +483,31 @@ f1_plot <- ggplot() + fill = rep(fqc_colours2$col, length(unique(f1$type))), alpha = 0.7 ) + - geom_line(data = f1, aes(x = pct, y = value, colour = umccrid, linetype = mate), linewidth = 1) + + geom_line(data = f1, aes(x = pct, y = value, colour = umccrId, linetype = mate), linewidth = 1) + facet_wrap(~type, ncol = 1) + labs( title = "GC Content Quality", subtitle = glue("Average Phred-scale read mean quality for reads with\neach GC content percentile between 0% and 100%.") ) -plotly::ggplotly(f1_plot) -# f1_plot +# plotly::ggplotly(f1_plot) +f1_plot ``` ### Positional Base Content ('Per-Position Sequence Content') - TODO: create heatmap instead -```{r fqc_pbc, eval=F, fig.height=42} -f1 <- dr_unnest("FastqcMetricsFile_positional_base_content") +```{r} +#| label: fqc_pbc +#| fig-height: 42 +#| eval: false +f1 <- dr_unnest("fqc_positionalBaseContent") f1 |> filter(base != "N") |> mutate(prop = prop * 100) |> ggplot(aes(x = pos, y = prop, colour = base, group = base)) + geom_line() + - facet_grid(forcats::fct_rev(umccrid) ~ mate) + + facet_grid(forcats::fct_rev(umccrId) ~ mate) + labs( x = "Position in Read (bp)", y = "Proportion of Bases", @@ -505,17 +522,20 @@ f1 |> ### Positional Base Mean Quality ('Per-Position Mean Quality Scores') -```{r fqc_bmq, eval=F, fig.height=80} -f1 <- dr_unnest("FastqcMetricsFile_positional_base_mean_quality") +```{r} +#| label: fqc_bmq +#| fig-height: 80 +#| eval: false +f1 <- dr_unnest("fqc_positionalBaseMeanQuality") ggplot() + geom_rect( data = fqc_colours2, mapping = aes(ymin = start, ymax = end, xmin = -Inf, xmax = Inf), - fill = 
rep(fqc_colours2$col, length(unique(f1$umccrid)) * length(unique(f1$mate))), + fill = rep(fqc_colours2$col, length(unique(f1$umccrId)) * length(unique(f1$mate))), alpha = 0.7 ) + geom_line(data = f1, aes(x = pos, y = value, colour = base)) + - facet_grid(forcats::fct_rev(umccrid) ~ mate) + + facet_grid(forcats::fct_rev(umccrId) ~ mate) + labs( x = "Position in Read (bp)", y = "Quality Score", @@ -529,16 +549,20 @@ ggplot() + ### Positional Quality ('Per-Position Quality Score Ranges') -```{r fqc_pq, eval=F, fig.width=13} +```{r} +#| label: fqc_pq +#| fig-width: 13 +#| eval: false + # TODO: use boxplot instead of point -f1 <- dr_unnest("FastqcMetricsFile_positional_quality") +f1 <- dr_unnest("fqc_positionalQuality") quants <- c(25, 50, 75) f1 |> mutate(pos = as.integer(.data$pos)) |> filter(pct %in% quants) |> ggplot(aes(x = pos, y = value, colour = pct)) + geom_point() + - facet_wrap(~ forcats::fct_rev(umccrid)) + + facet_wrap(~ forcats::fct_rev(umccrId)) + labs( title = "Positional Quality", subtitle = glue("Phred-scale quality value for bases at a given location and a\ngiven quantile of the distribution ({paste(quants, collapse = ', ')})") @@ -547,17 +571,20 @@ f1 |> ### Read Lengths ('Sequence Length Distribution') -```{r read_len, fig.height=8} -read_len <- dr_unnest("FastqcMetricsFile_read_lengths") +```{r} +#| label: fqc_readLengths +#| fig-height: 8 +read_len <- dr_unnest("fqc_readLengths") read_len_plot <- read_len |> - group_by(umccrid, mate) |> + group_by(umccrId, mate) |> mutate( tot = sum(.data$value), prop = round(.data$value / .data$tot, 3), prop = 100 * prop ) |> ungroup() |> - ggplot(aes(x = bp, y = prop, colour = umccrid)) + + select(umccrId, type, mate, bp, value, tot, prop) |> + ggplot(aes(x = bp, y = prop, colour = umccrId)) + geom_line(aes(linetype = mate), linewidth = 1) + theme( panel.grid.major = element_blank() @@ -567,19 +594,20 @@ read_len_plot <- read_len |> title = "Read Lengths", subtitle = glue("Read percentage with each observed 
length.") ) -plotly::ggplotly(read_len_plot) -# read_len_plot +# plotly::ggplotly(read_len_plot) +read_len_plot ``` ### Sequence Positions ('Adapter Content') - -```{r seq_pos, eval=T, fig.height=42} -f1 <- dr_unnest("FastqcMetricsFile_sequence_positions") +```{r} +#| label: fqc_sequencePositions +#| fig-height: 42 +f1 <- dr_unnest("fqc_sequencePositions") f1 |> ggplot(aes(x = bp, y = value, colour = seq)) + geom_line() + - facet_grid(forcats::fct_rev(umccrid) ~ mate, scales = "free_y") + + facet_grid(forcats::fct_rev(umccrId) ~ mate, scales = "free_y") + labs(title = glue( "Number of times an adapter or other kmer sequence is found,\n", "starting at a given position in the input reads." @@ -590,16 +618,21 @@ f1 |> ## Coverage {.tabset .tabset-pills} -```{r contig_cvg, eval=F, results='asis', fig.height=5} -d1 <- dr_unnest("WgsContigMeanCovFile") |> - arrange(desc("umccrid")) +```{r} +#| label: contig_cvg +#| fig-height: 5 +#| eval: false +#| results: asis +# TODO: FIXME +d1 <- dr_unnest("contigmeancov_wgs") |> + arrange(desc("umccrId")) for (type1 in sort(unique(d1$type), decreasing = FALSE)) { cat(glue("\n\n### {type1} {{.tabset .tabset-pills}}"), "\n\n") d1_type <- d1 |> filter(type == type1) - for (s in sort(unique(d1_type$umccrid), decreasing = TRUE)) { + for (s in sort(unique(d1_type$umccrId), decreasing = TRUE)) { p1 <- d1_type |> - filter(umccrid == s) |> + filter(umccrId == s) |> dracarys::WgsContigMeanCovFile$public_methods$plot() + ggplot2::labs(subtitle = s) cat(glue("\n#### {s}"), "\n") @@ -618,13 +651,16 @@ for (type1 in sort(unique(d1$type), decreasing = FALSE)) { - Only for WGS. 
-```{r fraglenhist_plot, eval=eval$FragmentLengthHistFile, fig.height=8} -fl1 <- dr_unnest("FragmentLengthHistFile") +```{r} +#| label: fraglenhist_plot +#| fig-height: 8 + +fl1 <- dr_unnest("fraglen") min_count <- 10 flp <- fl1 |> filter(.data$count >= min_count) |> ggplot(aes(x = .data$fragmentLength, y = .data$count)) + - geom_line(aes(colour = umccrid)) + + geom_line(aes(colour = umccrId)) + labs(title = "Fragment Length Distribution") + xlab("Fragment Length (bp)") + ylab(glue("Read Count (min: {min_count})")) + @@ -643,21 +679,25 @@ plotly::ggplotly(flp) - Only for WGS. ```{r pe, eval=T, fig.height=5} +#| label: pe +#| eval: FALSE +#| fig-height: 5 + chrom_levels <- c(1:22, "x", "y") d_pl_plot_data <- d_pl |> select( - umccrid, phenotype, type, + umccrId, phenotype, type, contains("div_auto_median") ) |> - tidyr::pivot_longer(-c("umccrid", "phenotype", "type")) |> + tidyr::pivot_longer(-c("umccrId", "phenotype", "type")) |> tidyr::separate_wider_delim("name", delim = "_", names = c("cov", "chrom", "rest"), too_many = "merge") |> mutate(chrom = factor(chrom, levels = chrom_levels)) |> - select(umccrid, phenotype, type, chrom, value) + select(umccrId, phenotype, type, chrom, value) d_pl_plot <- d_pl_plot_data |> ggplot(aes(x = chrom, y = value)) + - geom_line(aes(colour = umccrid, group = umccrid), na.rm = TRUE) + - geom_point(aes(colour = umccrid), na.rm = TRUE) + + geom_line(aes(colour = umccrId, group = umccrId), na.rm = TRUE) + + geom_point(aes(colour = umccrId), na.rm = TRUE) + labs(title = "Chromosome Median / Autosomal Median") plotly::ggplotly(d_pl_plot) # d_pl_plot @@ -668,10 +708,14 @@ plotly::ggplotly(d_pl_plot) ## Hist -```{r cvgm, eval=T, fig.height=8, fig.width=12} -d_hist <- dr_unnest("WgsHistFile") +```{r} +#| label: cvgm +#| eval: false +#| fig-height: 8 +#| fig-width: 12 +d_hist <- dr_unnest("hist_wgs") d_hist1 <- d_hist |> - ggplot(aes(x = start, y = pct, colour = umccrid)) + + ggplot(aes(x = start, y = pct, colour = umccrId)) + geom_point() 
+ geom_linerange(aes(xmin = start, xmax = end)) + scale_y_continuous(n.breaks = 10) + @@ -683,7 +727,7 @@ d_hist1 <- d_hist |> subtitle = "e.g. X PCT of bases have coverage between 100 and 500." ) d_hist2 <- d_hist |> - ggplot(aes(x = start, y = cumsum, colour = umccrid)) + + ggplot(aes(x = start, y = cumsum, colour = umccrId)) + geom_point() + geom_line() + scale_x_continuous(n.breaks = 10) + @@ -697,63 +741,15 @@ d_hist2 <- d_hist |> ## FineHist -```{r finehist, eval=F, fig.height=10, fig.width=12} -d_fhist <- dr_unnest("WgsFineHistFile") +```{r} +#| label: finehist +#| eval: false +#| fig-height: 10 +#| fig-width: 12 +d_fhist <- dr_unnest("finehist_wgs") d_fhist |> dracarys::WgsFineHistFile$public_methods$plot(c(0, 150)) + - facet_wrap(~ forcats::fct_rev(umccrid), scales = "free_y") + facet_wrap(~ forcats::fct_rev(umccrId), scales = "free_y") ``` --- - -## Addendum {.tabset .tabset-pills} - -
            - -Details - -### Params - -```{r params_info} -params |> - purrr::modify_if(is.null, \(x) "NULL", .else = as.character) |> - tibble::enframe(name = "Parameter", value = "Value") |> - tidyr::unnest("Value", keep_empty = TRUE) |> - knitr::kable() -``` - -### SessionInfo {.tabset .tabset-pills} - -```{r si_prep} -si <- dracarys:::session_info_tbls() -si_pkg <- si$si_pkg -si_pl <- si$si_pl -``` - -#### Platform - -```{r si_pl} -si_pl |> - knitr::kable() -``` - -#### Packages - -```{r si_pkg} -si_pkg |> - knitr::kable() -``` - -#### SysInfo - -```{r reporter_details, comment = NA} -tibble::tribble( - ~Info, ~Value, - "Node", Sys.info()["nodename"], - "OS", Sys.info()["sysname"], - "User", Sys.info()["user"], -) |> - knitr::kable() -``` - -
            diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R index 559290d..c1977d4 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R @@ -39,7 +39,7 @@ wf2 <- wf1 |> tidyr::unnest(pld_tidy) query_limsrow_libids <- function(libids) { - assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) + stopifnot(!is.null(libids), all(grepl("^L", libids))) libids <- unique(libids) |> paste(collapse = "|") q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") @@ -107,7 +107,7 @@ data_tidy <- wf_lims |> mutate( indir = .data$output_dragenAlignmentOutputUri, outdir = file.path(sub("s3://", "", .data$indir)), - outdir = file.path(normalizePath("~/s3"), .data$outdir) + outdir = fs::as_fs_path(file.path(normalizePath("~/s3"), .data$outdir)) # indir = file.path(outdir, "dracarys_s3_sync"), # for when debugging locally ) |> mutate( From f8be64757fae0a43cc0a60b330cd5e3d5cdc18d1 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 3 Dec 2024 00:11:01 +1100 Subject: [PATCH 31/32] remove trailing comma --- inst/reports/wgts-qc/summary.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 3601013..88ceafd 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -320,7 +320,7 @@ d_cvg <- dr_unnest("covmetrics_wgs") |> reads_aligned_tot, bases_aligned_tot, cvg_gt02 = cov_uniformity_pct_gt02mean_genome, - cvg_gt04 = cov_uniformity_pct_gt04mean_genome, , + cvg_gt04 = cov_uniformity_pct_gt04mean_genome, everything(), -contains("cov_pct_"), -c("libraryId", "tidy_prefix") From 8694edec86db604a62e7edb10612d5fa3afd4b12 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 3 Dec 2024 00:16:04 +1100 Subject: [PATCH 32/32] use purrr instead of stats --- DESCRIPTION | 1 - R/dragen.R | 4 ++-- 2 files changed, 2 insertions(+), 3 
deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fe296e3..fb57955 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -58,4 +58,3 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.1 -VignetteBuilder: knitr diff --git a/R/dragen.R b/R/dragen.R index 9e80b62..4cba49b 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -791,13 +791,13 @@ dragen_ploidy_estimation_metrics_read <- function(x) { raw <- readr::read_lines(x) assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) fun1 <- function(x) { - setNames( + purrr::set_names( as.character(glue("cov_{tolower(x)}_div_auto_median")), as.character(glue("{x} median / Autosomal median")) ) } fun2 <- function(x) { - setNames( + purrr::set_names( as.character(glue("cov_{tolower(x)}_median")), as.character(glue("{x} median coverage")) )