From 6a281fd55882d7ad852f2198233d126141cf1a2f Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Wed, 4 Sep 2024 10:21:05 +1000
Subject: [PATCH 01/24] umccrise accreditation 131 tidy meta

---
 R/regex.R                   |  10 +-
 inst/scripts/umccrise_run.R | 122 ++++++++++++++++--------------------
 2 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/R/regex.R b/R/regex.R
index 25a2f61..4ca485b 100644
--- a/R/regex.R
+++ b/R/regex.R
@@ -61,11 +61,11 @@ DR_FILE_REGEX <- tibble::tribble(
   "multiqc_data\\.json", "MultiqcFile",
   "somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile",
   "somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile",
-  "chord\\.tsv\\.gz$", "UmChordTsvFile",
-  "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
-  "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
-  "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
-  "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
+  # "chord\\.tsv\\.gz$", "UmChordTsvFile",
+  # "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
+  # "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
+  # "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
+  # "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
   "bcftools_stats\\.txt$", "BcftoolsStatsFile"
 )
 
diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R
index 6602d87..ea4d2f4 100644
--- a/inst/scripts/umccrise_run.R
+++ b/inst/scripts/umccrise_run.R
@@ -1,74 +1,62 @@
 require(dracarys)
-require(here)
-require(glue)
+require(rportal, include.only = "portaldb_query_workflow")
+require(here, include.only = "here")
+require(glue, include.only = "glue")
+require(readr, include.only = "read_rds")
 require(dplyr)
-require(readr)
-
-#---- GDS ----#
-# read last 1000 umccrise runs from portal
-# 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded
-date1 <- "2023-09-04"
-pmeta_raw_rds <- here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_raw.rds"))
-# pmeta_raw <- dracarys::portal_meta_read(rows = 1000, params = "&type_name=umccrise")
-# saveRDS(pmeta_raw, file = pmeta_raw_rds)
-pmeta <- readr::read_rds(pmeta_raw_rds) |>
-  dracarys::meta_umccrise(status = "Succeeded")
-lims_raw_rds <- here(glue("nogit/umccrise/rds/lims/{date1}_lims_raw.rds"))
-# lims_raw <- dracarys::glims_read()
+require(tidyr, include.only = "separate_wider_delim")
+
+start_date <- "2024-08-29"
+query_workflow_umccrise <- function(start_date) {
+  q1 <- glue(
+    "WHERE \"type_name\" = 'umccrise' AND \"start\" > date(\'{start_date}\') ",
+    "ORDER BY \"start\" DESC;"
+  )
+  rportal::portaldb_query_workflow(q1)
+}
+
+query_limsrow_libids <- function(libids) {
+  assertthat::assert_that(!is.null(libids), all(grepl("^L", libids)))
+  libids <- unique(libids) |>
+    paste(collapse = "|")
+  q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');")
+  rportal::portaldb_query_limsrow(q1)
+}
+
+# p_raw <- query_workflow_umccrise(start_date)
+p_raw_rds <- here(glue("nogit/data_portal/workflows/{start_date}.rds"))
+# saveRDS(p_raw, file = p_raw_rds)
+p_raw <- readr::read_rds(p_raw_rds)
+
+p <- p_raw |>
+  rportal::meta_umccrise(status = "Succeeded")
+# lims_raw <- query_limsrow_libids(p$LibraryID_tumor)
+lims_raw_rds <- here(glue("nogit/data_portal/lims/{start_date}.rds"))
 # saveRDS(lims_raw, file = lims_raw_rds)
+# L2100192 is L2100192_rerun in the lims, 15 libs are rerun/topup/topup2
 lims_raw <- readr::read_rds(lims_raw_rds)
 lims <- lims_raw |>
-  filter(Type == "WGS") |>
-  filter(LibraryID %in% c(pmeta$LibraryID_normal, pmeta$LibraryID_tumor))
-table(pmeta$LibraryID_tumor %in% lims$LibraryID)
-table(pmeta$LibraryID_normal %in% lims$LibraryID)
-
-# The final results sit under gds_outdir_umccrise/<SubjectID>__<SampleID_tumor>/
-# We need to 
get the SampleID_tumor for runs before 2023-04-07. We can do that -# by using the LibraryID_tumor to match up with the glims. -missing_tumor_sampleid <- pmeta |> - filter(end < "2023-04-07") |> - pull(LibraryID_tumor) - -table(missing_tumor_sampleid %in% lims$LibraryID) -libid2sampid <- lims |> - filter(LibraryID %in% missing_tumor_sampleid) |> - select(LibraryID_tumor = LibraryID, SampleID_tumor = SampleID) - -d <- pmeta |> - left_join(libid2sampid, by = "LibraryID_tumor") |> - mutate(SampleID_tumor = if_else(is.na(SampleID_tumor.x), SampleID_tumor.y, SampleID_tumor.x)) |> - select(-c(SampleID_tumor.x, SampleID_tumor.y)) |> - relocate(SampleID_tumor, .before = SampleID_normal) |> - mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) -d - -# final portal meta for umccrise runs -# columns: -# "id", "wfr_name", "wfr_id", "version", "end_status", "start", "end", "portal_run_id", -# "SubjectID", "LibraryID_tumor", "LibraryID_normal", "SampleID_tumor", "SampleID_normal", -# "gds_outdir_umccrise", "gds_indir_dragen_somatic", "gds_indir_dragen_germline", "gds_infile_genomes_tar" -saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) - -#---- S3 ----# -pat <- "qc_summary.tsv.gz" -rows <- 1000 -d_s3_raw <- dracarys::s3_search(pat = pat, rows = rows) - -d_s3 <- d_s3_raw |> - arrange(desc(date_aest)) |> - mutate( - bname = basename(path), - dir1 = dirname(path), # path/to/dirA/cancer_report_tables - dir2 = basename(dirname(dir1)), # dirA - sbj_samp_lib = sub(".*__(.*)", "\\1", dir2), - SubjectID = sub("(SBJ[0-9]{5})_.*", "\\1", sbj_samp_lib), - SampleID_tumor = sub("SBJ.*?_(.*?)_.*", "\\1", sbj_samp_lib), - LibraryID_tumor = sub("SBJ.*?_.*?_(.*)", "\\1", sbj_samp_lib), - rerun = grepl("rerun", .data$LibraryID_tumor) + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" ) |> - select(dir1, SubjectID, LibraryID_tumor, SampleID_tumor, date = date_aest, rerun) + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, external_sample_id, + project_name, project_owner, + source, quality + ) |> + distinct() +table(lims$library_id %in% p$LibraryID_tumor) # double-check + +d <- p |> + left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> + mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) |> + select( + wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, + SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id, + project_owner, project_name, source, quality + ) +d -date2 <- "2023-09-12" -saveRDS(d_s3, file = here(glue("nogit/umccrise/rds/portal_meta/{date2}_pmeta_s3.rds"))) -# now we have S3 paths and metadata, so all we need is to generate presigned URLs and read the data +saveRDS(d, file = here(glue("nogit/data_portal/workflows/umccrise_tidy_{start_date}.rds"))) From 318c6fcf722e88b0335b78034fd087979ba46347 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 5 Sep 2024 01:09:32 +1000 Subject: [PATCH 02/24] add write_dracarys_list_of_tbls --- NAMESPACE | 2 ++ R/utils.R | 28 +++++++++++++++++++++++++++ man/write_dracarys_list_of_tbls.Rd | 31 ++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 man/write_dracarys_list_of_tbls.Rd diff --git a/NAMESPACE b/NAMESPACE index 4eb4d8d..9d913ec 100644 --- a/NAMESPACE +++ 
b/NAMESPACE @@ -32,6 +32,7 @@ export(UmccriseCanRepTables) export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) +export(Wf_umccrise) export(WgsContigMeanCovFile) export(WgsCoverageMetricsFile) export(WgsFineHistFile) @@ -71,6 +72,7 @@ export(s3_search) export(session_info_kable) export(time_metrics_process) export(umccr_tidy) +export(write_dracarys_list_of_tbls) importFrom(R6,R6Class) importFrom(ggplot2,ggplot) importFrom(ggrepel,geom_text_repel) diff --git a/R/utils.R b/R/utils.R index f61fec9..45cbb54 100644 --- a/R/utils.R +++ b/R/utils.R @@ -117,6 +117,34 @@ write_dracarys <- function(obj, prefix, out_format, drid = NULL) { return(invisible(obj)) } +#' Write List of Tidy Tibbles +#' +#' @param list_of_tbls List of tidy tibbles. +#' @param out_dir Output directory. +#' @param prefix Prefix of output file(s). +#' @param out_format Format of output file(s). +#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). +#' +#' @return Tibble with nested objects that have been written to the output directory. +#' @export +write_dracarys_list_of_tbls <- function(list_of_tbls, out_dir = NULL, prefix = NULL, out_format = "tsv", drid = NULL) { + assertthat::assert_that(!is.null(prefix)) + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + d_write <- list_of_tbls |> + tibble::enframe(name = "section") |> + dplyr::rowwise() |> + dplyr::mutate( + section_low = tolower(.data$section), + p = glue("{prefix}_{.data$section_low}"), + out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) + ) |> + dplyr::ungroup() |> + dplyr::select("section", "value") |> + tibble::deframe() + invisible(d_write) +} #' Create Empty Tibble Given Column Names #' diff --git a/man/write_dracarys_list_of_tbls.Rd b/man/write_dracarys_list_of_tbls.Rd new file mode 100644 index 0000000..e6aa8be --- /dev/null +++ b/man/write_dracarys_list_of_tbls.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{write_dracarys_list_of_tbls} +\alias{write_dracarys_list_of_tbls} +\title{Write List of Tidy Tibbles} +\usage{ +write_dracarys_list_of_tbls( + list_of_tbls, + out_dir = NULL, + prefix = NULL, + out_format = "tsv", + drid = NULL +) +} +\arguments{ +\item{list_of_tbls}{List of tidy tibbles.} + +\item{out_dir}{Output directory.} + +\item{prefix}{Prefix of output file(s).} + +\item{out_format}{Format of output file(s).} + +\item{drid}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} +} +\value{ +Tibble with nested objects that have been written to the output directory. 
+} +\description{ +Write List of Tidy Tibbles +} From 94ad5ab11209552ca6385cdb8325cc2998684ca2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 5 Sep 2024 01:33:24 +1000 Subject: [PATCH 03/24] add Wf_umccrise R6 class --- R/umccrise.R | 142 +++++++++++++++++++++++++++++------- man/BclconvertReports375.Rd | 4 +- man/UmccriseCanRepTables.Rd | 32 -------- man/Wf_umccrise.Rd | 117 +++++++++++++++++++++++++++++ 4 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 man/Wf_umccrise.Rd diff --git a/R/umccrise.R b/R/umccrise.R index e358d2d..a66ccad 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -1,3 +1,119 @@ +#' Wf_umccrise R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `umccrise` workflow +#' +#' @examples +#' \dontrun{ +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() +#' SubjectID <- "SBJ01155" +#' SampleID_tumor <- "PRJ211091" +#' gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' gdsdir <- file.path(gdsdir1, "L2101566__L2101565") +#' obj <- Wf_umccrise$new(gdsdir) +#' gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor) +#' outdir <- file.path(sub("gds://", "", gdsdir)) +#' outdir <- file.path(normalizePath("~/icav1/g"), outdir) +#' out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) +#' +#' +#' +#' +#' +#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' p2 <- "L2101566__L2101565" +#' p <- file.path(p1, p2) +#' obj <- Wf_umccrise$new(p) +#' obj$path +#' obj$contents +#' d <- obj$read() +#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +#' } +#' +#' @export +Wf_umccrise <- R6::R6Class( + "Wf_umccrise", + public = list( + #' @field path Path to the `umccrise` directory. + #' @field contents Tibble with file path, basename, and size. + path = NULL, + contents = NULL, + #' @description Create a new Wf_umccrise object. + #' @param path Path to the `umccrise` directory. + initialize = function(path = NULL) { + stopifnot(is.character(path), length(path) == 1) + # self$path <- normalizePath(path) + self$path <- path + # self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> + # dplyr::mutate( + # bname = basename(.data$path), + # size = as.character(trimws(.data$size)) + # ) |> + # dplyr::select("path", "bname", "size") + }, + #' @description List Relevant Files In umccrise GDS Directory + #' @param gdsdir Path to the `umccrise` directory. + #' @param SubjectID The SubjectID of the sample (used to construct path). + #' @param SampleID_tumor The SampleID of the tumor sample (used to construct path). + #' @param token ICA access token. 
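    # Hedged editorial sketch (not part of this patch; file name invented):
    # a `regexes` tibble maps file-name patterns to parser types via
    # match_regex(), which returns NA when nothing matches:
    #   reg <- tibble::tribble(
    #     ~regex, ~fun,
    #     "-chord\\.tsv\\.gz$", "UmChordTsvFile"
    #   )
    #   match_regex("PRJ211091-chord.tsv.gz", reg) # "UmChordTsvFile"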
+ gds_list = function(gdsdir, SubjectID, SampleID_tumor, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + reg1 <- tibble::tribble( + ~regex, ~fun, + # "-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile", + "-chord\\.tsv\\.gz$", "UmChordTsvFile", + "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", + "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", + "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", + "-indel\\.tsv\\.gz$", "UmSigsIndelFile", + "-qc_summary\\.tsv\\.gz$", "UmQcSumFile" + ) + reg2 <- tibble::tribble( + ~regex, ~fun, + "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" + ) + + dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) + dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) + dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json + f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) + f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) + gds_files <- dplyr::bind_rows(f1, f2) + return(gds_files) + }, + + #' @description GDS File Download via API + #' + #' @param gds_files Tibble with bname and file_id for umccrise files. + #' @param outdir Directory to output files (loosely, not in a structured manner). + #' @param token ICA access token. + gds_download = function(gds_files, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + assertthat::assert_that(all(c("bname", "file_id") %in% colnames(gds_files))) + gds_files |> + dplyr::rowwise() |> + dplyr::mutate( + out = file.path(outdir, .data$bname), + out_dl = gds_file_download_api(.data$file_id, .data$out, token) + ) + }, + #' @description + tidy = function(indir, out_files) { + obj_canrep <- UmccriseCanRepTables$new(indir) + canrep_parse <- obj_canrep$read() + pcgr_json <- out_files |> + dplyr::filter(.data$type == "PcgrJsonFile") |> + dplyr::pull("out") + pcgr_json_parse <- PcgrJsonFile$new(pcgr_json)$read() + d <- canrep_parse + d[["pcgr_json"]] <- pcgr_json_parse[["metrics"]] + d + } + ) +) + + + + #' UmccriseCanRepTables R6 Class #' #' @description @@ -163,32 +279,6 @@ UmccriseCanRepTables <- R6::R6Class( sigsindel = self$grep_file("-indel\\.tsv\\.gz$") |> self$read_sigs(), qcsum = self$grep_file("-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() ) - }, - - #' @description - #' Writes tidied contents of `cancer_report_tables` directory output by umccrise. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). 
- write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - d_write <- d |> - tibble::enframe(name = "section") |> - dplyr::rowwise() |> - dplyr::mutate( - section_low = tolower(.data$section), - p = glue("{prefix}_{.data$section_low}"), - out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) - ) |> - dplyr::ungroup() |> - dplyr::select("section", "value") |> - tibble::deframe() - invisible(d_write) } ) ) diff --git a/man/BclconvertReports375.Rd b/man/BclconvertReports375.Rd index 01aba8b..e644a84 100644 --- a/man/BclconvertReports375.Rd +++ b/man/BclconvertReports375.Rd @@ -10,8 +10,8 @@ https://support-docs.illumina.com/SW/dragen_v42/Content/SW/DRAGEN/OutputFiles.ht } \examples{ \dontrun{ -p1 <- "240816_A01052_0220_AHM7VHDSXC/202408195d4f2fc4/Reports" -b <- here::here("nogit/bcl_convert", p1) |> +p1 <- "nogit/bcl_convert/WGS_TsqNano/Reports" +b <- here::here(p1) |> BclconvertReports375$new() b$path b$contents diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd index 67e20f3..9c29fd2 100644 --- a/man/UmccriseCanRepTables.Rd +++ b/man/UmccriseCanRepTables.Rd @@ -40,7 +40,6 @@ obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") \item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} \item \href{#method-UmccriseCanRepTables-read_qcsummarytsv}{\code{UmccriseCanRepTables$read_qcsummarytsv()}} \item \href{#method-UmccriseCanRepTables-read}{\code{UmccriseCanRepTables$read()}} -\item \href{#method-UmccriseCanRepTables-write}{\code{UmccriseCanRepTables$write()}} \item \href{#method-UmccriseCanRepTables-clone}{\code{UmccriseCanRepTables$clone()}} } } @@ -181,37 +180,6 @@ A list of tibbles. } } \if{html}{\out{
<hr>}}
-\if{html}{\out{<a id="method-UmccriseCanRepTables-write"></a>}}
-\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-write}{}}}
-\subsection{Method \code{write()}}{
-Writes tidied contents of \code{cancer_report_tables} directory output by umccrise.
-\subsection{Usage}{
-\if{html}{\out{
<div class="r">}}\preformatted{UmccriseCanRepTables$write(
-  d,
-  out_dir = NULL,
-  prefix,
-  out_format = "tsv",
-  drid = NULL
-)}\if{html}{\out{
</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{
<div class="arguments">}}
-\describe{
-\item{\code{d}}{Parsed object from \code{self$read()}.}
-
-\item{\code{out_dir}}{Output directory.}
-
-\item{\code{prefix}}{Prefix of output file(s).}
-
-\item{\code{out_format}}{Format of output file(s).}
-
-\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).}
-}
-\if{html}{\out{
</div>}}
-}
-}
-\if{html}{\out{
<hr>}}
 \if{html}{\out{<a id="method-UmccriseCanRepTables-clone"></a>}}
 \if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-clone}{}}}
 \subsection{Method \code{clone()}}{
diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd
new file mode 100644
index 0000000..600cec5
--- /dev/null
+++ b/man/Wf_umccrise.Rd
@@ -0,0 +1,117 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/umccrise.R
+\name{Wf_umccrise}
+\alias{Wf_umccrise}
+\title{Wf_umccrise R6 Class}
+\description{
+Reads and writes tidy versions of files from the \code{umccrise} workflow
+}
+\examples{
+\dontrun{
+token <- Sys.getenv("ICA_ACCESS_TOKEN")
+SubjectID <- "SBJ01155"
+SampleID_tumor <- "PRJ211091"
+gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043"
+gdsdir <- file.path(gdsdir1, "L2101566__L2101565")
+obj <- Wf_umccrise$new(gdsdir)
+gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor)
+outdir <- file.path(sub("gds://", "", gdsdir))
+outdir <- file.path(normalizePath("~/icav1/g"), outdir)
+out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token)
+
+
+
+
+
+p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
+p2 <- "L2101566__L2101565"
+p <- file.path(p1, p2)
+obj <- Wf_umccrise$new(p)
+obj$path
+obj$contents
+d <- obj$read()
+obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv")
+}
+
+}
+\section{Public fields}{
+\if{html}{\out{
<div class="r6-fields">}}
+\describe{
+\item{\code{path}}{Path to the \code{umccrise} directory.}
+
+\item{\code{contents}}{Tibble with file path, basename, and size.}
+}
+\if{html}{\out{
</div>}}
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}}
+\item \href{#method-Wf_umccrise-gds_list}{\code{Wf_umccrise$gds_list()}}
+\item \href{#method-Wf_umccrise-gds_download}{\code{Wf_umccrise$gds_download()}}
+\item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}}
+\item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}}
+}
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-new"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-new}{}}}
+\subsection{Method \code{new()}}{
+Create a new Wf_umccrise object.
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$new(path = NULL)}\if{html}{\out{
</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{path}}{Path to the \code{umccrise} directory.}
+}
+\if{html}{\out{
</div>}}
+}
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-gds_list"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_list}{}}}
+\subsection{Method \code{gds_list()}}{
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list(gdsdir, token, SubjectID, SampleID_tumor)}\if{html}{\out{
</div>}}
+}
+
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-gds_download"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_download}{}}}
+\subsection{Method \code{gds_download()}}{
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download(gds_files, outdir, token)}\if{html}{\out{
</div>}}
+}
+
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-tidy"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-tidy}{}}}
+\subsection{Method \code{tidy()}}{
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$tidy(indir, out_files)}\if{html}{\out{
</div>}}
+}
+
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$clone(deep = FALSE)}\if{html}{\out{
</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{
}} +} +} +} From ac6eb68733bddae7ded204eb687d175469fc76a4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 5 Sep 2024 14:46:40 +1000 Subject: [PATCH 04/24] parse conpair results; add grep_file --- NAMESPACE | 1 + R/umccrise.R | 117 ++++++++++++++++++++---------------- R/utils.R | 18 ++++++ man/UmccriseCanRepTables.Rd | 20 +----- man/Wf_umccrise.Rd | 88 +++++++++++++++++++++------ man/grep_file.Rd | 19 ++++++ 6 files changed, 174 insertions(+), 89 deletions(-) create mode 100644 man/grep_file.Rd diff --git a/NAMESPACE b/NAMESPACE index 9d913ec..80cda13 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ export(gds_files_list) export(gds_files_list_fastq) export(gds_files_list_filter_relevant) export(gds_volumes_list) +export(grep_file) export(ica_token_validate) export(match_regex) export(multiqc_column_map_append) diff --git a/R/umccrise.R b/R/umccrise.R index a66ccad..8903583 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -11,23 +11,13 @@ #' gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" #' gdsdir <- file.path(gdsdir1, "L2101566__L2101565") #' obj <- Wf_umccrise$new(gdsdir) -#' gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor) +#' gds_files <- obj$gds_list( +#' gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor +#' ) #' outdir <- file.path(sub("gds://", "", gdsdir)) #' outdir <- file.path(normalizePath("~/icav1/g"), outdir) #' out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) -#' -#' -#' -#' -#' -#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565" -#' p <- file.path(p1, p2) -#' obj <- Wf_umccrise$new(p) -#' obj$path -#' obj$contents -#' d <- obj$read() -#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +#' tidy1 <- obj$tidy(indir = outdir, out_files = out_files) #' } #' #' @export @@ -42,14 +32,7 @@ Wf_umccrise <- R6::R6Class( #' @param path Path to the `umccrise` directory. initialize = function(path = NULL) { stopifnot(is.character(path), length(path) == 1) - # self$path <- normalizePath(path) self$path <- path - # self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> - # dplyr::mutate( - # bname = basename(.data$path), - # size = as.character(trimws(.data$size)) - # ) |> - # dplyr::select("path", "bname", "size") }, #' @description List Relevant Files In umccrise GDS Directory #' @param gdsdir Path to the `umccrise` directory. @@ -66,13 +49,13 @@ Wf_umccrise <- R6::R6Class( "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", "-indel\\.tsv\\.gz$", "UmSigsIndelFile", - "-qc_summary\\.tsv\\.gz$", "UmQcSumFile" + "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", + "multiqc_conpair.txt", "UmConpairMultiqc" ) reg2 <- tibble::tribble( ~regex, ~fun, "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" ) - dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json @@ -96,24 +79,66 @@ Wf_umccrise <- R6::R6Class( out_dl = gds_file_download_api(.data$file_id, .data$out, token) ) }, - #' @description + + #' @description Tidy up the output files from umccrise + #' + #' @param indir Path to the `umccrise` directory. + #' @param out_files Tibble with file path, basename, and size. 
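  # Hedged editorial sketch (values invented): `out_files` is the tibble
  # returned by gds_download() above, roughly
  #   tibble::tribble(
  #     ~type,          ~bname,                   ~out,
  #     "PcgrJsonFile", "x-somatic.pcgr.json.gz", "/tmp/x-somatic.pcgr.json.gz"
  #   )
  # tidy() filters on `type` and pulls the downloaded `out` paths.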
tidy = function(indir, out_files) { obj_canrep <- UmccriseCanRepTables$new(indir) canrep_parse <- obj_canrep$read() pcgr_json <- out_files |> dplyr::filter(.data$type == "PcgrJsonFile") |> - dplyr::pull("out") - pcgr_json_parse <- PcgrJsonFile$new(pcgr_json)$read() + dplyr::pull("out") |> + PcgrJsonFile$new() |> + read() + conpair_tsv <- out_files |> + dplyr::filter(.data$type == "UmConpairMultiqc") |> + dplyr::pull("out") |> + self$read_conpairmultiqc() d <- canrep_parse - d[["pcgr_json"]] <- pcgr_json_parse[["metrics"]] + d[["pcgr_json"]] <- pcgr_json[["metrics"]] + d[["conpair"]] <- conpair_tsv d + }, + + #' @description Read multiqc_conpair.txt file. + #' + #' @param x (`character(1)`)\cr + #' Path to multiqc_conpair.txt file. + read_conpairmultiqc = function(x) { + um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") + um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) + cnames <- list( + old = c( + "Sample", "concordance_concordance", "concordance_used_markers", + "concordance_total_markers", "concordance_marker_threshold", + "concordance_min_mapping_quality", "concordance_min_base_quality", + "contamination" + ), + new = c( + "sampleid", "contamination", "concordance", "markers_used", + "markers_total", "marker_threshold", + "mapq_min", "baseq_min" + ) + ) + ctypes <- list( + old = c("cddddddd"), + new = c("cddddddd") + ) + if (!file.exists(x)) { + return(empty_tbl(cnames$new, ctypes$new)) + } + d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) + assertthat::assert_that(all(colnames(d1) == cnames$old)) + d1 |> + dplyr::filter(!.data$Sample %in% um_ref_samples) |> + dplyr::relocate("contamination", .after = "Sample") |> + rlang::set_names(cnames$new) } ) ) - - - #' UmccriseCanRepTables R6 Class #' #' @description @@ -123,7 +148,7 @@ Wf_umccrise <- R6::R6Class( #' @examples #' \dontrun{ #' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" +#' p2 <- "L2101566__L2101565" #' p <- file.path(p1, p2) #' obj <- UmccriseCanRepTables$new(p) #' obj$path @@ -171,21 +196,6 @@ UmccriseCanRepTables <- R6::R6Class( cat(bnames, sep = "\n") invisible(self) }, - #' @description Returns file with given pattern from the cancer_report_tables directory. - #' @param pat File pattern to look for. - grep_file = function(pat) { - x <- self$contents |> - dplyr::filter(grepl(pat, .data$path)) |> - dplyr::pull(.data$path) - if (length(x) > 1) { - fnames <- paste(x, collapse = ", ") - cli::cli_abort("More than 1 match found for {pat} ({fnames}). Aborting.") - } - if (length(x) == 0) { - return("") # file.exists("") returns FALSE - } - return(x) - }, #' @description Read `chord.tsv.gz` file output from umccrise. #' @@ -269,15 +279,16 @@ UmccriseCanRepTables <- R6::R6Class( #' @return A list of tibbles. 
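    #' Hedged editorial sketch (not part of this patch): the named list
    #' returned here pairs with write_dracarys_list_of_tbls() from PATCH 02:
    #'   d <- UmccriseCanRepTables$new(p)$read()
    #'   write_dracarys_list_of_tbls(d, out_dir = tempdir(), prefix = "sampleA")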
#' @export read = function() { + x <- self$path # now return all as list elements list( - chord = self$grep_file("-chord\\.tsv\\.gz$") |> self$read_chordtsv(), - hrdetect = self$grep_file("-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), - sigs2015 = self$grep_file("-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), - sigs2020 = self$grep_file("-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), - sigsdbs = self$grep_file("-dbs\\.tsv\\.gz$") |> self$read_sigs(), - sigsindel = self$grep_file("-indel\\.tsv\\.gz$") |> self$read_sigs(), - qcsum = self$grep_file("-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() + chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), + hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), + sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), + sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), + sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), + sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), + qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() ) } ) diff --git a/R/utils.R b/R/utils.R index 45cbb54..912b3c6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -193,6 +193,24 @@ read_jsongz_rjsonio <- function(x, ...) { RJSONIO::fromJSON(x, ...) } +#' Grep File Pattern +#' +#' @param path Path to look for file. +#' @param regexp A regular expression (e.g. [.]csv$) passed on to `grep()` to filter paths. +#' +#' @return The path to the file or an empty string if no match is found. +#' @export +grep_file <- function(path = ".", regexp) { + x <- fs::dir_ls(path, recurse = TRUE, type = "file", regexp = regexp) + if (length(x) > 1) { + fnames <- paste(x, collapse = ", ") + cli::cli_abort("More than 1 match found for {regexp} ({fnames}). Aborting.") + } + if (length(x) == 0) { + return("") # file.exists("") returns FALSE + } + return(x) +} #' @noRd diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd index 9c29fd2..5c43358 100644 --- a/man/UmccriseCanRepTables.Rd +++ b/man/UmccriseCanRepTables.Rd @@ -10,7 +10,7 @@ output from the \code{umccrise} workflow. \examples{ \dontrun{ p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" +p2 <- "L2101566__L2101565" p <- file.path(p1, p2) obj <- UmccriseCanRepTables$new(p) obj$path @@ -34,7 +34,6 @@ obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") \itemize{ \item \href{#method-UmccriseCanRepTables-new}{\code{UmccriseCanRepTables$new()}} \item \href{#method-UmccriseCanRepTables-print}{\code{UmccriseCanRepTables$print()}} -\item \href{#method-UmccriseCanRepTables-grep_file}{\code{UmccriseCanRepTables$grep_file()}} \item \href{#method-UmccriseCanRepTables-read_chordtsv}{\code{UmccriseCanRepTables$read_chordtsv()}} \item \href{#method-UmccriseCanRepTables-read_hrdetecttsv}{\code{UmccriseCanRepTables$read_hrdetecttsv()}} \item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} @@ -78,23 +77,6 @@ Print details about the cancer_report_tables directory. } } \if{html}{\out{
<hr>}}
-\if{html}{\out{<a id="method-UmccriseCanRepTables-grep_file"></a>}}
-\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-grep_file}{}}}
-\subsection{Method \code{grep_file()}}{
-Returns file with given pattern from the cancer_report_tables directory.
-\subsection{Usage}{
-\if{html}{\out{
<div class="r">}}\preformatted{UmccriseCanRepTables$grep_file(pat)}\if{html}{\out{
</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{
<div class="arguments">}}
-\describe{
-\item{\code{pat}}{File pattern to look for.}
-}
-\if{html}{\out{
</div>}}
-}
-}
-\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_chordtsv}{}}} \subsection{Method \code{read_chordtsv()}}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 600cec5..9d6abfb 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -8,29 +8,19 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow } \examples{ \dontrun{ -token <- Sys.getenv("ICA_ACCESS_TOKEN") +token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() SubjectID <- "SBJ01155" SampleID_tumor <- "PRJ211091" gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" gdsdir <- file.path(gdsdir1, "L2101566__L2101565") obj <- Wf_umccrise$new(gdsdir) -gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor) +gds_files <- obj$gds_list( + gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor +) outdir <- file.path(sub("gds://", "", gdsdir)) outdir <- file.path(normalizePath("~/icav1/g"), outdir) out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) - - - - - -p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565" -p <- file.path(p1, p2) -obj <- Wf_umccrise$new(p) -obj$path -obj$contents -d <- obj$read() -obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +tidy1 <- obj$tidy(indir = outdir, out_files = out_files) } } @@ -50,6 +40,7 @@ obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") \item \href{#method-Wf_umccrise-gds_list}{\code{Wf_umccrise$gds_list()}} \item \href{#method-Wf_umccrise-gds_download}{\code{Wf_umccrise$gds_download()}} \item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}} +\item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } } @@ -74,28 +65,91 @@ Create a new Wf_umccrise object. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_list}{}}} \subsection{Method \code{gds_list()}}{ +List Relevant Files In umccrise GDS Directory \subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list(gdsdir, token, SubjectID, SampleID_tumor)}\if{html}{\out{
</div>}}
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list(
+  gdsdir,
+  SubjectID,
+  SampleID_tumor,
+  token = Sys.getenv("ICA_ACCESS_TOKEN")
+)}\if{html}{\out{
</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{gdsdir}}{Path to the \code{umccrise} directory.}
+
+\item{\code{SubjectID}}{The SubjectID of the sample (used to construct path).}
+
+\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (used to construct path).}
+
+\item{\code{token}}{ICA access token.}
+}
+\if{html}{\out{
</div>}}
+}
 }
 \if{html}{\out{
<hr>}}
 \if{html}{\out{<a id="method-Wf_umccrise-gds_download"></a>}}
 \if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_download}{}}}
 \subsection{Method \code{gds_download()}}{
+GDS File Download via API
 \subsection{Usage}{
-\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download(gds_files, outdir, token)}\if{html}{\out{
</div>}}
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download(
+  gds_files,
+  outdir,
+  token = Sys.getenv("ICA_ACCESS_TOKEN")
+)}\if{html}{\out{
</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{gds_files}}{Tibble with bname and file_id for umccrise files.}
+
+\item{\code{outdir}}{Directory to output files (loosely, not in a structured manner).}
+
+\item{\code{token}}{ICA access token.}
+}
+\if{html}{\out{
</div>}}
+}
 }
 \if{html}{\out{
<hr>}}
 \if{html}{\out{<a id="method-Wf_umccrise-tidy"></a>}}
 \if{latex}{\out{\hypertarget{method-Wf_umccrise-tidy}{}}}
 \subsection{Method \code{tidy()}}{
+Tidy up the output files from umccrise
 \subsection{Usage}{
 \if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$tidy(indir, out_files)}\if{html}{\out{
</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{indir}}{Path to the \code{umccrise} directory.}
+
+\item{\code{out_files}}{Tibble with file path, basename, and size.}
+}
+\if{html}{\out{
</div>}}
+}
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-read_conpairmultiqc"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpairmultiqc}{}}}
+\subsection{Method \code{read_conpairmultiqc()}}{
+Read multiqc_conpair.txt file.
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$read_conpairmultiqc(x)}\if{html}{\out{
</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{x}}{(\code{character(1)})\cr
+Path to multiqc_conpair.txt file.}
+}
+\if{html}{\out{
</div>}}
+}
+}
 \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/grep_file.Rd b/man/grep_file.Rd new file mode 100644 index 0000000..d0c588e --- /dev/null +++ b/man/grep_file.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{grep_file} +\alias{grep_file} +\title{Grep File Pattern} +\usage{ +grep_file(path = ".", regexp) +} +\arguments{ +\item{path}{Path to look for file.} + +\item{regexp}{A regular expression (e.g. \link{.}csv$) passed on to \code{grep()} to filter paths.} +} +\value{ +The path to the file or an empty string if no match is found. +} +\description{ +Grep File Pattern +} From 4c5b960582ff09d5cd63084b64d6c72a8075076d Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 6 Sep 2024 17:44:50 +1000 Subject: [PATCH 05/24] add local_files_list_filter_relevant --- NAMESPACE | 1 + R/ica.R | 14 ++++++++++--- R/s3.R | 11 ++++++----- R/utils.R | 26 +++++++++++++++++++++++++ man/gds_files_list_filter_relevant.Rd | 11 ++++++++--- man/local_files_list_filter_relevant.Rd | 25 ++++++++++++++++++++++++ man/s3_files_list_filter_relevant.Rd | 9 ++++++--- 7 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 man/local_files_list_filter_relevant.Rd diff --git a/NAMESPACE b/NAMESPACE index 80cda13..5e7ba34 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,6 +55,7 @@ export(gds_files_list_filter_relevant) export(gds_volumes_list) export(grep_file) export(ica_token_validate) +export(local_files_list_filter_relevant) export(match_regex) export(multiqc_column_map_append) export(multiqc_date_fmt) diff --git a/R/ica.R b/R/ica.R index 0838ef0..3e3587b 100644 --- a/R/ica.R +++ b/R/ica.R @@ -7,14 +7,22 @@ #' @param pattern Pattern to further filter the returned file type tibble. #' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). #' @param page_size Page size (def: 100). -#' @param regexes Tibble with regex and function name. +#' @param regexes Tibble with `regex` and `fun`ction name. #' #' @return A tibble with type, bname, size, file_id, path, and presigned URL. +#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' gds_files_list_filter_relevant(gdsdir) +#' } #' @export -gds_files_list_filter_relevant <- function(gdsdir, token, pattern = NULL, include_url = FALSE, page_size = 100, regexes = DR_FILE_REGEX) { +gds_files_list_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, include_url = FALSE, + page_size = 100, regexes = DR_FILE_REGEX, ...) { pattern <- pattern %||% ".*" # keep all recognisable files by default cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") - d <- dracarys::gds_files_list(gdsdir, token, include_url = include_url, page_size = page_size) |> + d <- dracarys::gds_files_list( + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... + ) |> dplyr::rowwise() |> dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> dplyr::ungroup() |> diff --git a/R/s3.R b/R/s3.R index 6b6daaa..a941cfe 100644 --- a/R/s3.R +++ b/R/s3.R @@ -8,15 +8,16 @@ #' @param max_items The total number of items to return in the command’s output (def: 1000). #' @param presign Include presigned URLs (def: FALSE). #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). +#' @param regexes Tibble with `regex` and `fun`ction name. 
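#' Hedged editorial sketch (bucket/prefix invented): restrict the listing to
#' one file type by passing a one-row `regexes` tibble:
#'   s3_files_list_filter_relevant(
#'     s3dir = "s3://bucket1/project/run1",
#'     regexes = tibble::tribble(~regex, ~fun, "-qc_summary\\.tsv\\.gz$", "UmQcSumFile")
#'   )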
#' -#' @return A tibble with path, date, file size, file type, and presigned URL if requested. +#' @return A tibble with file type, basename, file size, date, full path, and presigned URL if requested. #' @examples #' \dontrun{ #' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +#' s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) { +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200, regexes = DR_FILE_REGEX) { assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) pattern <- pattern %||% ".*" # keep all recognisable files by default b <- sub("s3://(.*?)/.*", "\\1", s3dir) @@ -38,11 +39,11 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 dplyr::rowwise() |> dplyr::mutate( bname = basename(.data$path), - type = purrr::map_chr(.data$bname, match_regex) + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("path", "bname", "date_utc", "size", "type") + dplyr::select("type", "bname", "size", "date_utc", "path") if (presign) { d <- d |> diff --git a/R/utils.R b/R/utils.R index 912b3c6..2d39dc7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,29 @@ +#' List Relevant Files In Local Directory +#' +#' Lists relevant files in a local directory. +#' +#' @param path Path to local directory. +#' @param regexes Tibble with `regex` and `fun`ction name. +#' +#' @return A tibble with type, bname, size, file_id, path, and presigned URL. +#' +#' @examples +#' \dontrun{ +#' path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" +#' local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) +#' } +#' @export +local_files_list_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { + fs::dir_ls(path = path, recurse = TRUE, type = "file") |> + tibble::as_tibble_col(column_name = "path") |> + dplyr::mutate( + bname = basename(.data$path), + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + ) |> + dplyr::filter(!is.na(.data$type)) |> + dplyr::select("type", "bname", "path") +} + #' Print current timestamp for logging #' #' @return Current timestamp as character. diff --git a/man/gds_files_list_filter_relevant.Rd b/man/gds_files_list_filter_relevant.Rd index 784a393..151f1e5 100644 --- a/man/gds_files_list_filter_relevant.Rd +++ b/man/gds_files_list_filter_relevant.Rd @@ -6,11 +6,12 @@ \usage{ gds_files_list_filter_relevant( gdsdir, - token, + token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, include_url = FALSE, page_size = 100, - regexes = DR_FILE_REGEX + regexes = DR_FILE_REGEX, + ... ) } \arguments{ @@ -24,10 +25,14 @@ gds_files_list_filter_relevant( \item{page_size}{Page size (def: 100).} -\item{regexes}{Tibble with regex and function name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ A tibble with type, bname, size, file_id, path, and presigned URL. +\dontrun{ +gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +gds_files_list_filter_relevant(gdsdir) +} } \description{ Lists relevant files in a GDS directory. 
diff --git a/man/local_files_list_filter_relevant.Rd b/man/local_files_list_filter_relevant.Rd new file mode 100644 index 0000000..a54407f --- /dev/null +++ b/man/local_files_list_filter_relevant.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{local_files_list_filter_relevant} +\alias{local_files_list_filter_relevant} +\title{List Relevant Files In Local Directory} +\usage{ +local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) +} +\arguments{ +\item{path}{Path to local directory.} + +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +} +\value{ +A tibble with type, bname, size, file_id, path, and presigned URL. +} +\description{ +Lists relevant files in a local directory. +} +\examples{ +\dontrun{ +path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" +local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) +} +} diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 7fc45f3..29c57e3 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -10,7 +10,8 @@ s3_files_list_filter_relevant( page_size = 1000, max_items = 1000, presign = FALSE, - expiry_sec = 43200 + expiry_sec = 43200, + regexes = DR_FILE_REGEX ) } \arguments{ @@ -25,9 +26,11 @@ s3_files_list_filter_relevant( \item{presign}{Include presigned URLs (def: FALSE).} \item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} + +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with path, date, file size, file type, and presigned URL if requested. +A tibble with file type, basename, file size, date, full path, and presigned URL if requested. } \description{ Lists relevant files in an AWS S3 directory. @@ -35,6 +38,6 @@ Lists relevant files in an AWS S3 directory. \examples{ \dontrun{ s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) } } From 4bc19448022dea3b013198e87cca4a6e31cb3b2a Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 7 Sep 2024 13:56:23 +1000 Subject: [PATCH 06/24] add s3_list_objects_dir --- NAMESPACE | 1 + R/s3.R | 50 ++++++++++++++++++++++++++-- man/s3_files_list_filter_relevant.Rd | 4 +-- man/s3_list_objects_dir.Rd | 28 ++++++++++++++++ 4 files changed, 79 insertions(+), 4 deletions(-) create mode 100644 man/s3_list_objects_dir.Rd diff --git a/NAMESPACE b/NAMESPACE index 5e7ba34..abe8a14 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,6 +70,7 @@ export(multiqc_tidy_json) export(rdf2tab) export(read) export(s3_files_list_filter_relevant) +export(s3_list_objects_dir) export(s3_search) export(session_info_kable) export(time_metrics_process) diff --git a/R/s3.R b/R/s3.R index a941cfe..555d7a9 100644 --- a/R/s3.R +++ b/R/s3.R @@ -1,11 +1,57 @@ +#' List Objects in AWS S3 Directory +#' +#' Returns some or all (up to 1,000) of the objects in an S3 directory. +#' +#' @param s3dir S3 directory. +#' @param max_objects Maximum objects returned. +#' +#' +#' @return A tibble with object basename, size, last modified timestamp, and +#' full S3 path. 
+#' @examples +#' \dontrun{ +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") +#' s3_list_objects_dir(s3dir, max_objects = 15) +#' } +#' @export +s3_list_objects_dir <- function(s3dir, max_objects = 1000) { + assertthat::assert_that(grepl("^s3://", s3dir)) + bucket <- sub("s3://(.*?)/.*", "\\1", s3dir) + prefix <- sub("s3://(.*?)/(.*)", "\\2", s3dir) + s3 <- paws.storage::s3() + l <- s3$list_objects_v2(Bucket = bucket, Prefix = prefix, MaxKeys = max_objects) + assertthat::assert_that(all(c("Contents", "KeyCount") %in% names(l))) + cols_sel <- c("bname", "size", "lastmodified", "path") + # handle no results + if (l[["KeyCount"]] == 0) { + return(empty_tbl(cnames = cols_sel, ctypes = "cccc")) + } + d <- l[["Contents"]] |> + purrr::map(\(x) tibble::tibble( + Key = x[["Key"]], + Size = x[["Size"]], + lastmodified = x[["LastModified"]] + )) |> + dplyr::bind_rows() |> + dplyr::mutate( + path = glue("s3://{bucket}/{.data$Key}"), + bname = basename(.data$path), + size = fs::as_fs_bytes(.data$Size) + ) |> + dplyr::select(dplyr::all_of(cols_sel)) + return(d) +} + #' List Relevant Files In AWS S3 Directory #' #' Lists relevant files in an AWS S3 directory. #' #' @param s3dir S3 directory. #' @param pattern Pattern to further filter the returned file type tibble. -#' @param page_size The size of each page to get in the AWS service call (def: 1000). -#' @param max_items The total number of items to return in the command’s output (def: 1000). +#' @param page_size The size of each page to get in the AWS service call. +#' @param max_items The total number of items to return in the command’s output. #' @param presign Include presigned URLs (def: FALSE). #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). #' @param regexes Tibble with `regex` and `fun`ction name. diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 29c57e3..a3ded1f 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -19,9 +19,9 @@ s3_files_list_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{page_size}{The size of each page to get in the AWS service call (def: 1000).} +\item{page_size}{The size of each page to get in the AWS service call.} -\item{max_items}{The total number of items to return in the command’s output (def: 1000).} +\item{max_items}{The total number of items to return in the command’s output.} \item{presign}{Include presigned URLs (def: FALSE).} diff --git a/man/s3_list_objects_dir.Rd b/man/s3_list_objects_dir.Rd new file mode 100644 index 0000000..b3c3b2b --- /dev/null +++ b/man/s3_list_objects_dir.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_list_objects_dir} +\alias{s3_list_objects_dir} +\title{List Objects in AWS S3 Directory} +\usage{ +s3_list_objects_dir(s3dir, max_objects = 1000) +} +\arguments{ +\item{s3dir}{S3 directory.} + +\item{max_objects}{Maximum objects returned.} +} +\value{ +A tibble with object basename, size, last modified timestamp, and +full S3 path. +} +\description{ +Returns some or all (up to 1,000) of the objects in an S3 directory. 
+} +\examples{ +\dontrun{ +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") +s3_list_objects_dir(s3dir, max_objects = 15) +} +} From aa10e2e833cc1a397b4d3b5edcca446d1e3a51be Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 7 Sep 2024 16:09:36 +1000 Subject: [PATCH 07/24] use s3_file_presignedurl with s3v4 sig version --- NAMESPACE | 1 + R/s3.R | 61 +++++++++++++++++----------- man/s3_file_presignedurl.Rd | 29 +++++++++++++ man/s3_files_list_filter_relevant.Rd | 11 +++-- 4 files changed, 72 insertions(+), 30 deletions(-) create mode 100644 man/s3_file_presignedurl.Rd diff --git a/NAMESPACE b/NAMESPACE index abe8a14..c069045 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,6 +69,7 @@ export(multiqc_parse_xyline_plot_contig_cvg) export(multiqc_tidy_json) export(rdf2tab) export(read) +export(s3_file_presignedurl) export(s3_files_list_filter_relevant) export(s3_list_objects_dir) export(s3_search) diff --git a/R/s3.R b/R/s3.R index 555d7a9..520ba2c 100644 --- a/R/s3.R +++ b/R/s3.R @@ -63,46 +63,59 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200, regexes = DR_FILE_REGEX) { - assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, + presign = FALSE, expiry_sec = 43200, + regexes = DR_FILE_REGEX) { + assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) + d_all <- s3_list_objects_dir(s3dir, max_objects = max_objects) + if (nrow(d_all) == 0) { + return(d_all) + } pattern <- pattern %||% ".*" # keep all recognisable files by default - b <- sub("s3://(.*?)/.*", "\\1", s3dir) - p <- sub("s3://(.*?)/(.*)", "\\2", s3dir) - cmd <- glue( - "aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ", - "--max-items {max_items} --page-size {page_size}" - ) - l <- system(cmd, intern = TRUE) - j <- jsonlite::fromJSON(l) - assertthat::assert_that("Contents" %in% names(j)) - d <- j[["Contents"]] |> - tibble::as_tibble() |> - dplyr::mutate( - path = glue("s3://{b}/{.data$Key}"), - date_utc = .data$LastModified, - size = fs::as_fs_bytes(.data$Size) - ) |> + d <- d_all |> dplyr::rowwise() |> dplyr::mutate( - bname = basename(.data$path), type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("type", "bname", "size", "date_utc", "path") + dplyr::select("type", "bname", "size", "lastmodified", "path") if (presign) { + s3_client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) d <- d |> dplyr::rowwise() |> - dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |> + dplyr::mutate(presigned_url = s3_file_presignedurl( + client = s3_client, s3_path = .data$path, expiry_seconds = expiry_sec + )) |> dplyr::ungroup() } d } -s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { - p <- system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE) - p +#' S3 Generate Presigned URL +#' +#' @param client S3 client. Make sure you use `signature_version = "s3v4"` (see example). 
+#' @param s3path Full path to S3 object. +#' @param expiry_seconds Number of seconds the presigned URL is valid for (3600 = 1 hour). +#' +#' @return An S3 presigned URL. +#' @examples +#' \dontrun{ +#' client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) +#' s3path <- "s3://bucket1/path/to/file.tsv" +#' s3_file_presignedurl(client, s3path) +#' } +#' +#' @export +s3_file_presignedurl <- function(client, s3path, expiry_seconds = 3600) { + bucket <- sub("s3://(.*?)/.*", "\\1", s3path) + prefix <- sub("s3://(.*?)/(.*)", "\\2", s3path) + client$generate_presigned_url( + client_method = "get_object", + params = list(Bucket = bucket, Key = prefix), + expires_in = expiry_seconds + ) } #' Search AWS S3 Objects diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd new file mode 100644 index 0000000..b041252 --- /dev/null +++ b/man/s3_file_presignedurl.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_file_presignedurl} +\alias{s3_file_presignedurl} +\title{S3 Generate Presigned URL} +\usage{ +s3_file_presignedurl(client, s3path, expiry_seconds = 3600) +} +\arguments{ +\item{s3path}{Full path to S3 object.} + +\item{expiry_seconds}{Number of seconds the presigned URL is valid for (3600 = 1 hour).} + +\item{svc}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} +} +\value{ +An S3 presigned URL. +} +\description{ +S3 Generate Presigned URL +} +\examples{ +\dontrun{ +client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) +s3path <- "s3://bucket1/path/to/file.tsv" +s3_file_presignedurl(client, s3path) +} + +} diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index a3ded1f..57caf1d 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -7,8 +7,7 @@ s3_files_list_filter_relevant( s3dir, pattern = NULL, - page_size = 1000, - max_items = 1000, + max_objects = 100, presign = FALSE, expiry_sec = 43200, regexes = DR_FILE_REGEX @@ -19,15 +18,15 @@ s3_files_list_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{page_size}{The size of each page to get in the AWS service call.} - -\item{max_items}{The total number of items to return in the command’s output.} - \item{presign}{Include presigned URLs (def: FALSE).} \item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} + +\item{page_size}{The size of each page to get in the AWS service call.} + +\item{max_items}{The total number of items to return in the command’s output.} } \value{ A tibble with file type, basename, file size, date, full path, and presigned URL if requested. From 633fd714c641e136f0410d11889d84f9b913a692 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 7 Sep 2024 18:20:07 +1000 Subject: [PATCH 08/24] s3 wrapper fixes --- R/s3.R | 39 +++++++++++++++------------- man/dr_s3_download.Rd | 8 +++--- man/s3_file_presignedurl.Rd | 4 +-- man/s3_files_list_filter_relevant.Rd | 10 +++---- 4 files changed, 32 insertions(+), 29 deletions(-) diff --git a/R/s3.R b/R/s3.R index 520ba2c..2885615 100644 --- a/R/s3.R +++ b/R/s3.R @@ -50,10 +50,9 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' #' @param s3dir S3 directory. #' @param pattern Pattern to further filter the returned file type tibble. 
-#' @param page_size The size of each page to get in the AWS service call. -#' @param max_items The total number of items to return in the command’s output. +#' @param max_objects The total number of objects to return. #' @param presign Include presigned URLs (def: FALSE). -#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). +#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). #' @param regexes Tibble with `regex` and `fun`ction name. #' #' @return A tibble with file type, basename, file size, date, full path, and presigned URL if requested. @@ -64,10 +63,10 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' } #' @export s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, - presign = FALSE, expiry_sec = 43200, + presign = FALSE, expiry_sec = 3600, regexes = DR_FILE_REGEX) { assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) - d_all <- s3_list_objects_dir(s3dir, max_objects = max_objects) + d_all <- s3_list_objects_dir(s3dir = s3dir, max_objects = max_objects) if (nrow(d_all) == 0) { return(d_all) } @@ -82,11 +81,14 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 dplyr::select("type", "bname", "size", "lastmodified", "path") if (presign) { + if (nrow(d) == 0) { + return(d) + } s3_client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) d <- d |> dplyr::rowwise() |> dplyr::mutate(presigned_url = s3_file_presignedurl( - client = s3_client, s3_path = .data$path, expiry_seconds = expiry_sec + client = s3_client, s3path = .data$path, expiry_seconds = expiry_sec )) |> dplyr::ungroup() } @@ -166,36 +168,37 @@ s3_search <- function(pat, rows) { #' #' @param s3dir Full path to S3 directory. #' @param outdir Path to output directory. -#' @param page_size Page size (def: 100). +#' @param max_objects Maximum objects returned in file listing. #' @param pattern Pattern to further filter the returned file type tibble. #' @param regexes Tibble with regex and function name. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). 
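#' Hedged editorial sketch (paths invented): a dry run just lists the
#' relevant files without downloading:
#'   dr_s3_download(s3dir = "s3://bucket1/run1", outdir = tempdir(), dryrun = TRUE)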
#' @examples +#' \dontrun{ #' s3dir <- file.path( #' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", #' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" #' ) #' outdir <- sub("s3:/", "~/s3", s3dir) -#' +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) +#' } #' @export -dr_s3_download <- function(s3dir, outdir, page_size = 100, pattern = NULL, regexes = DR_FILE_REGEX, dryrun = FALSE) { +dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, + regexes = DR_FILE_REGEX, dryrun = FALSE) { s3 <- paws.storage::s3() e <- emojifont::emoji fs::dir_create(outdir) - d <- s3_files_list_filter_relevant(s3dir, pattern = NULL, page_size = page_size, max_items = 1000, presign = FALSE, expiry_sec = 43200) + d <- s3_files_list_filter_relevant( + s3dir = s3dir, pattern = NULL, max_objects = max_objects, presign = FALSE, regexes = regexes + ) d <- d |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::select("type", "size", "path", "bname") + dplyr::select("type", "size", "path", "bname") |> + dplyr::mutate(out = file.path(outdir, .data$bname)) # download recognisable dracarys files to outdir/{bname} - pattern <- pattern %||% ".*" # keep all recognisable files - d_filt <- d |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::mutate(out = file.path(outdir, .data$bname)) if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") - d_filt |> + d |> dplyr::rowwise() |> dplyr::mutate( s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), @@ -208,7 +211,7 @@ dr_s3_download <- function(s3dir, outdir, page_size = 100, pattern = NULL, regex ) } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") - d_filt |> + d |> dplyr::select("path", "type", "size") |> as.data.frame() |> print() diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 76f0339..985d415 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -7,7 +7,7 @@ dr_s3_download( s3dir, outdir, - page_size = 100, + max_objects = 100, pattern = NULL, regexes = DR_FILE_REGEX, dryrun = FALSE @@ -18,7 +18,7 @@ dr_s3_download( \item{outdir}{Path to output directory.} -\item{page_size}{Page size (def: 100).} +\item{max_objects}{Maximum objects returned in file listing.} \item{pattern}{Pattern to further filter the returned file type tibble.} @@ -31,10 +31,12 @@ download them).} Download only S3 files that can be processed by dracarys. } \examples{ +\dontrun{ s3dir <- file.path( "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" ) outdir <- sub("s3:/", "~/s3", s3dir) - +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun=F) +} } diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd index b041252..1598a92 100644 --- a/man/s3_file_presignedurl.Rd +++ b/man/s3_file_presignedurl.Rd @@ -7,11 +7,11 @@ s3_file_presignedurl(client, s3path, expiry_seconds = 3600) } \arguments{ +\item{client}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} + \item{s3path}{Full path to S3 object.} \item{expiry_seconds}{Number of seconds the presigned URL is valid for (3600 = 1 hour).} - -\item{svc}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} } \value{ An S3 presigned URL. 
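As a usage sketch of the presigned-URL helper above: the bucket and key below are hypothetical, and valid AWS credentials are assumed to be available in the environment.

# generate a presigned URL and read the object directly, without downloading it first
# (hypothetical S3 path; assumes AWS credentials are configured)
require(dracarys)
require(paws.storage)
client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4"))
url <- s3_file_presignedurl(client, s3path = "s3://bucket1/path/to/file.tsv", expiry_seconds = 3600)
dat <- readr::read_tsv(url)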
diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 57caf1d..927e982 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -9,7 +9,7 @@ s3_files_list_filter_relevant( pattern = NULL, max_objects = 100, presign = FALSE, - expiry_sec = 43200, + expiry_sec = 3600, regexes = DR_FILE_REGEX ) } @@ -18,15 +18,13 @@ s3_files_list_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} +\item{max_objects}{The total number of objects to return.} + \item{presign}{Include presigned URLs (def: FALSE).} -\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} +\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated).} \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} - -\item{page_size}{The size of each page to get in the AWS service call.} - -\item{max_items}{The total number of items to return in the command’s output.} } \value{ A tibble with file type, basename, file size, date, full path, and presigned URL if requested. From 6323c627665041f42f462a1bfd10201798b59ed2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 01:04:40 +1000 Subject: [PATCH 09/24] keep gds funcs in sync with s3 --- NAMESPACE | 8 +- R/ica.R | 338 +++++++++--------- R/s3.R | 125 +++---- man/dr_s3_download.Rd | 2 +- ...ds_files_list.Rd => gds_list_files_dir.Rd} | 28 +- ...t.Rd => gds_list_files_filter_relevant.Rd} | 13 +- ...st_objects_dir.Rd => s3_list_files_dir.Rd} | 8 +- ...nt.Rd => s3_list_files_filter_relevant.Rd} | 11 +- 8 files changed, 262 insertions(+), 271 deletions(-) rename man/{gds_files_list.Rd => gds_list_files_dir.Rd} (63%) rename man/{gds_files_list_filter_relevant.Rd => gds_list_files_filter_relevant.Rd} (72%) rename man/{s3_list_objects_dir.Rd => s3_list_files_dir.Rd} (80%) rename man/{s3_files_list_filter_relevant.Rd => s3_list_files_filter_relevant.Rd} (75%) diff --git a/NAMESPACE b/NAMESPACE index c069045..ef0ba80 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,9 +49,9 @@ export(file_regex_getter) export(gds_file_download) export(gds_file_download_api) export(gds_file_presignedurl) -export(gds_files_list) export(gds_files_list_fastq) -export(gds_files_list_filter_relevant) +export(gds_list_files_dir) +export(gds_list_files_filter_relevant) export(gds_volumes_list) export(grep_file) export(ica_token_validate) @@ -70,8 +70,8 @@ export(multiqc_tidy_json) export(rdf2tab) export(read) export(s3_file_presignedurl) -export(s3_files_list_filter_relevant) -export(s3_list_objects_dir) +export(s3_list_files_dir) +export(s3_list_files_filter_relevant) export(s3_search) export(session_info_kable) export(time_metrics_process) diff --git a/R/ica.R b/R/ica.R index 3e3587b..059377e 100644 --- a/R/ica.R +++ b/R/ica.R @@ -1,158 +1,21 @@ -#' List Relevant Files In GDS Directory +#' List files in ICAv1 GDS Directory #' -#' Lists relevant files in a GDS directory. -#' -#' @param gdsdir GDS directory. -#' @param token ICA access token. -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). -#' @param page_size Page size (def: 100). -#' @param regexes Tibble with `regex` and `fun`ction name. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. 
-#' \dontrun{ -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' gds_files_list_filter_relevant(gdsdir) -#' } -#' @export -gds_files_list_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, include_url = FALSE, - page_size = 100, regexes = DR_FILE_REGEX, ...) { - pattern <- pattern %||% ".*" # keep all recognisable files by default - cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") - d <- dracarys::gds_files_list( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... - ) |> - dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(dplyr::any_of(cols_sel)) - d -} - -#' List FASTQs In GDS Directory -#' -#' @param gdsdir GDS directory. -#' @param token ICA access token. -#' @param include_url Include presigned URLs to all files within the GDS directory. -#' @param page_size Page size. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. -#' -#' @examples -#' \dontrun{ -#' prim <- "gds://production/primary_data" -#' run <- "240719_A00130_0323_BHMCYHDSXC/202407205bad380d/BiModal_BM-5L" -#' gdsdir <- file.path(prim, run) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' include_url <- F -#' page_size <- 100 -#' gds_files_list_fastq(gdsdir, token, include_url, page_size) -#' } -#' @export -gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = 100) { - fq_regex <- tibble::tribble( - ~regex, ~fun, - "fastq\\.gz$", "FASTQ" - ) - g <- gds_files_list_filter_relevant( - gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, - page_size = page_size, regexes = fq_regex - ) - assertthat::assert_that( - all(colnames(g) == c("type", "bname", "size", "file_id", "path")) - ) - g |> - dplyr::mutate( - size_chr = as.character(.data$size), - size_num = as.numeric(.data$size) - ) |> - dplyr::select( - "type", "bname", "size", "size_chr", "size_num", "file_id", "path" - ) -} - -#' GDS File Presigned URL -#' -#' Returns presigned URL of given GDS file. -#' -#' @param gds_fileid GDS file ID. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Presigned URL if valid. -#' @export -gds_file_presignedurl <- function(gds_fileid, token) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - url <- glue("{base_url}/files/{gds_fileid}") - res <- httr::GET( - url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] - assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) - presigned_url -} - -#' GDS File Download via API -#' -#' @param gds_fileid GDS file ID. -#' @param out_file Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). 
-#' -#' @examples -#' \dontrun{ -#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -#' out <- tempfile() -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' } -#' @export -gds_file_download_api <- function(gds_fileid, out_file, token) { - presigned_url <- gds_file_presignedurl(gds_fileid, token) - # keep quiet instead of logging presigned urls - status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) - assertthat::assert_that(status_code == 0) - out_file -} - -#' GDS File Download via CLI -#' -#' @param gds Full path to GDS file. -#' @param out Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @export -gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - token <- ica_token_validate(token) - system(glue("ica files download {gds} {out} --access-token {token}")) -} - -#' GDS Files List -#' -#' List files on ICA GDS filesystem. +#' Lists files in a GDS directory. #' #' @param gdsdir Full path to GDS directory. #' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10). -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). +#' @param page_size Page size (def: 10 via ICA API). +#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API). #' @param page_token Page token (def: NULL). Used internally for recursion. #' @param no_recurse Do not recurse through the file list i.e. just give the first items #' without recursing further down the list using . #' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE). +#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). #' -#' @return Tibble with file basename, file size, file full data path, file dir name. +#' @return A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL. 
#' @examples #' \dontrun{ #' gdsdir <- file.path( -#' "gds://production/primary_data", -#' "240322_A00130_0290_BH5HLLDSXC/20240323f56ec5a5/WGS_TsqNano" -#' ) -#' gdsdir <- file.path( -#' "gds://bssh.acddbfda498038ed99fa94fe79523959/Runs", -#' "240322_A00130_0290_BH5HLLDSXC_r.3TbcOsEKZUyetygkqIOXcg/InterOp" -#' ) -#' gdsdir <- file.path( #' "gds://production/analysis_data/SBJ00699/umccrise", #' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" #' ) @@ -162,11 +25,11 @@ gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) #' page_token <- NULL #' no_recurse <- TRUE #' recursive <- NULL -#' gds_files_list(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) #' } #' @export -gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, - no_recurse = TRUE, page_token = NULL, recursive = NULL) { +gds_list_files_dir <- function(gdsdir, token, page_size = NULL, include_url = FALSE, + no_recurse = TRUE, page_token = NULL, recursive = NULL) { assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) assertthat::assert_that(is.null(recursive) || is.logical(recursive)) token <- ica_token_validate(token) @@ -213,7 +76,10 @@ gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, } } # endif d <- j[["items"]] |> - purrr::map(\(x) c(file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], presigned_url = x[["presignedUrl"]])) |> + purrr::map(\(x) c( + file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], + lastmodified = x[["timeModified"]], presigned_url = x[["presignedUrl"]] + )) |> dplyr::bind_rows() if (nrow(d) == 0) { # We've iterated through all available items, and the next page has 0 items. @@ -224,12 +90,11 @@ gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, dplyr::mutate( size = fs::as_fs_bytes(.data$size), bname = basename(.data$path), - path = glue("gds://{volname}{.data$path}"), - dname = basename(dirname(.data$path)) + path = glue("gds://{volname}{.data$path}") ) |> - dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "path", "dname", "presigned_url"))) + dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "lastmodified", "path", "presigned_url"))) if (!is.null(j[["nextPageToken"]]) && !no_recurse) { - res2 <- gds_files_list( + res2 <- gds_list_files_dir( gdsdir = gdsdir, token = token, page_size = NULL, include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], recursive = NULL @@ -239,30 +104,40 @@ gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, res } -#' List GDS Volumes +#' List Relevant Files In ICAv1 GDS Directory #' -#' Lists GDS volumes accessible by the provided ICA token. +#' Lists relevant files in a GDS directory. #' -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10). +#' @param gdsdir GDS directory. +#' @param token ICA access token. +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). +#' @param page_size Page size (def: 100). +#' @param regexes Tibble with `regex` and `fun`ction name. #' -#' @return A tibble with vol name and vol id. +#' @return A tibble with file type, basename, size, file_id, full path, +#' and presigned URL if requested. 
+#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' gds_list_files_filter_relevant(gdsdir) +#' } #' @export -gds_volumes_list <- function(token, page_size = 10) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - query_url <- glue("{base_url}/volumes?pageSize={page_size}") - - res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - purrr::map_df(j[["items"]], function(x) c(name = x[["name"]], id = x[["id"]])) +gds_list_files_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, include_url = FALSE, + page_size = 100, regexes = DR_FILE_REGEX, ...) { + pattern <- pattern %||% ".*" # keep all recognisable files by default + cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") + d <- dracarys::gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... + ) |> + dplyr::rowwise() |> + dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(dplyr::any_of(cols_sel)) + d } - #' dracarys GDS Download #' #' Download only GDS files that can be processed by dracarys. @@ -283,12 +158,12 @@ dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NU dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL) { e <- emojifont::emoji fs::dir_create(outdir) - d <- gds_files_list( + d <- gds_list_files_dir( gdsdir = gdsdir, token = token, page_size = page_size, no_recurse = FALSE, recursive = recursive ) |> dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::select("file_id", "dname", "type", "size", "path", "bname") + dplyr::select("file_id", "type", "size", "path", "bname") # download recognisable dracarys files to outdir/{bname} pattern <- pattern %||% ".*" # keep all recognisable files @@ -309,6 +184,127 @@ dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NU } } + +#' GDS File Presigned URL +#' +#' Returns presigned URL of given GDS file. +#' +#' @param gds_fileid GDS file ID. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @return Presigned URL if valid. +#' @export +gds_file_presignedurl <- function(gds_fileid, token) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + url <- glue("{base_url}/files/{gds_fileid}") + res <- httr::GET( + url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] + assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) + presigned_url +} + +#' List FASTQs In GDS Directory +#' +#' @param gdsdir GDS directory. +#' @param token ICA access token. +#' @param include_url Include presigned URLs to all files within the GDS directory. +#' @param page_size Page size. +#' +#' @return A tibble with type, bname, size, file_id, path, and presigned URL. 
+#' +#' @examples +#' \dontrun{ +#' prim <- "gds://production/primary_data" +#' run <- "240719_A00130_0323_BHMCYHDSXC/202407205bad380d/BiModal_BM-5L" +#' gdsdir <- file.path(prim, run) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' include_url <- F +#' page_size <- 100 +#' gds_files_list_fastq(gdsdir, token, include_url, page_size) +#' } +#' @export +gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = 100) { + fq_regex <- tibble::tribble( + ~regex, ~fun, + "fastq\\.gz$", "FASTQ" + ) + g <- gds_list_files_filter_relevant( + gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, + page_size = page_size, regexes = fq_regex + ) + assertthat::assert_that( + all(colnames(g) == c("type", "bname", "size", "file_id", "path")) + ) + g |> + dplyr::mutate( + size_chr = as.character(.data$size), + size_num = as.numeric(.data$size) + ) |> + dplyr::select( + "type", "bname", "size", "size_chr", "size_num", "file_id", "path" + ) +} + +#' GDS File Download via API +#' +#' @param gds_fileid GDS file ID. +#' @param out_file Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' +#' @examples +#' \dontrun{ +#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" +#' out <- tempfile() +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' } +#' @export +gds_file_download_api <- function(gds_fileid, out_file, token) { + presigned_url <- gds_file_presignedurl(gds_fileid, token) + # keep quiet instead of logging presigned urls + status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) + assertthat::assert_that(status_code == 0) + out_file +} + +#' GDS File Download via CLI +#' +#' @param gds Full path to GDS file. +#' @param out Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @export +gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + token <- ica_token_validate(token) + system(glue("ica files download {gds} {out} --access-token {token}")) +} + +#' List GDS Volumes +#' +#' Lists GDS volumes accessible by the provided ICA token. +#' +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param page_size Page size (def: 10). +#' +#' @return A tibble with vol name and vol id. +#' @export +gds_volumes_list <- function(token, page_size = 10) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + query_url <- glue("{base_url}/volumes?pageSize={page_size}") + + res <- httr::GET( + query_url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + j <- jsonlite::fromJSON(httr::content(x = res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) + purrr::map_df(j[["items"]], function(x) c(name = x[["name"]], id = x[["id"]])) +} + + #' Validate ICA access token #' #' Validates ICA access token by parsing it and checking its expiration date. 
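As a usage sketch of the GDS listing helpers above, a custom regex tibble can restrict the listing to FASTQs, mirroring what gds_files_list_fastq does internally. The GDS path below is hypothetical and a valid ICA_ACCESS_TOKEN is assumed.

require(dracarys)
# keep only FASTQs; `fun` is just the label written into the `type` column here
fq_regex <- tibble::tribble(
  ~regex, ~fun,
  "fastq\\.gz$", "FASTQ"
)
gds_list_files_filter_relevant(
  gdsdir = "gds://production/primary_data/run1/fastqs", # hypothetical
  regexes = fq_regex,
  page_size = 100,
  include_url = FALSE
)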
diff --git a/R/s3.R b/R/s3.R index 2885615..fcd7802 100644 --- a/R/s3.R +++ b/R/s3.R @@ -13,10 +13,10 @@ #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" #' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" #' s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") -#' s3_list_objects_dir(s3dir, max_objects = 15) +#' s3_list_files_dir(s3dir, max_objects = 15) #' } #' @export -s3_list_objects_dir <- function(s3dir, max_objects = 1000) { +s3_list_files_dir <- function(s3dir, max_objects = 1000) { assertthat::assert_that(grepl("^s3://", s3dir)) bucket <- sub("s3://(.*?)/.*", "\\1", s3dir) prefix <- sub("s3://(.*?)/(.*)", "\\2", s3dir) @@ -55,18 +55,19 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). #' @param regexes Tibble with `regex` and `fun`ction name. #' -#' @return A tibble with file type, basename, file size, date, full path, and presigned URL if requested. +#' @return A tibble with file type, basename, size, date, full path, +#' and presigned URL if requested. #' @examples #' \dontrun{ #' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) +#' s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, +s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, presign = FALSE, expiry_sec = 3600, regexes = DR_FILE_REGEX) { assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) - d_all <- s3_list_objects_dir(s3dir = s3dir, max_objects = max_objects) + d_all <- s3_list_files_dir(s3dir = s3dir, max_objects = max_objects) if (nrow(d_all) == 0) { return(d_all) } @@ -95,6 +96,62 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 d } +#' dracarys S3 Download +#' +#' Download only S3 files that can be processed by dracarys. +#' +#' @param s3dir Full path to S3 directory. +#' @param outdir Path to output directory. +#' @param max_objects Maximum objects returned in file listing. +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param regexes Tibble with regex and function name. +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). 
+#' @examples
+#' \dontrun{
+#' s3dir <- file.path(
+#'   "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596",
+#'   "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067"
+#' )
+#' outdir <- sub("s3:/", "~/s3", s3dir)
+#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F)
+#' }
+#' @export
+dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL,
+                           regexes = DR_FILE_REGEX, dryrun = FALSE) {
+  s3 <- paws.storage::s3()
+  e <- emojifont::emoji
+  fs::dir_create(outdir)
+  d <- s3_list_files_filter_relevant(
+    s3dir = s3dir, pattern = pattern, max_objects = max_objects, presign = FALSE, regexes = regexes
+  )
+  d <- d |>
+    dplyr::select("type", "size", "path", "bname") |>
+    dplyr::mutate(out = file.path(outdir, .data$bname))
+
+  # download recognisable dracarys files to outdir/{bname}
+  if (!dryrun) {
+    cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}")
+    d |>
+      dplyr::rowwise() |>
+      dplyr::mutate(
+        s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path),
+        s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path),
+        dl = list(
+          s3$download_file(
+            Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out
+          )
+        )
+      )
+  } else {
+    cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}")
+    d |>
+      dplyr::select("path", "type", "size") |>
+      as.data.frame() |>
+      print()
+  }
+}
+
 #' S3 Generate Presigned URL
 #'
 #' @param client S3 client. Make sure you use `signature_version = "s3v4"` (see example).
@@ -161,59 +218,3 @@ s3_search <- function(pat, rows) {
   ) |>
     dplyr::select("path", "size", "date_aest", "id", "unique_hash")
 }
-
-#' dracarys S3 Download
-#'
-#' Download only S3 files that can be processed by dracarys.
-#'
-#' @param s3dir Full path to S3 directory.
-#' @param outdir Path to output directory.
-#' @param max_objects Maximum objects returned in file listing.
-#' @param pattern Pattern to further filter the returned file type tibble.
-#' @param regexes Tibble with regex and function name.
-#' @param dryrun If TRUE, just list the files that will be downloaded (don't
-#' download them).
-#' @examples -#' \dontrun{ -#' s3dir <- file.path( -#' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", -#' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -#' ) -#' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) -#' } -#' @export -dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, - regexes = DR_FILE_REGEX, dryrun = FALSE) { - s3 <- paws.storage::s3() - e <- emojifont::emoji - fs::dir_create(outdir) - d <- s3_files_list_filter_relevant( - s3dir = s3dir, pattern = NULL, max_objects = max_objects, presign = FALSE, regexes = regexes - ) - d <- d |> - dplyr::select("type", "size", "path", "bname") |> - dplyr::mutate(out = file.path(outdir, .data$bname)) - - # download recognisable dracarys files to outdir/{bname} - if (!dryrun) { - cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") - d |> - dplyr::rowwise() |> - dplyr::mutate( - s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), - s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path), - dl = list( - s3$download_file( - Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out - ) - ) - ) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") - d |> - dplyr::select("path", "type", "size") |> - as.data.frame() |> - print() - } -} diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 985d415..d572c10 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -37,6 +37,6 @@ s3dir <- file.path( "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" ) outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun=F) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) } } diff --git a/man/gds_files_list.Rd b/man/gds_list_files_dir.Rd similarity index 63% rename from man/gds_files_list.Rd rename to man/gds_list_files_dir.Rd index 30e3bd9..2a80f47 100644 --- a/man/gds_files_list.Rd +++ b/man/gds_list_files_dir.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ica.R -\name{gds_files_list} -\alias{gds_files_list} -\title{GDS Files List} +\name{gds_list_files_dir} +\alias{gds_list_files_dir} +\title{List files in ICAv1 GDS Directory} \usage{ -gds_files_list( +gds_list_files_dir( gdsdir, token, page_size = NULL, @@ -19,9 +19,9 @@ gds_files_list( \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{page_size}{Page size (def: 10).} +\item{page_size}{Page size (def: 10 via ICA API).} -\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE).} +\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).} \item{no_recurse}{Do not recurse through the file list i.e. just give the first items without recursing further down the list using .} @@ -29,24 +29,16 @@ without recursing further down the list using .} \item{page_token}{Page token (def: NULL). Used internally for recursion.} \item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE).} +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \value{ -Tibble with file basename, file size, file full data path, file dir name. +A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL. 
} \description{ -List files on ICA GDS filesystem. +Lists files in a GDS directory. } \examples{ \dontrun{ -gdsdir <- file.path( - "gds://production/primary_data", - "240322_A00130_0290_BH5HLLDSXC/20240323f56ec5a5/WGS_TsqNano" -) -gdsdir <- file.path( - "gds://bssh.acddbfda498038ed99fa94fe79523959/Runs", - "240322_A00130_0290_BH5HLLDSXC_r.3TbcOsEKZUyetygkqIOXcg/InterOp" -) gdsdir <- file.path( "gds://production/analysis_data/SBJ00699/umccrise", "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" @@ -57,6 +49,6 @@ include_url <- TRUE page_token <- NULL no_recurse <- TRUE recursive <- NULL -gds_files_list(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) } } diff --git a/man/gds_files_list_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd similarity index 72% rename from man/gds_files_list_filter_relevant.Rd rename to man/gds_list_files_filter_relevant.Rd index 151f1e5..4ae0612 100644 --- a/man/gds_files_list_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ica.R -\name{gds_files_list_filter_relevant} -\alias{gds_files_list_filter_relevant} -\title{List Relevant Files In GDS Directory} +\name{gds_list_files_filter_relevant} +\alias{gds_list_files_filter_relevant} +\title{List Relevant Files In ICAv1 GDS Directory} \usage{ -gds_files_list_filter_relevant( +gds_list_files_filter_relevant( gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, @@ -28,10 +28,11 @@ gds_files_list_filter_relevant( \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with type, bname, size, file_id, path, and presigned URL. +A tibble with file type, basename, size, file_id, full path, +and presigned URL if requested. \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -gds_files_list_filter_relevant(gdsdir) +gds_list_files_filter_relevant(gdsdir) } } \description{ diff --git a/man/s3_list_objects_dir.Rd b/man/s3_list_files_dir.Rd similarity index 80% rename from man/s3_list_objects_dir.Rd rename to man/s3_list_files_dir.Rd index b3c3b2b..055692f 100644 --- a/man/s3_list_objects_dir.Rd +++ b/man/s3_list_files_dir.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/s3.R -\name{s3_list_objects_dir} -\alias{s3_list_objects_dir} +\name{s3_list_files_dir} +\alias{s3_list_files_dir} \title{List Objects in AWS S3 Directory} \usage{ -s3_list_objects_dir(s3dir, max_objects = 1000) +s3_list_files_dir(s3dir, max_objects = 1000) } \arguments{ \item{s3dir}{S3 directory.} @@ -23,6 +23,6 @@ Returns some or all (up to 1,000) of the objects in an S3 directory. 
p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") -s3_list_objects_dir(s3dir, max_objects = 15) +s3_list_files_dir(s3dir, max_objects = 15) } } diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd similarity index 75% rename from man/s3_files_list_filter_relevant.Rd rename to man/s3_list_files_filter_relevant.Rd index 927e982..1904ea5 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/s3.R -\name{s3_files_list_filter_relevant} -\alias{s3_files_list_filter_relevant} +\name{s3_list_files_filter_relevant} +\alias{s3_list_files_filter_relevant} \title{List Relevant Files In AWS S3 Directory} \usage{ -s3_files_list_filter_relevant( +s3_list_files_filter_relevant( s3dir, pattern = NULL, max_objects = 100, @@ -27,7 +27,8 @@ s3_files_list_filter_relevant( \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with file type, basename, file size, date, full path, and presigned URL if requested. +A tibble with file type, basename, size, date, full path, +and presigned URL if requested. } \description{ Lists relevant files in an AWS S3 directory. @@ -35,6 +36,6 @@ Lists relevant files in an AWS S3 directory. \examples{ \dontrun{ s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) +s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) } } From c28d8fa0bdd90375ea1b8e4de2f48e3dc3862fae Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 14:14:42 +1000 Subject: [PATCH 10/24] more gds/s3/local filesystem tweaks --- NAMESPACE | 3 +- R/Wf.R | 123 ++++++++++++++--------- R/ica.R | 91 +++++++++-------- R/s3.R | 12 ++- R/utils.R | 47 ++++++--- man/Wf.Rd | 128 ++++++++++++++++-------- man/dr_gds_download.Rd | 22 ++-- man/gds_list_files_dir.Rd | 9 +- man/gds_list_files_filter_relevant.Rd | 42 +++++--- man/local_files_list_filter_relevant.Rd | 25 ----- man/local_list_files_dir.Rd | 22 ++++ man/local_list_files_filter_relevant.Rd | 25 +++++ man/s3_list_files_filter_relevant.Rd | 4 +- 13 files changed, 355 insertions(+), 198 deletions(-) delete mode 100644 man/local_files_list_filter_relevant.Rd create mode 100644 man/local_list_files_dir.Rd create mode 100644 man/local_list_files_filter_relevant.Rd diff --git a/NAMESPACE b/NAMESPACE index ef0ba80..8c6d3ab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,7 +55,8 @@ export(gds_list_files_filter_relevant) export(gds_volumes_list) export(grep_file) export(ica_token_validate) -export(local_files_list_filter_relevant) +export(local_list_files_dir) +export(local_list_files_filter_relevant) export(match_regex) export(multiqc_column_map_append) export(multiqc_date_fmt) diff --git a/R/Wf.R b/R/Wf.R index 317100f..80f6c08 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -1,46 +1,42 @@ -#' Workflow R6 Class +#' @title Workflow #' #' @description Workflow is a base R6 class representing a bioinformatic #' workflow run from a UMCCR workflow manager. 
#' +#' A workflow has: +#' +#' - an output directory path with all the result output files (either on GDS, S3 or +#' local filesystem) +#' - a subset of files that are of interest for ingestion +#' - tibble with full path and basename columns +#' - a set of parsers that can parse and tidy those files +#' - each parser takes a path and returns a tidy tibble +#' - a list of tidy tibbles (or a tibble with nested tibbles) +#' #' @examples -#' p1 <- system.file("extdata/portaldb_workflow_top4.rds", package = "rportal") |> -#' readRDS() |> -#' dplyr::filter(type_name == "umccrise") |> -#' dplyr::slice(1) -#' w <- Wf$new( -#' prid = p1$portal_run_id, type = p1$type_name, start = p1$start, end = p1$end, -#' status = p1$end_status, input = p1$input, output = p1$output -#' ) -#' w +#' \dontrun{ +#' p1 <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' um <- Wf$new(p, "umccrise") +#' } +#' #' @export Wf <- R6::R6Class( "Wf", public = list( - #' @field prid Portal run ID. - #' @field type Workflow type. - #' @field start Workflow start datetime. - #' @field end Workflow end datetime. - #' @field status Workflow end status. - #' @field input Workflow input JSON string. - #' @field output Workflow output JSON string. - prid = NULL, + #' @field path (`character(1)`)\cr + #' Output directory path with results. + #' @field type (`character(1)`)\cr + #' Type of workflow (e.g. umccrise, sash). + #' @field filesystem (`character(1)`)\cr + #' Filesystem of `path`. + path = NULL, type = NULL, - start = NULL, - end = NULL, - status = NULL, - input = NULL, - output = NULL, + filesystem = NULL, #' @description Create a new Workflow object. - #' @param prid Portal run ID. - #' @param type Workflow type. - #' @param start Workflow start datetime. - #' @param end Workflow end datetime. - #' @param status Workflow end status. - #' @param input Workflow input JSON string. - #' @param output Workflow output JSON string. - initialize = function(prid = NULL, type = NULL, start = NULL, end = NULL, - status = NULL, input = NULL, output = NULL) { + #' @param path Output directory path with results. + #' @param type Type of workflow. + initialize = function(path = NULL, type = NULL) { types <- c( "bcl_convert", "tso_ctdna_tumor_only", @@ -56,30 +52,67 @@ Wf <- R6::R6Class( "oncoanalyser_wgts_existing_both", "sash" ) - assertthat::assert_that( - type %in% types - ) - self$prid <- prid + assertthat::assert_that(type %in% types) + self$path <- path self$type <- type - self$start <- start - self$end <- end - self$status <- status - self$input <- input - self$output <- output + self$filesystem <- dplyr::case_when( + grepl("^gds://", path) ~ "gds", + grepl("^s3://", path) ~ "s3", + .default = "local" + ) }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "prid", self$prid, + "path", self$path, "type", self$type, - "start", as.character(self$start), - "end", as.character(self$end), - "status", self$status, + "filesystem", self$filesystem ) print(res) invisible(self) + }, + #' @description List all files under given path. + #' @param max_objects Maximum number of objects to list. 
+    #' @param ica_token ICA access token.
+    list_files = function(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) {
+      path <- self$path
+      if (self$filesystem == "gds") {
+        d <- gds_list_files_dir(gdsdir = path, page_size = max_objects, token = ica_token)
+      } else if (self$filesystem == "s3") {
+        d <- s3_list_files_dir(s3dir = path, max_objects = max_objects)
+      } else {
+        d <- local_list_files_dir(localdir = path)
+      }
+      return(d)
+    },
+    #' @description List dracarys files under given path.
+    #' @param regexes Tibble with `regex` and `fun`ction name.
+    #' @param max_objects Maximum number of objects to list.
+    #' @param ica_token ICA access token.
+    #' @param ... Passed on to the filesystem-specific listing function.
+    list_files_filter_relevant = function(regexes = NULL, max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) {
+      assertthat::assert_that(!is.null(regexes))
+      path <- self$path
+      if (self$filesystem == "gds") {
+        d <- gds_list_files_filter_relevant(gdsdir = path, token = ica_token, page_size = max_objects, regexes = regexes, ...)
+      } else if (self$filesystem == "s3") {
+        # "type", "bname", "size", "date_utc", "path"
+        d <- s3_list_files_filter_relevant(s3dir = path, max_objects = max_objects, regexes = regexes, ...)
+      } else {
+        # "type", "bname", "path"
+        d <- local_list_files_filter_relevant(path = path, regexes = regexes)
+      }
+      d
+    },
+    download_files = function(ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) {
+      # TODO: add envvar checker
+      path <- self$path
+      if (self$filesystem == "gds") {
+        d <- dr_gds_download(gdsdir = path, token = ica_token)
+      } else if (self$filesystem == "s3") {
+        d <- s3_download_files(s3dir = path)
+      } else {
+        d <- local_download_files(localdir = path)
+      }
+    }
   ) # end public
 )
diff --git a/R/ica.R b/R/ica.R
index 059377e..84b9350 100644
--- a/R/ica.R
+++ b/R/ica.R
@@ -1,4 +1,4 @@
-#' List files in ICAv1 GDS Directory
+#' List Files in ICAv1 GDS Directory
 #'
 #' Lists files in a GDS directory.
 #'
@@ -12,7 +12,8 @@
 #' @param recursive Should files be returned recursively _in and under_ the specified
 #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
 #'
-#' @return A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL.
+#' @return A tibble with file ID, basename, size, last modified timestamp,
+#' full GDS path, and presigned URL if requested.
 #' @examples
 #' \dontrun{
 #' gdsdir <- file.path(
 #'   "gds://production/analysis_data/SBJ00699/umccrise",
 #'   "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage"
 #' )
 #' token <- ica_token_validate()
 #' page_size <- 11
-#' include_url <- TRUE
+#' include_url <- F
 #' page_token <- NULL
 #' no_recurse <- TRUE
 #' recursive <- NULL
 #' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive)
 #' }
 #' @export
-gds_list_files_dir <- function(gdsdir, token, page_size = NULL, include_url = FALSE,
-                               no_recurse = TRUE, page_token = NULL, recursive = NULL) {
+gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL,
+                               include_url = FALSE, no_recurse = TRUE, page_token = NULL,
+                               recursive = NULL) {
   assertthat::assert_that(is.logical(no_recurse), is.logical(include_url))
   assertthat::assert_that(is.null(recursive) || is.logical(recursive))
   token <- ica_token_validate(token)
@@ -108,27 +110,30 @@ gds_list_files_dir <- function(gdsdir, token, page_size = NULL, include_url = FA
 #'
 #' Lists relevant files in a GDS directory.
 #'
-#' @param gdsdir GDS directory.
-#' @param token ICA access token.
+#' @inheritParams gds_list_files_dir
 #' @param pattern Pattern to further filter the returned file type tibble.
-#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE).
-#' @param page_size Page size (def: 100). #' @param regexes Tibble with `regex` and `fun`ction name. +#' @param ... Passed into `gds_list_files_dir`. #' -#' @return A tibble with file type, basename, size, file_id, full path, +#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, #' and presigned URL if requested. +#' @examples #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' gds_list_files_filter_relevant(gdsdir) #' } #' @export -gds_list_files_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, include_url = FALSE, - page_size = 100, regexes = DR_FILE_REGEX, ...) { +gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, + token = Sys.getenv("ICA_ACCESS_TOKEN"), + page_size = 100, include_url = FALSE, + no_recurse = TRUE, page_token = NULL, + recursive = NULL) { pattern <- pattern %||% ".*" # keep all recognisable files by default - cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") + assertthat::assert_that(all(colnames(regexes) == c("regex", "fun"))) + cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") d <- dracarys::gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, + no_recurse = no_recurse, page_token = page_token, recursive = recursive ) |> dplyr::rowwise() |> dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> @@ -142,43 +147,49 @@ gds_list_files_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCES #' #' Download only GDS files that can be processed by dracarys. #' -#' @param gdsdir Full path to GDS directory. -#' @param outdir Path to output directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 100). -#' @param pattern Pattern to further filter the returned file type tibble. +#' @inheritParams gds_list_files_dir +#' @inheritParams gds_list_files_filter_relevant +#' @param outdir Local output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param regexes Tibble with regex and function name. -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory (TRUE), or _only directly in_ the specified GDS directory (FALSE) (def: TRUE). 
+#' @examples
+#' \dontrun{
+#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
+#' outdir <- sub("gds:/", "~/icav1/g", gdsdir)
+#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
+#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F)
+#' }
 #'
 #' @export
-dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NULL,
-                            dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL) {
+dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"),
+                            pattern = NULL, page_size = 100, dryrun = FALSE,
+                            regexes = DR_FILE_REGEX, recursive = NULL) {
   e <- emojifont::emoji
   fs::dir_create(outdir)
-  d <- gds_list_files_dir(
-    gdsdir = gdsdir, token = token, page_size = page_size,
-    no_recurse = FALSE, recursive = recursive
-  ) |>
-    dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |>
-    dplyr::select("file_id", "type", "size", "path", "bname")
+  d <- gds_list_files_filter_relevant(
+    gdsdir = gdsdir, pattern = pattern, regexes = regexes,
+    token = token, page_size = page_size, include_url = FALSE,
+    no_recurse = FALSE, page_token = NULL,
+    recursive = recursive
+  )
 
-  # download recognisable dracarys files to outdir/{bname}
-  pattern <- pattern %||% ".*" # keep all recognisable files
-  d_filt <- d |>
-    dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
-    dplyr::mutate(out = file.path(outdir, .data$bname))
+  d <- d |>
+    dplyr::mutate(
+      localpath = file.path(outdir, .data$bname),
+      gdspath = .data$path
+    ) |>
+    dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath")
   if (!dryrun) {
     cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}")
-    d_filt |>
+    d |>
       dplyr::rowwise() |>
-      dplyr::mutate(out_dl = gds_file_download_api(.data$file_id, .data$out, token))
+      dplyr::mutate(
+        dl = gds_file_download_api(gds_fileid = .data$file_id, out_file = .data$localpath, token = token)
+      )
   } else {
     cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}")
-    d_filt |>
-      dplyr::select("path", "type", "size") |>
+    d |>
+      dplyr::select("type", "bname", "size", "gdspath", localpath2be = "localpath") |>
       as.data.frame() |>
       print()
   }
 }
diff --git a/R/s3.R b/R/s3.R
index fcd7802..fd67d88 100644
--- a/R/s3.R
+++ b/R/s3.R
@@ -55,8 +55,8 @@
 #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated).
 #' @param regexes Tibble with `regex` and `fun`ction name.
 #'
-#' @return A tibble with file type, basename, size, date, full path,
-#' and presigned URL if requested.
+#' @return A tibble with file type, basename, size, last modified timestamp,
+#' full path, and presigned URL if requested.
#' @examples #' \dontrun{ #' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" @@ -72,6 +72,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 return(d_all) } pattern <- pattern %||% ".*" # keep all recognisable files by default + cols_sel <- c("type", "bname", "size", "lastmodified", "path") d <- d_all |> dplyr::rowwise() |> dplyr::mutate( @@ -79,7 +80,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("type", "bname", "size", "lastmodified", "path") + dplyr::select(dplyr::all_of(cols_sel)) if (presign) { if (nrow(d) == 0) { @@ -91,7 +92,8 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 dplyr::mutate(presigned_url = s3_file_presignedurl( client = s3_client, s3path = .data$path, expiry_seconds = expiry_sec )) |> - dplyr::ungroup() + dplyr::ungroup() |> + dplyr::select(dplyr::all_of(c(cols_sel, "presigned_url"))) } d } @@ -146,7 +148,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") d |> - dplyr::select("path", "type", "size") |> + dplyr::select("type", "bname", "size", "path") |> as.data.frame() |> print() } diff --git a/R/utils.R b/R/utils.R index 2d39dc7..99c6550 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,27 +1,49 @@ +#' List Files in Local Directory +#' +#' Lists files in a local directory. +#' +#' @param localdir Path to local directory. +#' +#' @return A tibble with file basename, size, last modification timestamp +#' and full path. +#' @examples +#' localdir <- system.file("R", package = "dracarys") +#' x <- local_list_files_dir(localdir) +#' @testexamples +#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +#' @export +local_list_files_dir <- function(localdir) { + fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> + dplyr::mutate( + bname = basename(.data$path), + lastmodified = .data$modification_time + ) |> + dplyr::select("bname", "size", "lastmodified", "path") +} + #' List Relevant Files In Local Directory #' #' Lists relevant files in a local directory. #' #' @param path Path to local directory. -#' @param regexes Tibble with `regex` and `fun`ction name. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' @return A tibble with file type, basename, size, last modified timestamp, and +#' path. 
#' #' @examples -#' \dontrun{ -#' path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" -#' local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) -#' } +#' path <- system.file("extdata/tso", package = "dracarys") +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") +#' x <- local_list_files_filter_relevant(path, regexes) +#' @testexamples +#' expect_equal(nrow(x), 1) #' @export -local_files_list_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { - fs::dir_ls(path = path, recurse = TRUE, type = "file") |> - tibble::as_tibble_col(column_name = "path") |> +local_list_files_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { + local_list_files_dir(localdir = path) |> dplyr::mutate( - bname = basename(.data$path), type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) ) |> dplyr::filter(!is.na(.data$type)) |> - dplyr::select("type", "bname", "path") + dplyr::select("type", "bname", "size", "lastmodified", "path") } #' Print current timestamp for logging @@ -238,7 +260,6 @@ grep_file <- function(path = ".", regexp) { return(x) } - #' @noRd dummy1 <- function() { # Solves R CMD check: Namespaces in Imports field not imported from diff --git a/man/Wf.Rd b/man/Wf.Rd index d5c99ae..ce44791 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -2,38 +2,47 @@ % Please edit documentation in R/Wf.R \name{Wf} \alias{Wf} -\title{Workflow R6 Class} +\title{Workflow} \description{ Workflow is a base R6 class representing a bioinformatic workflow run from a UMCCR workflow manager. + +A workflow has: +\itemize{ +\item an output directory path with all the result output files (either on GDS, S3 or +local filesystem) +\item a subset of files that are of interest for ingestion +\itemize{ +\item tibble with full path and basename columns +} +\item a set of parsers that can parse and tidy those files +\itemize{ +\item each parser takes a path and returns a tidy tibble +} +\item a list of tidy tibbles (or a tibble with nested tibbles) +} } \examples{ -p1 <- system.file("extdata/portaldb_workflow_top4.rds", package = "rportal") |> - readRDS() |> - dplyr::filter(type_name == "umccrise") |> - dplyr::slice(1) -w <- Wf$new( - prid = p1$portal_run_id, type = p1$type_name, start = p1$start, end = p1$end, - status = p1$end_status, input = p1$input, output = p1$output -) -w +\dontrun{ +p1 <- "~/icav1/g/production/analysis_data" +p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um <- Wf$new(p, "umccrise") + +} + + } \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{prid}}{Portal run ID.} - -\item{\code{type}}{Workflow type.} - -\item{\code{start}}{Workflow start datetime.} +\item{\code{path}}{(\code{character(1)})\cr +Output directory path with results.} -\item{\code{end}}{Workflow end datetime.} +\item{\code{type}}{(\code{character(1)})\cr +Type of workflow (e.g. umccrise, sash).} -\item{\code{status}}{Workflow end status.} - -\item{\code{input}}{Workflow input JSON string.} - -\item{\code{output}}{Workflow output JSON string.} +\item{\code{filesystem}}{(\code{character(1)})\cr +Filesystem of \code{path}.} } \if{html}{\out{
}} } @@ -42,6 +51,9 @@ w \itemize{ \item \href{#method-Wf-new}{\code{Wf$new()}} \item \href{#method-Wf-print}{\code{Wf$print()}} +\item \href{#method-Wf-list_files}{\code{Wf$list_files()}} +\item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} +\item \href{#method-Wf-download_files}{\code{Wf$download_files()}} \item \href{#method-Wf-clone}{\code{Wf$clone()}} } } @@ -51,33 +63,15 @@ w \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$new( - prid = NULL, - type = NULL, - start = NULL, - end = NULL, - status = NULL, - input = NULL, - output = NULL -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, type = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{prid}}{Portal run ID.} - -\item{\code{type}}{Workflow type.} - -\item{\code{start}}{Workflow start datetime.} - -\item{\code{end}}{Workflow end datetime.} +\item{\code{path}}{Output directory path with results.} -\item{\code{status}}{Workflow end status.} - -\item{\code{input}}{Workflow input JSON string.} - -\item{\code{output}}{Workflow output JSON string.} +\item{\code{type}}{Type of workflow.} } \if{html}{\out{
}} } @@ -98,6 +92,56 @@ Print details about the Workflow. } \if{html}{\out{}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-list_files}{}}} +\subsection{Method \code{list_files()}}{ +List all files under given path. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$list_files(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}}
+\describe{
+\item{\code{max_objects}}{Maximum number of objects to list.}
+
+\item{\code{ica_token}}{ICA access token.}
+}
+\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( + regexes = NULL, + max_objects = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}}
+\describe{
+\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.}
+
+\item{\code{max_objects}}{Maximum number of objects to list.}
+
+\item{\code{ica_token}}{ICA access token.}
+
+\item{\code{...}}{Passed on to the filesystem-specific listing function.}
+}
+\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} +\subsection{Method \code{download_files()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$download_files(ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 087a7ed..0d09aa7 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -7,9 +7,9 @@ dr_gds_download( gdsdir, outdir, - token, - page_size = 100, + token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, + page_size = 100, dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL @@ -18,22 +18,30 @@ dr_gds_download( \arguments{ \item{gdsdir}{Full path to GDS directory.} -\item{outdir}{Path to output directory.} +\item{outdir}{Local output directory.} \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{page_size}{Page size (def: 100).} - \item{pattern}{Pattern to further filter the returned file type tibble.} +\item{page_size}{Page size (def: 10 via ICA API).} + \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} -\item{regexes}{Tibble with regex and function name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} \item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory (TRUE), or \emph{only directly in} the specified GDS directory (FALSE) (def: TRUE).} +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \description{ Download only GDS files that can be processed by dracarys. } +\examples{ +\dontrun{ +gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +outdir <- sub("gds:/", "~/icav1/g", gdsdir) +dr_gds_download(gdsdir, outdir) +} + +} diff --git a/man/gds_list_files_dir.Rd b/man/gds_list_files_dir.Rd index 2a80f47..7d52c1d 100644 --- a/man/gds_list_files_dir.Rd +++ b/man/gds_list_files_dir.Rd @@ -2,11 +2,11 @@ % Please edit documentation in R/ica.R \name{gds_list_files_dir} \alias{gds_list_files_dir} -\title{List files in ICAv1 GDS Directory} +\title{List Files in ICAv1 GDS Directory} \usage{ gds_list_files_dir( gdsdir, - token, + token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, include_url = FALSE, no_recurse = TRUE, @@ -32,7 +32,8 @@ without recursing further down the list using .} GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \value{ -A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL. +A tibble with file ID, basename, size, last modified timestamp, +full GDS path, and presigned URL if requested. } \description{ Lists files in a GDS directory. @@ -45,7 +46,7 @@ gdsdir <- file.path( ) token <- ica_token_validate() page_size <- 11 -include_url <- TRUE +include_url <- F page_token <- NULL no_recurse <- TRUE recursive <- NULL diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd index 4ae0612..ec57909 100644 --- a/man/gds_list_files_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -6,35 +6,49 @@ \usage{ gds_list_files_filter_relevant( gdsdir, - token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, - include_url = FALSE, - page_size = 100, regexes = DR_FILE_REGEX, - ... 
+ token = Sys.getenv("ICA_ACCESS_TOKEN"), + page_size = 100, + include_url = FALSE, + no_recurse = TRUE, + page_token = NULL, + recursive = NULL ) } \arguments{ -\item{gdsdir}{GDS directory.} - -\item{token}{ICA access token.} +\item{gdsdir}{Full path to GDS directory.} \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE).} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} -\item{page_size}{Page size (def: 100).} +\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +\item{page_size}{Page size (def: 10 via ICA API).} + +\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).} + +\item{no_recurse}{Do not recurse through the file list i.e. just give the first items +without recursing further down the list using .} + +\item{page_token}{Page token (def: NULL). Used internally for recursion.} + +\item{recursive}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{...}{Passed into \code{gds_list_files_dir}.} } \value{ -A tibble with file type, basename, size, file_id, full path, +A tibble with file type, basename, size, last modified timestamp, file_id, full path, and presigned URL if requested. +} +\description{ +Lists relevant files in a GDS directory. +} +\examples{ \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" gds_list_files_filter_relevant(gdsdir) } } -\description{ -Lists relevant files in a GDS directory. -} diff --git a/man/local_files_list_filter_relevant.Rd b/man/local_files_list_filter_relevant.Rd deleted file mode 100644 index a54407f..0000000 --- a/man/local_files_list_filter_relevant.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{local_files_list_filter_relevant} -\alias{local_files_list_filter_relevant} -\title{List Relevant Files In Local Directory} -\usage{ -local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) -} -\arguments{ -\item{path}{Path to local directory.} - -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} -} -\value{ -A tibble with type, bname, size, file_id, path, and presigned URL. -} -\description{ -Lists relevant files in a local directory. -} -\examples{ -\dontrun{ -path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" -local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) -} -} diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd new file mode 100644 index 0000000..771aa9b --- /dev/null +++ b/man/local_list_files_dir.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{local_list_files_dir} +\alias{local_list_files_dir} +\title{List Files in Local Directory} +\usage{ +local_list_files_dir(localdir) +} +\arguments{ +\item{localdir}{Path to local directory.} +} +\value{ +A tibble with file basename, size, last modification timestamp +and full path. +} +\description{ +Lists files in a local directory. 
+} +\examples{ +localdir <- system.file("R", package = "dracarys") +x <- local_list_files_dir(localdir) +} diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd new file mode 100644 index 0000000..4e17351 --- /dev/null +++ b/man/local_list_files_filter_relevant.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{local_list_files_filter_relevant} +\alias{local_list_files_filter_relevant} +\title{List Relevant Files In Local Directory} +\usage{ +local_list_files_filter_relevant(path, regexes = DR_FILE_REGEX) +} +\arguments{ +\item{path}{Path to local directory.} + +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} +} +\value{ +A tibble with file type, basename, size, last modified timestamp, and +path. +} +\description{ +Lists relevant files in a local directory. +} +\examples{ +path <- system.file("extdata/tso", package = "dracarys") +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcFile") +x <- local_list_files_filter_relevant(path, regexes) +} diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd index 1904ea5..8839bde 100644 --- a/man/s3_list_files_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -27,8 +27,8 @@ s3_list_files_filter_relevant( \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with file type, basename, size, date, full path, -and presigned URL if requested. +A tibble with file type, basename, size, last modified timestamp, +full path, and presigned URL if requested. } \description{ Lists relevant files in an AWS S3 directory. From c0d57a1c4878beb2a679ea1412374c3d664ceb87 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 15:02:32 +1000 Subject: [PATCH 11/24] more gds/s3 filesystem tweaks --- R/ica.R | 21 ++++++---- R/s3.R | 59 +++++++++++++++------------ man/Wf.Rd | 2 - man/dr_gds_download.Rd | 3 +- man/dr_s3_download.Rd | 16 ++++---- man/gds_list_files_filter_relevant.Rd | 3 +- man/s3_list_files_filter_relevant.Rd | 17 ++++---- 7 files changed, 65 insertions(+), 56 deletions(-) diff --git a/R/ica.R b/R/ica.R index 84b9350..3589688 100644 --- a/R/ica.R +++ b/R/ica.R @@ -113,12 +113,11 @@ gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), p #' @inheritParams gds_list_files_dir #' @param pattern Pattern to further filter the returned file type tibble. #' @param regexes Tibble with `regex` and `fun`ction name. -#' @param ... Passed into `gds_list_files_dir`. -#' #' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, #' and presigned URL if requested. 
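As context for the `regexes` argument threaded through all of these listers: the tibble maps a filename regex to the name of the parser (`fun`) that can ingest it, and `match_regex` returns that name for a matching basename, or `NA` otherwise. A minimal sketch of that step (the basenames are made up, and `match_regex` may be unexported, hence the `:::`):

```r
regexes <- tibble::tribble(
  ~regex, ~fun,
  "multiqc_data\\.json$", "MultiqcFile",
  "-qc_summary\\.tsv\\.gz$", "UmQcSumFile"
)
bnames <- c("multiqc_data.json", "PRJ240001-qc_summary.tsv.gz", "notes.txt")
# each basename gets the matching parser name, or NA if nothing matches;
# the filter_relevant functions then drop the NA rows
purrr::map_chr(bnames, \(x) dracarys:::match_regex(x, regexes))
#> [1] "MultiqcFile" "UmQcSumFile" NA
```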
#' @examples #' \dontrun{ +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' gds_list_files_filter_relevant(gdsdir) #' } @@ -172,30 +171,34 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN no_recurse = FALSE, page_token = NULL, recursive = recursive ) - d <- d |> dplyr::mutate( localpath = file.path(outdir, .data$bname), gdspath = .data$path ) |> dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") + # download recognisable dracarys files to outdir/{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") - d |> + res <- d |> dplyr::rowwise() |> dplyr::mutate( - dl = gds_file_download_api(gds_fileid = .data$file_id, out_file = .data$out, token = token) - ) + dl = gds_file_download_api( + gds_fileid = .data$file_id, out_file = .data$localpath, token = token + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") + return(res) } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") d |> - dplyr::select("type", "bname", "size", "gdspath", localpath2be = "localpath") |> + dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> as.data.frame() |> print() } } - #' GDS File Presigned URL #' #' Returns presigned URL of given GDS file. @@ -278,7 +281,7 @@ gds_file_download_api <- function(gds_fileid, out_file, token) { # keep quiet instead of logging presigned urls status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) assertthat::assert_that(status_code == 0) - out_file + normalizePath(out_file) } #' GDS File Download via CLI diff --git a/R/s3.R b/R/s3.R index fd67d88..1bf8f4a 100644 --- a/R/s3.R +++ b/R/s3.R @@ -48,24 +48,26 @@ s3_list_files_dir <- function(s3dir, max_objects = 1000) { #' #' Lists relevant files in an AWS S3 directory. #' -#' @param s3dir S3 directory. +#' @inheritParams s3_list_files_dir #' @param pattern Pattern to further filter the returned file type tibble. -#' @param max_objects The total number of objects to return. +#' @param regexes Tibble with `regex` and `fun`ction name. #' @param presign Include presigned URLs (def: FALSE). #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). -#' @param regexes Tibble with `regex` and `fun`ction name. #' #' @return A tibble with file type, basename, size, last modified timestamp, #' full path, and presigned URL if requested. 
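A quick sketch of the reordered S3 lister signature in use (the paths reuse the sash example from the docs below; note the `max_objects <= 1000` assertion, and that presigned URLs are only generated when `presign = TRUE`):

```r
p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220"
s3dir <- file.path(p1, p2)
regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
d <- s3_list_files_filter_relevant(
  s3dir = s3dir, regexes = regexes, max_objects = 300,
  presign = TRUE, expiry_sec = 3600 # presigned URLs valid for one hour
)
```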
#' @examples #' \dontrun{ -#' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' s3_list_files_filter_relevant(s3dir = s3dir, regexes = regexes, max_objects = 300) #' } #' @export -s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, - presign = FALSE, expiry_sec = 3600, - regexes = DR_FILE_REGEX) { +s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, + regexes = DR_FILE_REGEX, max_objects = 100, + presign = FALSE, expiry_sec = 3600) { assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) d_all <- s3_list_files_dir(s3dir = s3dir, max_objects = max_objects) if (nrow(d_all) == 0) { @@ -102,21 +104,19 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 #' #' Download only S3 files that can be processed by dracarys. #' -#' @param s3dir Full path to S3 directory. +#' @inheritParams s3_list_files_dir +#' @inheritParams s3_list_files_filter_relevant #' @param outdir Path to output directory. -#' @param max_objects Maximum objects returned in file listing. -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param regexes Tibble with regex and function name. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @examples #' \dontrun{ -#' s3dir <- file.path( -#' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", -#' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -#' ) +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") #' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) #' } #' @export dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, @@ -125,30 +125,35 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, e <- emojifont::emoji fs::dir_create(outdir) d <- s3_list_files_filter_relevant( - s3dir = s3dir, pattern = NULL, max_objects = max_objects, presign = FALSE, regexes = regexes + s3dir = s3dir, pattern = NULL, regexes = regexes, + max_objects = max_objects, presign = FALSE ) d <- d |> - dplyr::select("type", "size", "path", "bname") |> - dplyr::mutate(out = file.path(outdir, .data$bname)) - + dplyr::mutate( + localpath = file.path(outdir, .data$bname), + s3path = .data$path + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") # download recognisable dracarys files to outdir/{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") d |> dplyr::rowwise() |> dplyr::mutate( - s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), - s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path), + s3bucket = sub("s3://(.*?)/.*", "\\1", .data$s3path), + s3key = sub("s3://(.*?)/(.*)", "\\2", .data$s3path), dl = list( s3$download_file( - Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out + 
Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$localpath ) - ) - ) + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") d |> - dplyr::select("type", "bname", "size", "path") |> + dplyr::select("type", "bname", "size", "lastmodified", "s3path", localpath2be = "localpath") |> as.data.frame() |> print() } diff --git a/man/Wf.Rd b/man/Wf.Rd index ce44791..a99ffbb 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -27,10 +27,8 @@ local filesystem) p1 <- "~/icav1/g/production/analysis_data" p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um <- Wf$new(p, "umccrise") - } - } \section{Public fields}{ \if{html}{\out{
}} diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 0d09aa7..14a25fd 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -41,7 +41,8 @@ Download only GDS files that can be processed by dracarys. \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" outdir <- sub("gds:/", "~/icav1/g", gdsdir) -dr_gds_download(gdsdir, outdir) +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) } } diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index d572c10..6619188 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -14,15 +14,15 @@ dr_s3_download( ) } \arguments{ -\item{s3dir}{Full path to S3 directory.} +\item{s3dir}{S3 directory.} \item{outdir}{Path to output directory.} -\item{max_objects}{Maximum objects returned in file listing.} +\item{max_objects}{Maximum objects returned.} \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{regexes}{Tibble with regex and function name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} @@ -32,11 +32,11 @@ Download only S3 files that can be processed by dracarys. } \examples{ \dontrun{ -s3dir <- file.path( - "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", - "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -) +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +s3dir <- file.path(p1, p2) +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) } } diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd index ec57909..88a6748 100644 --- a/man/gds_list_files_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -36,8 +36,6 @@ without recursing further down the list using .} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} - -\item{...}{Passed into \code{gds_list_files_dir}.} } \value{ A tibble with file type, basename, size, last modified timestamp, file_id, full path, @@ -48,6 +46,7 @@ Lists relevant files in a GDS directory. 
} \examples{ \dontrun{ +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" gds_list_files_filter_relevant(gdsdir) } diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd index 8839bde..bc55db3 100644 --- a/man/s3_list_files_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -7,10 +7,10 @@ s3_list_files_filter_relevant( s3dir, pattern = NULL, + regexes = DR_FILE_REGEX, max_objects = 100, presign = FALSE, - expiry_sec = 3600, - regexes = DR_FILE_REGEX + expiry_sec = 3600 ) } \arguments{ @@ -18,13 +18,13 @@ s3_list_files_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{max_objects}{The total number of objects to return.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} + +\item{max_objects}{Maximum objects returned.} \item{presign}{Include presigned URLs (def: FALSE).} \item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated).} - -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ A tibble with file type, basename, size, last modified timestamp, @@ -35,7 +35,10 @@ Lists relevant files in an AWS S3 directory. } \examples{ \dontrun{ -s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +s3dir <- file.path(p1, p2) +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +s3_list_files_filter_relevant(s3dir = s3dir, regexes = regexes, max_objects = 300) } } From 61b4bf2747158a29d6433c9544da01b7f54e9559 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 17:54:22 +1000 Subject: [PATCH 12/24] more gds/s3/local filesystem tweaks --- NAMESPACE | 2 +- R/Wf.R | 84 ++++-- R/fs_icav1.R | 258 +++++++++++++++++ R/fs_local.R | 46 +++ R/{s3.R => fs_s3.R} | 0 R/ica.R | 266 +----------------- R/utils.R | 48 ---- man/Wf.Rd | 60 +++- man/dr_gds_download.Rd | 4 +- man/dr_s3_download.Rd | 2 +- man/gds_file_download_api.Rd | 2 +- ...e_download.Rd => gds_file_download_cli.Rd} | 8 +- man/gds_file_presignedurl.Rd | 2 +- man/gds_list_files_dir.Rd | 2 +- man/gds_list_files_filter_relevant.Rd | 4 +- man/local_list_files_dir.Rd | 2 +- man/local_list_files_filter_relevant.Rd | 10 +- man/s3_file_presignedurl.Rd | 2 +- man/s3_list_files_dir.Rd | 2 +- man/s3_list_files_filter_relevant.Rd | 2 +- man/s3_search.Rd | 2 +- 21 files changed, 440 insertions(+), 368 deletions(-) create mode 100644 R/fs_icav1.R create mode 100644 R/fs_local.R rename R/{s3.R => fs_s3.R} (100%) rename man/{gds_file_download.Rd => gds_file_download_cli.Rd} (61%) diff --git a/NAMESPACE b/NAMESPACE index 8c6d3ab..aa3957e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,8 +46,8 @@ export(dr_output_format_valid) export(dr_s3_download) export(empty_tbl) export(file_regex_getter) -export(gds_file_download) export(gds_file_download_api) +export(gds_file_download_cli) export(gds_file_presignedurl) export(gds_files_list_fastq) export(gds_list_files_dir) diff --git a/R/Wf.R b/R/Wf.R index 80f6c08..0ddc2ee 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -25,19 +25,19 @@ Wf <- R6::R6Class( "Wf", public = list( #' @field path (`character(1)`)\cr - #' Output directory path with results. 
- #' @field type (`character(1)`)\cr - #' Type of workflow (e.g. umccrise, sash). + #' Path to directory with raw workflow results (from GDS, S3, or local filesystem). + #' @field wname (`character(1)`)\cr + #' Name of workflow (e.g. umccrise, sash). #' @field filesystem (`character(1)`)\cr #' Filesystem of `path`. path = NULL, - type = NULL, + wname = NULL, filesystem = NULL, #' @description Create a new Workflow object. #' @param path Output directory path with results. - #' @param type Type of workflow. - initialize = function(path = NULL, type = NULL) { - types <- c( + #' @param wname Name of workflow. + initialize = function(path = NULL, wname = NULL) { + wnames <- c( "bcl_convert", "tso_ctdna_tumor_only", "wgs_alignment_qc", @@ -52,9 +52,9 @@ Wf <- R6::R6Class( "oncoanalyser_wgts_existing_both", "sash" ) - assertthat::assert_that(type %in% types) + assertthat::assert_that(wname %in% wnames) self$path <- path - self$type <- type + self$wname <- wname self$filesystem <- dplyr::case_when( grepl("^gds://", path) ~ "gds", grepl("^s3://", path) ~ "s3", @@ -67,52 +67,86 @@ Wf <- R6::R6Class( res <- tibble::tribble( ~var, ~value, "path", self$path, - "type", self$type, + "wname", self$wname, "filesystem", self$filesystem ) print(res) invisible(self) }, #' @description List all files under given path. - #' @param max_objects Maximum number of objects to list. - list_files = function(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) { + #' @param max_files Maximum number of files to list. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to `gds_list_files_dir` function. + list_files = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { path <- self$path if (self$filesystem == "gds") { - d <- gds_list_files_dir(gdsdir = path, page_size = max_objects, token = ica_token) + d <- gds_list_files_dir( + gdsdir = path, token = ica_token, page_size = max_files, ... + ) } else if (self$filesystem == "s3") { - d <- s3_list_files_dir(s3dir = path, max_objects = max_objects) + d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { d <- local_list_files_dir(localdir = path) } return(d) }, #' @description List dracarys files under given path - #' @param page_size Page size #' @param regexes Tibble with `regex` and `fun`ction name. - list_files_filter_relevant = function(regexes = NULL, max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + #' @param max_files Maximum number of files to list. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(regexes = NULL, + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { assertthat::assert_that(!is.null(regexes)) path <- self$path if (self$filesystem == "gds") { - d <- gds_list_files_filter_relevant(gdsdir = path, token = ica_token, page_size = max_objects, regexes = regexes, ...) + d <- gds_list_files_filter_relevant( + gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... + ) } else if (self$filesystem == "s3") { - # "type", "bname", "size", "date_utc", "path" - d <- s3_list_files_filter_relevant(s3dir = path, max_objects = max_objects, regexes = regexes, ...) + d <- s3_list_files_filter_relevant( + s3dir = path, regexes = regexes, max_objects = max_files, ... 
+        )
       } else {
-        # "type", "bname", "path"
-        d <- local_list_files_filter_relevant(path = path, regexes = regexes)
+        d <- local_list_files_filter_relevant(localdir = path, regexes = regexes)
       }
       d
     },
-    download_files = function(ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) {
+    #' @description Download files from GDS/S3 to local filesystem.
+    #' @param outdir Path to output directory.
+    #' @param regexes Tibble with `regex` and `fun`ction name.
+    #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var).
+    #' @param max_files Maximum number of files to list.
+    #' @param dryrun If TRUE, just list the files that will be downloaded (don't
+    #' download them).
+    #' @param recursive Should files be returned recursively _in and under_ the specified
+    #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
+    download_files = function(outdir, regexes = NULL,
+                              ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
+                              max_files = 1000, dryrun = FALSE, recursive = NULL) {
       # TODO: add envvar checker
       path <- self$path
+      assertthat::assert_that(!is.null(regexes))
       if (self$filesystem == "gds") {
-        d <- dr_gds_download(gdsdir = path, token = ica_token)
+        d <- dr_gds_download(
+          gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token,
+          page_size = max_files, dryrun = dryrun, recursive = recursive
+        )
+        self$filesystem <- "local"
+        self$path <- outdir
       } else if (self$filesystem == "s3") {
-        d <- s3_download_files(s3dir = path)
+        d <- dr_s3_download(
+          s3dir = path, outdir = outdir, regexes = regexes,
+          max_objects = max_files, dryrun = dryrun
+        )
+        self$filesystem <- "local"
+        self$path <- outdir
       } else {
-        d <- local_download_files(localdir = path)
+        d <- self$list_files_filter_relevant(regexes = regexes)
       }
+      return(d)
     }
   ) # end public
)
diff --git a/R/fs_icav1.R b/R/fs_icav1.R
new file mode 100644
index 0000000..8f36e81
--- /dev/null
+++ b/R/fs_icav1.R
@@ -0,0 +1,258 @@
+#' List Files in ICAv1 GDS Directory
+#'
+#' Lists files in a GDS directory.
+#'
+#' @param gdsdir Full path to GDS directory.
+#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var).
+#' @param page_size Page size (def: 10 via ICA API).
+#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).
+#' @param page_token Page token (def: NULL). Used internally for recursion.
+#' @param no_recurse Do not recurse through the file list i.e. just give the first items
+#' without recursing further down the list.
+#' @param recursive Should files be returned recursively _in and under_ the specified
+#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
+#'
+#' @return A tibble with file ID, basename, size, last modified timestamp,
+#' full GDS path, and presigned URL if requested.
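One note on the lister defined here: results come back from the ICA v1 files API one page at a time, and when `no_recurse = FALSE` the function follows each response's `nextPageToken` and row-binds the pages, so the full listing is returned. A sketch, reusing the umccrise GDS path from the docs:

```r
d <- gds_list_files_dir(
  gdsdir = "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565",
  token = Sys.getenv("ICA_ACCESS_TOKEN"),
  page_size = 100,
  no_recurse = FALSE # keep following nextPageToken until the listing is exhausted
)
nrow(d) # all files under the directory, not just the first page
```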
+#' @examples +#' \dontrun{ +#' gdsdir <- file.path( +#' "gds://production/analysis_data/SBJ00699/umccrise", +#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" +#' ) +#' token <- ica_token_validate() +#' page_size <- 11 +#' include_url <- F +#' page_token <- NULL +#' no_recurse <- TRUE +#' recursive <- NULL +#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +#' } +#' @export +gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, + include_url = FALSE, no_recurse = TRUE, page_token = NULL, + recursive = NULL) { + assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) + assertthat::assert_that(is.null(recursive) || is.logical(recursive)) + token <- ica_token_validate(token) + assertthat::assert_that(grepl("^gds://", gdsdir)) + gdsdir_original <- gdsdir + if (!grepl("/$", gdsdir)) { + gdsdir <- glue("{gdsdir}/") + } + base_url <- "https://aps2.platform.illumina.com/v1" + volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) + path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) + page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) + query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") + if (include_url) { + query_url <- glue("{query_url}&include=PresignedUrl") + } + if (!is.null(page_token)) { + query_url <- glue("{query_url}&pageToken={page_token}") + } + if (!is.null(recursive)) { + # without specifying recursive, it's true by default + recursive <- ifelse(recursive, "true", "false") + query_url <- glue("{query_url}&recursive={recursive}") + } + query_res <- httr::GET( + query_url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) + if (j[["itemCount"]] == 0) { + if (gds_likely_file(gdsdir_original)) { + cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") + } + # if there is a nextPageToken then abort, else continue + if (!is.null(j[["nextPageToken"]])) { + msg <- paste0( + "{date_log()} ERROR: ", + "No GDS files listed in the input directory. Please confirm you can ", + "access the following GDS input directory with your token: ", + "{.file {gdsdir_original}}" + ) + cli::cli_abort(msg) + } + } # endif + d <- j[["items"]] |> + purrr::map(\(x) c( + file_id = x[["id"]], + path = x[["path"]], + size = x[["sizeInBytes"]], + lastmodified = x[["timeModified"]], + presigned_url = x[["presignedUrl"]] + )) |> + dplyr::bind_rows() + if (nrow(d) == 0) { + # We've iterated through all available items, and the next page has 0 items. + # So dplyr::bind_rows(d, NULL) will return d. + return(NULL) + } + res <- d |> + dplyr::mutate( + size = fs::as_fs_bytes(.data$size), + bname = basename(.data$path), + path = glue("gds://{volname}{.data$path}") + ) |> + dplyr::select(dplyr::any_of(c("bname", "size", "lastmodified", "file_id", "path", "presigned_url"))) + + if (!is.null(j[["nextPageToken"]]) && !no_recurse) { + res2 <- gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = NULL, + include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], + recursive = NULL + ) + res <- dplyr::bind_rows(res, res2) + } + res +} + +#' List Relevant Files In ICAv1 GDS Directory +#' +#' Lists relevant files in a GDS directory. 
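Worth noting for the example that follows: since `regexes` defaults to `DR_FILE_REGEX`, a custom tibble only takes effect if it is actually passed in, and `pattern` further filters the resulting `type` column. A sketch:

```r
regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
# pass the custom regexes explicitly; optionally post-filter on the type name
d <- gds_list_files_filter_relevant(gdsdir, regexes = regexes, pattern = "Multiqc")
```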
+#' +#' @inheritParams gds_list_files_dir +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, +#' and presigned URL if requested. +#' @examples +#' \dontrun{ +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' gds_list_files_filter_relevant(gdsdir) +#' } +#' @export +gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, + token = Sys.getenv("ICA_ACCESS_TOKEN"), + page_size = 100, include_url = FALSE, + no_recurse = TRUE, page_token = NULL, + recursive = NULL) { + pattern <- pattern %||% ".*" # keep all recognisable files by default + assertthat::assert_that(all(colnames(regexes) == c("regex", "fun"))) + cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") + d <- dracarys::gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, + no_recurse = no_recurse, page_token = page_token, recursive = recursive + ) |> + dplyr::rowwise() |> + dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(dplyr::any_of(cols_sel)) + d +} + +#' dracarys GDS Download +#' +#' Download only GDS files that can be processed by dracarys. +#' +#' @inheritParams gds_list_files_dir +#' @inheritParams gds_list_files_filter_relevant +#' @param outdir Local output directory. +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). 
+#' @examples +#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' outdir <- sub("gds:/", "~/icav1/g", gdsdir) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) +#' } +#' +#' @export +dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, page_size = 100, dryrun = FALSE, + regexes = DR_FILE_REGEX, recursive = NULL) { + e <- emojifont::emoji + fs::dir_create(outdir) + d <- gds_list_files_filter_relevant( + gdsdir = gdsdir, pattern = pattern, regexes = regexes, + token = token, page_size = page_size, include_url = FALSE, + no_recurse = FALSE, page_token = NULL, + recursive = recursive + ) + d <- d |> + dplyr::mutate( + localpath = file.path(outdir, .data$bname), + gdspath = .data$path + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") + # download recognisable dracarys files to outdir/{bname} + if (!dryrun) { + cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") + res <- d |> + dplyr::rowwise() |> + dplyr::mutate( + dl = gds_file_download_api( + gds_fileid = .data$file_id, out_file = .data$localpath, token = token + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") + return(res) + } else { + cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") + d |> + dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> + as.data.frame() |> + print() + } +} + +#' GDS File Presigned URL +#' +#' Returns presigned URL of given GDS file. +#' +#' @param gds_fileid GDS file ID. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @return Presigned URL if valid. +#' @export +gds_file_presignedurl <- function(gds_fileid, token) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + url <- glue("{base_url}/files/{gds_fileid}") + res <- httr::GET( + url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] + assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) + presigned_url +} + +#' GDS File Download via API +#' +#' @param gds_fileid GDS file ID. +#' @param out_file Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' +#' @examples +#' \dontrun{ +#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" +#' out <- tempfile() +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' } +#' @export +gds_file_download_api <- function(gds_fileid, out_file, token) { + presigned_url <- gds_file_presignedurl(gds_fileid, token) + # keep quiet instead of logging presigned urls + status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) + assertthat::assert_that(status_code == 0) + normalizePath(out_file) +} + +#' GDS File Download via CLI +#' +#' @param gds Full path to GDS file. +#' @param out Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). 
+#' @export +gds_file_download_cli <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + token <- ica_token_validate(token) + system(glue("ica files download {gds} {out} --access-token {token}")) +} diff --git a/R/fs_local.R b/R/fs_local.R new file mode 100644 index 0000000..b90d94c --- /dev/null +++ b/R/fs_local.R @@ -0,0 +1,46 @@ +#' List Files in Local Directory +#' +#' Lists files in a local directory. +#' +#' @param localdir Path to local directory. +#' @return A tibble with file basename, size, last modification timestamp +#' and full path. +#' @examples +#' localdir <- system.file("R", package = "dracarys") +#' x <- local_list_files_dir(localdir) +#' @testexamples +#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +#' @export +local_list_files_dir <- function(localdir) { + fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> + dplyr::mutate( + bname = basename(.data$path), + lastmodified = .data$modification_time + ) |> + dplyr::select("bname", "size", "lastmodified", "path") +} + +#' List Relevant Files In Local Directory +#' +#' Lists relevant files in a local directory. +#' +#' @inheritParams local_list_files_dir +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' @return A tibble with file type, basename, size, last modified timestamp, and +#' path. +#' +#' @examples +#' localdir <- system.file("extdata/tso", package = "dracarys") +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") +#' x <- local_list_files_filter_relevant(localdir, regexes) +#' @testexamples +#' expect_equal(nrow(x), 1) +#' @export +local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX) { + local_list_files_dir(localdir = localdir) |> + dplyr::mutate( + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + ) |> + dplyr::filter(!is.na(.data$type)) |> + dplyr::select("type", "bname", "size", "lastmodified", localpath = "path") +} diff --git a/R/s3.R b/R/fs_s3.R similarity index 100% rename from R/s3.R rename to R/fs_s3.R diff --git a/R/ica.R b/R/ica.R index 3589688..951095b 100644 --- a/R/ica.R +++ b/R/ica.R @@ -1,226 +1,3 @@ -#' List Files in ICAv1 GDS Directory -#' -#' Lists files in a GDS directory. -#' -#' @param gdsdir Full path to GDS directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10 via ICA API). -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API). -#' @param page_token Page token (def: NULL). Used internally for recursion. -#' @param no_recurse Do not recurse through the file list i.e. just give the first items -#' without recursing further down the list using . -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). -#' -#' @return A tibble with file ID, basename, size, last modified timestamp, -#' full GDS path, and presigned URL if requested. 
-#' @examples -#' \dontrun{ -#' gdsdir <- file.path( -#' "gds://production/analysis_data/SBJ00699/umccrise", -#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" -#' ) -#' token <- ica_token_validate() -#' page_size <- 11 -#' include_url <- F -#' page_token <- NULL -#' no_recurse <- TRUE -#' recursive <- NULL -#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) -#' } -#' @export -gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, - include_url = FALSE, no_recurse = TRUE, page_token = NULL, - recursive = NULL) { - assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) - assertthat::assert_that(is.null(recursive) || is.logical(recursive)) - token <- ica_token_validate(token) - assertthat::assert_that(grepl("^gds://", gdsdir)) - gdsdir_original <- gdsdir - if (!grepl("/$", gdsdir)) { - gdsdir <- glue("{gdsdir}/") - } - base_url <- "https://aps2.platform.illumina.com/v1" - volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) - path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) - page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) - query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") - if (include_url) { - query_url <- glue("{query_url}&include=PresignedUrl") - } - if (!is.null(page_token)) { - query_url <- glue("{query_url}&pageToken={page_token}") - } - if (!is.null(recursive)) { - # without specifying recursive, it's true by default - recursive <- ifelse(recursive, "true", "false") - query_url <- glue("{query_url}&recursive={recursive}") - } - query_res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - if (j[["itemCount"]] == 0) { - if (likely_file(gdsdir_original)) { - cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") - } - # if there is a nextPageToken then abort, else continue - if (!is.null(j[["nextPageToken"]])) { - msg <- paste0( - "{date_log()} ERROR: ", - "No GDS files listed in the input directory. Please confirm you can ", - "access the following GDS input directory with your token: ", - "{.file {gdsdir_original}}" - ) - cli::cli_abort(msg) - } - } # endif - d <- j[["items"]] |> - purrr::map(\(x) c( - file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], - lastmodified = x[["timeModified"]], presigned_url = x[["presignedUrl"]] - )) |> - dplyr::bind_rows() - if (nrow(d) == 0) { - # We've iterated through all available items, and the next page has 0 items. - # So dplyr::bind_rows(d, NULL) will return d. - return(NULL) - } - res <- d |> - dplyr::mutate( - size = fs::as_fs_bytes(.data$size), - bname = basename(.data$path), - path = glue("gds://{volname}{.data$path}") - ) |> - dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "lastmodified", "path", "presigned_url"))) - if (!is.null(j[["nextPageToken"]]) && !no_recurse) { - res2 <- gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = NULL, - include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], - recursive = NULL - ) - res <- dplyr::bind_rows(res, res2) - } - res -} - -#' List Relevant Files In ICAv1 GDS Directory -#' -#' Lists relevant files in a GDS directory. 
-#' -#' @inheritParams gds_list_files_dir -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param regexes Tibble with `regex` and `fun`ction name. -#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, -#' and presigned URL if requested. -#' @examples -#' \dontrun{ -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' gds_list_files_filter_relevant(gdsdir) -#' } -#' @export -gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, - token = Sys.getenv("ICA_ACCESS_TOKEN"), - page_size = 100, include_url = FALSE, - no_recurse = TRUE, page_token = NULL, - recursive = NULL) { - pattern <- pattern %||% ".*" # keep all recognisable files by default - assertthat::assert_that(all(colnames(regexes) == c("regex", "fun"))) - cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") - d <- dracarys::gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, - no_recurse = no_recurse, page_token = page_token, recursive = recursive - ) |> - dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(dplyr::any_of(cols_sel)) - d -} - -#' dracarys GDS Download -#' -#' Download only GDS files that can be processed by dracarys. -#' -#' @inheritParams gds_list_files_dir -#' @inheritParams gds_list_files_filter_relevant -#' @param outdir Local output directory. -#' @param dryrun If TRUE, just list the files that will be downloaded (don't -#' download them). 
-#' @examples -#' \dontrun{ -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' outdir <- sub("gds:/", "~/icav1/g", gdsdir) -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) -#' } -#' -#' @export -dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL) { - e <- emojifont::emoji - fs::dir_create(outdir) - d <- gds_list_files_filter_relevant( - gdsdir = gdsdir, pattern = pattern, regexes = regexes, - token = token, page_size = page_size, include_url = FALSE, - no_recurse = FALSE, page_token = NULL, - recursive = recursive - ) - d <- d |> - dplyr::mutate( - localpath = file.path(outdir, .data$bname), - gdspath = .data$path - ) |> - dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") - # download recognisable dracarys files to outdir/{bname} - if (!dryrun) { - cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") - res <- d |> - dplyr::rowwise() |> - dplyr::mutate( - dl = gds_file_download_api( - gds_fileid = .data$file_id, out_file = .data$localpath, token = token - ), - localpath = normalizePath(.data$localpath) - ) |> - dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") - return(res) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") - d |> - dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> - as.data.frame() |> - print() - } -} - -#' GDS File Presigned URL -#' -#' Returns presigned URL of given GDS file. -#' -#' @param gds_fileid GDS file ID. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Presigned URL if valid. -#' @export -gds_file_presignedurl <- function(gds_fileid, token) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - url <- glue("{base_url}/files/{gds_fileid}") - res <- httr::GET( - url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] - assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) - presigned_url -} - #' List FASTQs In GDS Directory #' #' @param gdsdir GDS directory. 
@@ -247,11 +24,8 @@ gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = "fastq\\.gz$", "FASTQ" ) g <- gds_list_files_filter_relevant( - gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, - page_size = page_size, regexes = fq_regex - ) - assertthat::assert_that( - all(colnames(g) == c("type", "bname", "size", "file_id", "path")) + gdsdir = gdsdir, pattern = NULL, regexes = fq_regex, + token = token, page_size = page_size, include_url = include_url ) g |> dplyr::mutate( @@ -259,42 +33,10 @@ gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = size_num = as.numeric(.data$size) ) |> dplyr::select( - "type", "bname", "size", "size_chr", "size_num", "file_id", "path" + "type", "bname", "size", "lastmodified", "size_chr", "size_num", "file_id", "path" ) } -#' GDS File Download via API -#' -#' @param gds_fileid GDS file ID. -#' @param out_file Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' -#' @examples -#' \dontrun{ -#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -#' out <- tempfile() -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' } -#' @export -gds_file_download_api <- function(gds_fileid, out_file, token) { - presigned_url <- gds_file_presignedurl(gds_fileid, token) - # keep quiet instead of logging presigned urls - status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) - assertthat::assert_that(status_code == 0) - normalizePath(out_file) -} - -#' GDS File Download via CLI -#' -#' @param gds Full path to GDS file. -#' @param out Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @export -gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - token <- ica_token_validate(token) - system(glue("ica files download {gds} {out} --access-token {token}")) -} - #' List GDS Volumes #' #' Lists GDS volumes accessible by the provided ICA token. @@ -357,7 +99,7 @@ ica_token_exp <- function(token = Sys.getenv("ICA_ACCESS_TOKEN")) { structure(l$payload$exp, class = c("POSIXct", "POSIXt")) } -likely_file <- function(x) { +gds_likely_file <- function(x) { e <- c( "txt", "tsv", "csv", "html", "json", "stdout", "stderr", "stdouterr", "log", "vcf", "gz", "bam", "bai" diff --git a/R/utils.R b/R/utils.R index 99c6550..378588b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,51 +1,3 @@ -#' List Files in Local Directory -#' -#' Lists files in a local directory. -#' -#' @param localdir Path to local directory. -#' -#' @return A tibble with file basename, size, last modification timestamp -#' and full path. -#' @examples -#' localdir <- system.file("R", package = "dracarys") -#' x <- local_list_files_dir(localdir) -#' @testexamples -#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) -#' @export -local_list_files_dir <- function(localdir) { - fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> - dplyr::mutate( - bname = basename(.data$path), - lastmodified = .data$modification_time - ) |> - dplyr::select("bname", "size", "lastmodified", "path") -} - -#' List Relevant Files In Local Directory -#' -#' Lists relevant files in a local directory. -#' -#' @param path Path to local directory. -#' @param regexes Tibble with `regex` and `fun`ction name (see example). -#' @return A tibble with file type, basename, size, last modified timestamp, and -#' path. 
-#' -#' @examples -#' path <- system.file("extdata/tso", package = "dracarys") -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") -#' x <- local_list_files_filter_relevant(path, regexes) -#' @testexamples -#' expect_equal(nrow(x), 1) -#' @export -local_list_files_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { - local_list_files_dir(localdir = path) |> - dplyr::mutate( - type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) - ) |> - dplyr::filter(!is.na(.data$type)) |> - dplyr::select("type", "bname", "size", "lastmodified", "path") -} - #' Print current timestamp for logging #' #' @return Current timestamp as character. diff --git a/man/Wf.Rd b/man/Wf.Rd index a99ffbb..7efeb70 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -34,10 +34,10 @@ um <- Wf$new(p, "umccrise") \if{html}{\out{
}} \describe{ \item{\code{path}}{(\code{character(1)})\cr -Output directory path with results.} +Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{\code{type}}{(\code{character(1)})\cr -Type of workflow (e.g. umccrise, sash).} +\item{\code{wname}}{(\code{character(1)})\cr +Name of workflow (e.g. umccrise, sash).} \item{\code{filesystem}}{(\code{character(1)})\cr Filesystem of \code{path}.} @@ -61,7 +61,7 @@ Filesystem of \code{path}.} \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, type = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, wname = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -69,7 +69,7 @@ Create a new Workflow object. \describe{ \item{\code{path}}{Output directory path with results.} -\item{\code{type}}{Type of workflow.} +\item{\code{wname}}{Name of workflow.} } \if{html}{\out{
}} } @@ -97,13 +97,21 @@ Print details about the Workflow. \subsection{Method \code{list_files()}}{ List all files under given path. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$list_files(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$list_files( + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{max_objects}}{Maximum number of objects to list.} +\item{\code{max_files}}{Maximum number of files to list.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to \code{gds_list_files_dir} function.} } \if{html}{\out{
}} } @@ -116,7 +124,7 @@ List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( regexes = NULL, - max_objects = 1000, + max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... )}\if{html}{\out{
}} @@ -127,7 +135,12 @@ List dracarys files under given path \describe{ \item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} -\item{\code{page_size}}{Page size} +\item{\code{max_files}}{Maximum number of files to list.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} } \if{html}{\out{
}} } @@ -136,10 +149,37 @@ List dracarys files under given path \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} \subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$download_files(ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$download_files( + outdir, + regexes = NULL, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL +)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outdir}}{Path to output directory.} + +\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Maximum number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 14a25fd..f748163 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{dr_gds_download} \alias{dr_gds_download} \title{dracarys GDS Download} @@ -29,7 +29,7 @@ dr_gds_download( \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 6619188..86ff3f8 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{dr_s3_download} \alias{dr_s3_download} \title{dracarys S3 Download} diff --git a/man/gds_file_download_api.Rd b/man/gds_file_download_api.Rd index 1397860..b10a438 100644 --- a/man/gds_file_download_api.Rd +++ b/man/gds_file_download_api.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{gds_file_download_api} \alias{gds_file_download_api} \title{GDS File Download via API} diff --git a/man/gds_file_download.Rd b/man/gds_file_download_cli.Rd similarity index 61% rename from man/gds_file_download.Rd rename to man/gds_file_download_cli.Rd index 10ab7b5..f5946fb 100644 --- a/man/gds_file_download.Rd +++ b/man/gds_file_download_cli.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R -\name{gds_file_download} -\alias{gds_file_download} +% Please edit documentation in R/fs_icav1.R +\name{gds_file_download_cli} +\alias{gds_file_download_cli} \title{GDS File Download via CLI} \usage{ -gds_file_download(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) +gds_file_download_cli(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) } \arguments{ \item{gds}{Full path to GDS file.} diff --git a/man/gds_file_presignedurl.Rd b/man/gds_file_presignedurl.Rd index 064adb5..841db4a 100644 --- a/man/gds_file_presignedurl.Rd +++ b/man/gds_file_presignedurl.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{gds_file_presignedurl} \alias{gds_file_presignedurl} \title{GDS File Presigned URL} diff --git a/man/gds_list_files_dir.Rd b/man/gds_list_files_dir.Rd index 7d52c1d..d60b546 100644 --- a/man/gds_list_files_dir.Rd +++ b/man/gds_list_files_dir.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{gds_list_files_dir} \alias{gds_list_files_dir} \title{List Files in ICAv1 GDS Directory} diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd index 88a6748..1e280b4 100644 --- a/man/gds_list_files_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R 
\name{gds_list_files_filter_relevant} \alias{gds_list_files_filter_relevant} \title{List Relevant Files In ICAv1 GDS Directory} @@ -21,7 +21,7 @@ gds_list_files_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd index 771aa9b..c66a6c3 100644 --- a/man/local_list_files_dir.Rd +++ b/man/local_list_files_dir.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/fs_local.R \name{local_list_files_dir} \alias{local_list_files_dir} \title{List Files in Local Directory} diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd index 4e17351..caf0832 100644 --- a/man/local_list_files_filter_relevant.Rd +++ b/man/local_list_files_filter_relevant.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/fs_local.R \name{local_list_files_filter_relevant} \alias{local_list_files_filter_relevant} \title{List Relevant Files In Local Directory} \usage{ -local_list_files_filter_relevant(path, regexes = DR_FILE_REGEX) +local_list_files_filter_relevant(localdir, regexes = DR_FILE_REGEX) } \arguments{ -\item{path}{Path to local directory.} +\item{localdir}{Path to local directory.} \item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} } @@ -19,7 +19,7 @@ path. Lists relevant files in a local directory. } \examples{ -path <- system.file("extdata/tso", package = "dracarys") +localdir <- system.file("extdata/tso", package = "dracarys") regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcFile") -x <- local_list_files_filter_relevant(path, regexes) +x <- local_list_files_filter_relevant(localdir, regexes) } diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd index 1598a92..79eb7d6 100644 --- a/man/s3_file_presignedurl.Rd +++ b/man/s3_file_presignedurl.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_file_presignedurl} \alias{s3_file_presignedurl} \title{S3 Generate Presigned URL} diff --git a/man/s3_list_files_dir.Rd b/man/s3_list_files_dir.Rd index 055692f..820da1d 100644 --- a/man/s3_list_files_dir.Rd +++ b/man/s3_list_files_dir.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_list_files_dir} \alias{s3_list_files_dir} \title{List Objects in AWS S3 Directory} diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd index bc55db3..1716825 100644 --- a/man/s3_list_files_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_list_files_filter_relevant} \alias{s3_list_files_filter_relevant} \title{List Relevant Files In AWS S3 Directory} diff --git a/man/s3_search.Rd b/man/s3_search.Rd index c0d9f64..c4a1db2 100644 --- a/man/s3_search.Rd +++ b/man/s3_search.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit 
documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_search} \alias{s3_search} \title{Search AWS S3 Objects} From b8cc5cde6ee839f6d62d831668f2cfd14f8eaa90 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 20:12:27 +1000 Subject: [PATCH 13/24] Wf.R: downloading works across gds/s3 --- R/Wf.R | 61 ++++++++++++++++++++++++++++++++++++++++++++++--------- man/Wf.Rd | 56 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 101 insertions(+), 16 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 0ddc2ee..0dce589 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -5,7 +5,7 @@ #' #' A workflow has: #' -#' - an output directory path with all the result output files (either on GDS, S3 or +#' - a directory path with all the raw output files (either on GDS, S3 or #' local filesystem) #' - a subset of files that are of interest for ingestion #' - tibble with full path and basename columns @@ -15,9 +15,43 @@ #' #' @examples #' \dontrun{ -#' p1 <- "~/icav1/g/production/analysis_data" -#' p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -#' um <- Wf$new(p, "umccrise") +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "-chord\\.tsv\\.gz$", "UmChordTsvFile", +#' "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", +#' "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", +#' "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", +#' "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", +#' "-indel\\.tsv\\.gz$", "UmSigsIndelFile", +#' "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", +#' ) +#' +#' #---- LOCAL ----# +#' p1_local <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' um1 <- Wf$new(path = p, wname = "umccrise") +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant(regexes = regexes) +#' +#' #---- GDS ----# +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' um2 <- Wf$new(path = p, wname = "umccrise") +#' um2$list_files(max_files = 10) +#' um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +#' d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) +#' +#' #---- S3 ----# +#' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" +#' p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" +#' p <- file.path(p1_s3, p2_s3) +#' outdir <- sub("s3:/", "~/s3", p) +#' um3 <- Wf$new(path = p, wname = "sash") +#' um3$list_files(max_files = 10) +#' um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +#' d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) #' } #' #' @export @@ -74,7 +108,7 @@ Wf <- R6::R6Class( invisible(self) }, #' @description List all files under given path. - #' @param max_files Maximum number of files to list. + #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to `gds_list_files_dir` function. list_files = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { @@ -92,7 +126,7 @@ Wf <- R6::R6Class( }, #' @description List dracarys files under given path #' @param regexes Tibble with `regex` and `fun`ction name. - #' @param max_files Maximum number of files to list. 
+ #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. @@ -134,19 +168,26 @@ Wf <- R6::R6Class( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, page_size = max_files, dryrun = dryrun, recursive = recursive ) - self$filesystem <- "local" - self$path <- outdir + if (!dryrun) { + self$filesystem <- "local" + self$path <- outdir + } } else if (self$filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, max_objects = max_files, dryrun = dryrun ) - self$filesystem <- "local" - self$path <- outdir + if (!dryrun) { + self$filesystem <- "local" + self$path <- outdir + } } else { d <- self$list_files_filter_relevant(regexes = regexes) } return(d) + }, + tidy_files = function() { + } ) # end public ) diff --git a/man/Wf.Rd b/man/Wf.Rd index 7efeb70..19dd472 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -9,7 +9,7 @@ workflow run from a UMCCR workflow manager. A workflow has: \itemize{ -\item an output directory path with all the result output files (either on GDS, S3 or +\item a directory path with all the raw output files (either on GDS, S3 or local filesystem) \item a subset of files that are of interest for ingestion \itemize{ @@ -24,9 +24,43 @@ local filesystem) } \examples{ \dontrun{ -p1 <- "~/icav1/g/production/analysis_data" -p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -um <- Wf$new(p, "umccrise") +regexes <- tibble::tribble( + ~regex, ~fun, + "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", + "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", + "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", + "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", + ) + +#---- LOCAL ----# +p1_local <- "~/icav1/g/production/analysis_data" +p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um1 <- Wf$new(path = p, wname = "umccrise") +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(regexes = regexes) + +#---- GDS ----# +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +um2 <- Wf$new(path = p, wname = "umccrise") +um2$list_files(max_files = 10) +um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) + +#---- S3 ----# +p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" +p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" +p <- file.path(p1_s3, p2_s3) +outdir <- sub("s3:/", "~/s3", p) +um3 <- Wf$new(path = p, wname = "sash") +um3$list_files(max_files = 10) +um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) } } @@ -52,6 +86,7 @@ Filesystem of \code{path}.} \item \href{#method-Wf-list_files}{\code{Wf$list_files()}} \item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} \item \href{#method-Wf-download_files}{\code{Wf$download_files()}} +\item 
\href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}}
 \item \href{#method-Wf-clone}{\code{Wf$clone()}}
 }
 }
@@ -107,7 +142,7 @@ List all files under given path.
 \subsection{Arguments}{
 \if{html}{\out{<div class="arguments">}}
 \describe{
-\item{\code{max_files}}{Maximum number of files to list.}
+\item{\code{max_files}}{Max number of files to list (for gds/s3 only).}
 
 \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}
 
@@ -135,7 +170,7 @@ List dracarys files under given path
 \describe{
 \item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.}
 
-\item{\code{max_files}}{Maximum number of files to list.}
+\item{\code{max_files}}{Max number of files to list (for gds/s3 only).}
 
 \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}
 
@@ -180,6 +215,15 @@ GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE
 }
 \if{html}{\out{</div>}}
 }
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-Wf-tidy_files"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf-tidy_files}{}}}
+\subsection{Method \code{tidy_files()}}{
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{Wf$tidy_files()}\if{html}{\out{</div>}}
+}
+
 }
 \if{html}{\out{<hr>
}} \if{html}{\out{}} From 7074df8122f0bca05d0f010cbff9c56db4b84360 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 21:31:34 +1000 Subject: [PATCH 14/24] add tidy_files function --- NAMESPACE | 1 + R/Wf.R | 24 ++++++++---- R/fs_local.R | 16 ++++++-- man/Wf.Rd | 38 ++++++++++++------- man/local_list_files_dir.Rd | 4 +- man/local_list_files_filter_relevant.Rd | 8 +++- man/tidy_files.Rd | 27 +++++++++++++ .../test-roxytest-testexamples-fs_local.R | 20 ++++++++++ 8 files changed, 111 insertions(+), 27 deletions(-) create mode 100644 man/tidy_files.Rd create mode 100644 tests/testthat/test-roxytest-testexamples-fs_local.R diff --git a/NAMESPACE b/NAMESPACE index aa3957e..416306c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,7 @@ export(s3_list_files_dir) export(s3_list_files_filter_relevant) export(s3_search) export(session_info_kable) +export(tidy_files) export(time_metrics_process) export(umccr_tidy) export(write_dracarys_list_of_tbls) diff --git a/R/Wf.R b/R/Wf.R index 0dce589..3c2c3bd 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -30,8 +30,9 @@ #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") #' um1 <- Wf$new(path = p, wname = "umccrise") -#' um1$list_files(max_files = 10) -#' um1$list_files_filter_relevant(regexes = regexes) +#' um1$list_files(max_files = 100) +#' um1$list_files_filter_relevant(regexes = regexes, max_files = 100) +#' #' #' #---- GDS ----# #' p1_gds <- "gds://production/analysis_data" @@ -41,7 +42,10 @@ #' um2 <- Wf$new(path = p, wname = "umccrise") #' um2$list_files(max_files = 10) #' um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) -#' d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) +#' d <- um2$download_files( +#' outdir = outdir, regexes = regexes, ica_token = token, +#' max_files = 1000, dryrun = F +#' ) #' #' #---- S3 ----# #' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" @@ -120,7 +124,7 @@ Wf <- R6::R6Class( } else if (self$filesystem == "s3") { d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { - d <- local_list_files_dir(localdir = path) + d <- local_list_files_dir(localdir = path, max_files = max_files) } return(d) }, @@ -144,7 +148,9 @@ Wf <- R6::R6Class( s3dir = path, regexes = regexes, max_objects = max_files, ... ) } else { - d <- local_list_files_filter_relevant(localdir = path, regexes = regexes) + d <- local_list_files_filter_relevant( + localdir = path, regexes = regexes, max_files = max_files + ) } d }, @@ -182,12 +188,14 @@ Wf <- R6::R6Class( self$path <- outdir } } else { - d <- self$list_files_filter_relevant(regexes = regexes) + d <- self$list_files_filter_relevant(regexes = regexes, max_files = max_files) } return(d) }, - tidy_files = function() { - + #' @description Tidy given files. + #' @param x Tibble with `fun`ction to parse the file and `localpath` to the file. + tidy_files = function(x) { + tidy_files(x) } ) # end public ) diff --git a/R/fs_local.R b/R/fs_local.R index b90d94c..07397aa 100644 --- a/R/fs_local.R +++ b/R/fs_local.R @@ -3,6 +3,8 @@ #' Lists files in a local directory. #' #' @param localdir Path to local directory. +#' @param max_files Max files returned. +#' #' @return A tibble with file basename, size, last modification timestamp #' and full path. 
#' @examples @@ -11,13 +13,18 @@ #' @testexamples #' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) #' @export -local_list_files_dir <- function(localdir) { - fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> +local_list_files_dir <- function(localdir, max_files = NULL) { + d <- fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> dplyr::mutate( bname = basename(.data$path), lastmodified = .data$modification_time ) |> dplyr::select("bname", "size", "lastmodified", "path") + if (!is.null(max_files)) { + d <- d |> + dplyr::slice_head(n = max_files) + } + d } #' List Relevant Files In Local Directory @@ -26,6 +33,7 @@ local_list_files_dir <- function(localdir) { #' #' @inheritParams local_list_files_dir #' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' #' @return A tibble with file type, basename, size, last modified timestamp, and #' path. #' @@ -36,8 +44,8 @@ local_list_files_dir <- function(localdir) { #' @testexamples #' expect_equal(nrow(x), 1) #' @export -local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX) { - local_list_files_dir(localdir = localdir) |> +local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX, max_files = NULL) { + local_list_files_dir(localdir = localdir, max_files = max_files) |> dplyr::mutate( type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) ) |> diff --git a/man/Wf.Rd b/man/Wf.Rd index 19dd472..bc9b4b3 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -25,22 +25,23 @@ local filesystem) \examples{ \dontrun{ regexes <- tibble::tribble( - ~regex, ~fun, - "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", - "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", - "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", - "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", - "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", - "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", - "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", - ) + ~regex, ~fun, + "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", + "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", + "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", + "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", +) #---- LOCAL ----# p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um1 <- Wf$new(path = p, wname = "umccrise") -um1$list_files(max_files = 10) -um1$list_files_filter_relevant(regexes = regexes) +um1$list_files(max_files = 100) +um1$list_files_filter_relevant(regexes = regexes, max_files = 100) + #---- GDS ----# p1_gds <- "gds://production/analysis_data" @@ -50,7 +51,10 @@ token <- Sys.getenv("ICA_ACCESS_TOKEN") um2 <- Wf$new(path = p, wname = "umccrise") um2$list_files(max_files = 10) um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) -d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) +d <- um2$download_files( + outdir = outdir, regexes = regexes, ica_token = token, + max_files = 1000, dryrun = F +) #---- S3 ----# p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" @@ -220,10 +224,18 @@ GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-tidy_files}{}}} \subsection{Method \code{tidy_files()}}{ +Tidy given files. 
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{Wf$tidy_files()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{Wf$tidy_files(x)}\if{html}{\out{</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{x}}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.}
+}
+\if{html}{\out{</div>}}
+}
 }
 \if{html}{\out{<hr>}}
 \if{html}{\out{<a id="method-Wf-clone"></a>}}
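
The tidy_files() contract documented above takes one row per file, pairing a parser name (`fun`) with a `localpath`, and returns the parsed tibbles in a `data` list-column. A minimal sketch of the mechanics (illustrative only: the path below is made up, and dracarys resolves the `fun` string via dr_func_eval() rather than a bare eval()):

x <- tibble::tibble(
  fun = "readr::read_tsv",
  localpath = "SBJ01155__PRJ211091-qc_summary.tsv.gz"
)
x |>
  dplyr::rowwise() |>
  dplyr::mutate(data = list(eval(parse(text = fun))(localpath)))
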
diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd
index c66a6c3..d77ca63 100644
--- a/man/local_list_files_dir.Rd
+++ b/man/local_list_files_dir.Rd
@@ -4,10 +4,12 @@
 \alias{local_list_files_dir}
 \title{List Files in Local Directory}
 \usage{
-local_list_files_dir(localdir)
+local_list_files_dir(localdir, max_files = NULL)
 }
 \arguments{
 \item{localdir}{Path to local directory.}
+
+\item{max_files}{Max files returned.}
 }
 \value{
 A tibble with file basename, size, last modification timestamp
diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd
index 4e17351..ac62065 100644
--- a/man/local_list_files_filter_relevant.Rd
+++ b/man/local_list_files_filter_relevant.Rd
@@ -4,12 +4,18 @@
 \alias{local_list_files_filter_relevant}
 \title{List Relevant Files In Local Directory}
 \usage{
-local_list_files_filter_relevant(localdir, regexes = DR_FILE_REGEX)
+local_list_files_filter_relevant(
+  localdir,
+  regexes = DR_FILE_REGEX,
+  max_files = NULL
+)
 }
 \arguments{
 \item{localdir}{Path to local directory.}
 
 \item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).}
+
+\item{max_files}{Max files returned.}
 }
 \value{
 A tibble with file type, basename, size, last modified timestamp, and
diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd
new file mode 100644
index 0000000..de96245
--- /dev/null
+++ b/man/tidy_files.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tidy.R
+\name{tidy_files}
+\alias{tidy_files}
+\title{Tidy Files}
+\usage{
+tidy_files(x)
+}
+\arguments{
+\item{x}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.}
+}
+\value{
+Tibble with parsed data in a \code{data} list-column.
+}
+\description{
+Tidy Files
+}
+\examples{
+\dontrun{
+p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
+p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz"
+p <- file.path(p1, p2)
+x <- tibble::tibble(fun = "readr::read_tsv", localpath = p)
+tidy_files(x)
+}
+
+}
diff --git a/tests/testthat/test-roxytest-testexamples-fs_local.R b/tests/testthat/test-roxytest-testexamples-fs_local.R
new file mode 100644
index 0000000..764c4aa
--- /dev/null
+++ b/tests/testthat/test-roxytest-testexamples-fs_local.R
@@ -0,0 +1,20 @@
+# Generated by roxytest: do not edit by hand!
+ +# File R/fs_local.R: @testexamples + +test_that("Function local_list_files_dir() @ L16", { + + localdir <- system.file("R", package = "dracarys") + x <- local_list_files_dir(localdir) + expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +}) + + +test_that("Function local_list_files_filter_relevant() @ L47", { + + localdir <- system.file("extdata/tso", package = "dracarys") + regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") + x <- local_list_files_filter_relevant(localdir, regexes) + expect_equal(nrow(x), 1) +}) + From 3be8ad574e7eb14c09b8152571b6b610a8d921ca Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 22:33:13 +1000 Subject: [PATCH 15/24] initialise Wf superclass with regexes --- R/Wf.R | 31 +++++++++++++++++-------------- man/Wf.Rd | 27 +++++++++++++-------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 3c2c3bd..634c6ad 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -29,9 +29,9 @@ #' #---- LOCAL ----# #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -#' um1 <- Wf$new(path = p, wname = "umccrise") +#' um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) #' um1$list_files(max_files = 100) -#' um1$list_files_filter_relevant(regexes = regexes, max_files = 100) +#' um1$list_files_filter_relevant(max_files = 100) #' #' #' #---- GDS ----# @@ -39,11 +39,11 @@ #' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' um2 <- Wf$new(path = p, wname = "umccrise") +#' um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) #' um2$list_files(max_files = 10) -#' um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) #' d <- um2$download_files( -#' outdir = outdir, regexes = regexes, ica_token = token, +#' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = F #' ) #' @@ -52,9 +52,9 @@ #' p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" #' p <- file.path(p1_s3, p2_s3) #' outdir <- sub("s3:/", "~/s3", p) -#' um3 <- Wf$new(path = p, wname = "sash") +#' um3 <- Wf$new(path = p, wname = "sash", regexes = regexes) #' um3$list_files(max_files = 10) -#' um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +#' um3$list_files_filter_relevant(max_files = 50) #' d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) #' } #' @@ -68,13 +68,17 @@ Wf <- R6::R6Class( #' Name of workflow (e.g. umccrise, sash). #' @field filesystem (`character(1)`)\cr #' Filesystem of `path`. + #' @field regexes (`tibble()`)\cr + #' Tibble with file `regex` and `fun`ction to parse it. path = NULL, wname = NULL, filesystem = NULL, + regexes = NULL, #' @description Create a new Workflow object. #' @param path Output directory path with results. #' @param wname Name of workflow. - initialize = function(path = NULL, wname = NULL) { + #' @param regexes Tibble with file `regex` and `fun`ction to parse it. + initialize = function(path = NULL, wname = NULL, regexes = NULL) { wnames <- c( "bcl_convert", "tso_ctdna_tumor_only", @@ -98,6 +102,7 @@ Wf <- R6::R6Class( grepl("^s3://", path) ~ "s3", .default = "local" ) + self$regexes <- regexes }, #' @description Print details about the Workflow. #' @param ... (ignored). 
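# Not part of the patch: a quick illustration of how the `filesystem` field
# set in the hunk above is derived purely from the path prefix (the paths
# below and the helper name `fs_of` are made up for illustration).
fs_of <- function(path) {
  dplyr::case_when(
    grepl("^gds://", path) ~ "gds",
    grepl("^s3://", path) ~ "s3",
    .default = "local"
  )
}
fs_of(c("gds://production/x", "s3://bucket/x", "~/local/x"))
# [1] "gds"   "s3"    "local"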
@@ -129,14 +134,13 @@ Wf <- R6::R6Class( return(d) }, #' @description List dracarys files under given path - #' @param regexes Tibble with `regex` and `fun`ction name. #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(regexes = NULL, - max_files = 1000, + list_files_filter_relevant = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + regexes <- self$regexes assertthat::assert_that(!is.null(regexes)) path <- self$path if (self$filesystem == "gds") { @@ -156,18 +160,17 @@ Wf <- R6::R6Class( }, #' @description Download files from GDS/S3 to local filesystem. #' @param outdir Path to output directory. - #' @param regexes Tibble with `regex` and `fun`ction name. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param max_files Maximum number of files to list. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - download_files = function(outdir, regexes = NULL, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, recursive = NULL) { # TODO: add envvar checker path <- self$path + regexes <- self$regexes assertthat::assert_that(!is.null(regexes)) if (self$filesystem == "gds") { d <- dr_gds_download( diff --git a/man/Wf.Rd b/man/Wf.Rd index bc9b4b3..243f68b 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -38,9 +38,9 @@ regexes <- tibble::tribble( #---- LOCAL ----# p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -um1 <- Wf$new(path = p, wname = "umccrise") +um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) um1$list_files(max_files = 100) -um1$list_files_filter_relevant(regexes = regexes, max_files = 100) +um1$list_files_filter_relevant(max_files = 100) #---- GDS ----# @@ -48,11 +48,11 @@ p1_gds <- "gds://production/analysis_data" p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- Sys.getenv("ICA_ACCESS_TOKEN") -um2 <- Wf$new(path = p, wname = "umccrise") +um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) um2$list_files(max_files = 10) -um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +um2$list_files_filter_relevant(ica_token = token, max_files = 500) d <- um2$download_files( - outdir = outdir, regexes = regexes, ica_token = token, + outdir = outdir, ica_token = token, max_files = 1000, dryrun = F ) @@ -61,9 +61,9 @@ p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275 p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" p <- file.path(p1_s3, p2_s3) outdir <- sub("s3:/", "~/s3", p) -um3 <- Wf$new(path = p, wname = "sash") +um3 <- Wf$new(path = p, wname = "sash", regexes = regexes) um3$list_files(max_files = 10) -um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +um3$list_files_filter_relevant(max_files = 50) d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, 
dryrun = F) } @@ -79,6 +79,9 @@ Name of workflow (e.g. umccrise, sash).} \item{\code{filesystem}}{(\code{character(1)})\cr Filesystem of \code{path}.} + +\item{\code{regexes}}{(\code{tibble()})\cr +Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{}} } @@ -100,7 +103,7 @@ Filesystem of \code{path}.} \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf$new(path = NULL, wname = NULL)}\if{html}{\out{</div>
}} +\if{html}{\out{
<div class="r">}}\preformatted{Wf$new(path = NULL, wname = NULL, regexes = NULL)}\if{html}{\out{</div>
}} } \subsection{Arguments}{ @@ -109,6 +112,8 @@ Create a new Workflow object. \item{\code{path}}{Output directory path with results.} \item{\code{wname}}{Name of workflow.} + +\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{}} } @@ -162,7 +167,6 @@ List all files under given path. List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
<div class="r">}}\preformatted{Wf$list_files_filter_relevant( - regexes = NULL, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... )}\if{html}{\out{</div>
}} \describe{ -\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} - \item{\code{max_files}}{Max number of files to list (for gds/s3 only).} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} @@ -192,7 +194,6 @@ Download files from GDS/S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$download_files( outdir, - regexes = NULL, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, @@ -205,8 +206,6 @@ Download files from GDS/S3 to local filesystem. \describe{ \item{\code{outdir}}{Path to output directory.} -\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} - \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} \item{\code{max_files}}{Maximum number of files to list.} From f57f1012d3546f884c001db50377d0c3cf30b859 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 22:34:17 +1000 Subject: [PATCH 16/24] Wf_umccrise class cleanup --- R/umccrise.R | 80 ++++++++++-------------------------- man/Wf_umccrise.Rd | 100 ++++++++++----------------------------------- 2 files changed, 43 insertions(+), 137 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index 8903583..bfb2902 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -5,44 +5,26 @@ #' #' @examples #' \dontrun{ -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() -#' SubjectID <- "SBJ01155" -#' SampleID_tumor <- "PRJ211091" -#' gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' gdsdir <- file.path(gdsdir1, "L2101566__L2101565") -#' obj <- Wf_umccrise$new(gdsdir) -#' gds_files <- obj$gds_list( -#' gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor -#' ) -#' outdir <- file.path(sub("gds://", "", gdsdir)) -#' outdir <- file.path(normalizePath("~/icav1/g"), outdir) -#' out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) -#' tidy1 <- obj$tidy(indir = outdir, out_files = out_files) +#' #---- LOCAL ----# +#' p1_local <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' um1 <- Wf_umccrise$new(path = p) +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant(max_files = 100) #' } #' #' @export Wf_umccrise <- R6::R6Class( "Wf_umccrise", + inherit = Wf, public = list( - #' @field path Path to the `umccrise` directory. - #' @field contents Tibble with file path, basename, and size. - path = NULL, - contents = NULL, #' @description Create a new Wf_umccrise object. - #' @param path Path to the `umccrise` directory. + #' @param path Output directory path with results. + #' @param wname Name of workflow. initialize = function(path = NULL) { - stopifnot(is.character(path), length(path) == 1) - self$path <- path - }, - #' @description List Relevant Files In umccrise GDS Directory - #' @param gdsdir Path to the `umccrise` directory. - #' @param SubjectID The SubjectID of the sample (used to construct path). - #' @param SampleID_tumor The SampleID of the tumor sample (used to construct path). - #' @param token ICA access token. 
- gds_list = function(gdsdir, SubjectID, SampleID_tumor, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - reg1 <- tibble::tribble( + wname <- "umccrise" + regexes <- tibble::tribble( ~regex, ~fun, - # "-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile", "-chord\\.tsv\\.gz$", "UmChordTsvFile", "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", @@ -50,36 +32,18 @@ Wf_umccrise <- R6::R6Class( "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", "-indel\\.tsv\\.gz$", "UmSigsIndelFile", "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", - "multiqc_conpair.txt", "UmConpairMultiqc" - ) - reg2 <- tibble::tribble( - ~regex, ~fun, + "multiqc_conpair.txt", "UmConpairMultiqc", "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" ) - dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) - dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) - dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json - f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) - f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) - gds_files <- dplyr::bind_rows(f1, f2) - return(gds_files) + super$initialize(path = path, wname = wname, regexes = regexes) }, - - #' @description GDS File Download via API - #' - #' @param gds_files Tibble with bname and file_id for umccrise files. - #' @param outdir Directory to output files (loosely, not in a structured manner). - #' @param token ICA access token. - gds_download = function(gds_files, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - assertthat::assert_that(all(c("bname", "file_id") %in% colnames(gds_files))) - gds_files |> - dplyr::rowwise() |> - dplyr::mutate( - out = file.path(outdir, .data$bname), - out_dl = gds_file_download_api(.data$file_id, .data$out, token) - ) - }, - + # dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) + # dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) + # dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json + # f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) + # f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) + # gds_files <- dplyr::bind_rows(f1, f2) + # return(gds_files) #' @description Tidy up the output files from umccrise #' #' @param indir Path to the `umccrise` directory. @@ -103,9 +67,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read multiqc_conpair.txt file. - #' - #' @param x (`character(1)`)\cr - #' Path to multiqc_conpair.txt file. + #' @param x Path to file. 
read_conpairmultiqc = function(x) { um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 9d6abfb..7be8d45 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -8,42 +8,38 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow } \examples{ \dontrun{ -token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() -SubjectID <- "SBJ01155" -SampleID_tumor <- "PRJ211091" -gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" -gdsdir <- file.path(gdsdir1, "L2101566__L2101565") -obj <- Wf_umccrise$new(gdsdir) -gds_files <- obj$gds_list( - gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor -) -outdir <- file.path(sub("gds://", "", gdsdir)) -outdir <- file.path(normalizePath("~/icav1/g"), outdir) -out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) -tidy1 <- obj$tidy(indir = outdir, out_files = out_files) +#---- LOCAL ----# +p1_local <- "~/icav1/g/production/analysis_data" +p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um1 <- Wf_umccrise$new(path = p) +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(max_files = 100) } } -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{umccrise} directory.} - -\item{\code{contents}}{Tibble with file path, basename, and size.} -} -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_umccrise} } \section{Methods}{ \subsection{Public methods}{ \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} -\item \href{#method-Wf_umccrise-gds_list}{\code{Wf_umccrise$gds_list()}} -\item \href{#method-Wf_umccrise-gds_download}{\code{Wf_umccrise$gds_download()}} \item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } } +\if{html}{\out{ +
Inherited methods + +
+}} \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_umccrise-new}{}}} @@ -56,60 +52,9 @@ Create a new Wf_umccrise object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to the \code{umccrise} directory.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_list}{}}} -\subsection{Method \code{gds_list()}}{ -List Relevant Files In umccrise GDS Directory -\subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list( gdsdir, SubjectID, SampleID_tumor, token = Sys.getenv("ICA_ACCESS_TOKEN") )}\if{html}{\out{</div>
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{gdsdir}}{Path to the \code{umccrise} directory.} - -\item{\code{SubjectID}}{The SubjectID of the sample (used to construct path).} - -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (used to construct path).} - -\item{\code{token}}{ICA access token.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_download}{}}} -\subsection{Method \code{gds_download()}}{ -GDS File Download via API -\subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download( gds_files, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN") )}\if{html}{\out{</div>
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{gds_files}}{Tibble with bname and file_id for umccrise files.} - -\item{\code{outdir}}{Directory to output files (loosely, not in a structured manner).} +\item{\code{path}}{Output directory path with results.} -\item{\code{token}}{ICA access token.} +\item{\code{wname}}{Name of workflow.} } \if{html}{\out{
}} } @@ -145,8 +90,7 @@ Read multiqc_conpair.txt file. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to multiqc_conpair.txt file.} +\item{\code{x}}{Path to file.} } \if{html}{\out{
}} } From d7230feb6fd024d4a89e32b455ffa1d0ecf1c6fe Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 01:35:08 +1000 Subject: [PATCH 17/24] evaluate tidiers inside R6 env --- NAMESPACE | 1 - R/Wf.R | 54 +-- R/fs_icav1.R | 8 +- R/fs_s3.R | 8 +- R/regex.R | 6 +- R/tidy.R | 27 ++ R/umccrise.R | 328 +++++++++--------- man/UmccriseCanRepTables.Rd | 181 ---------- man/Wf.Rd | 40 ++- man/Wf_umccrise.Rd | 209 ++++++++++- man/dr_func_eval.Rd | 5 +- man/dr_gds_download.Rd | 7 +- man/dr_s3_download.Rd | 7 +- man/tidy_files.Rd | 10 +- .../test-roxytest-testexamples-regex.R | 2 +- 15 files changed, 466 insertions(+), 427 deletions(-) delete mode 100644 man/UmccriseCanRepTables.Rd diff --git a/NAMESPACE b/NAMESPACE index 416306c..c2f6f66 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,7 +28,6 @@ export(TsoSampleAnalysisResultsFile) export(TsoTargetRegionCoverageFile) export(TsoTmbFile) export(TsoTmbTraceTsvFile) -export(UmccriseCanRepTables) export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) diff --git a/R/Wf.R b/R/Wf.R index 634c6ad..bb18f62 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -30,9 +30,8 @@ #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") #' um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) -#' um1$list_files(max_files = 100) -#' um1$list_files_filter_relevant(max_files = 100) -#' +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant(max_files = 10) #' #' #---- GDS ----# #' p1_gds <- "gds://production/analysis_data" @@ -44,7 +43,7 @@ #' um2$list_files_filter_relevant(ica_token = token, max_files = 500) #' d <- um2$download_files( #' outdir = outdir, ica_token = token, -#' max_files = 1000, dryrun = F +#' max_files = 1000, dryrun = T #' ) #' #' #---- S3 ----# @@ -62,20 +61,17 @@ Wf <- R6::R6Class( "Wf", public = list( - #' @field path (`character(1)`)\cr - #' Path to directory with raw workflow results (from GDS, S3, or local filesystem). - #' @field wname (`character(1)`)\cr - #' Name of workflow (e.g. umccrise, sash). - #' @field filesystem (`character(1)`)\cr - #' Filesystem of `path`. - #' @field regexes (`tibble()`)\cr - #' Tibble with file `regex` and `fun`ction to parse it. + #' @field path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). + #' @field wname Name of workflow (e.g. umccrise, sash). + #' @field filesystem Filesystem of `path` (gds/s3/local). + #' @field regexes Tibble with file `regex` and `fun`ction to parse it. path = NULL, wname = NULL, filesystem = NULL, regexes = NULL, #' @description Create a new Workflow object. - #' @param path Output directory path with results. + #' @param path Path to directory with raw workflow results. #' @param wname Name of workflow. #' @param regexes Tibble with file `regex` and `fun`ction to parse it. initialize = function(path = NULL, wname = NULL, regexes = NULL) { @@ -117,11 +113,12 @@ Wf <- R6::R6Class( invisible(self) }, #' @description List all files under given path. + #' @param path Path with raw results. #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to `gds_list_files_dir` function. - list_files = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - path <- self$path + list_files = function(path = self$path, max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) 
{ if (self$filesystem == "gds") { d <- gds_list_files_dir( gdsdir = path, token = ica_token, page_size = max_files, ... @@ -134,15 +131,15 @@ Wf <- R6::R6Class( return(d) }, #' @description List dracarys files under given path + #' @param path Path with raw results. #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(max_files = 1000, + list_files_filter_relevant = function(path = self$path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { regexes <- self$regexes assertthat::assert_that(!is.null(regexes)) - path <- self$path if (self$filesystem == "gds") { d <- gds_list_files_filter_relevant( gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... @@ -159,23 +156,26 @@ Wf <- R6::R6Class( d }, #' @description Download files from GDS/S3 to local filesystem. + #' @param path Path with raw results. #' @param outdir Path to output directory. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param max_files Maximum number of files to list. + #' @param max_files Max number of files to list. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL) { + #' @param list_filter_fun Function to filter relevant files. + download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL, + list_filter_fun = NULL) { # TODO: add envvar checker - path <- self$path regexes <- self$regexes - assertthat::assert_that(!is.null(regexes)) + assertthat::assert_that(!is.null(regexes), !is.null(list_filter_fun)) if (self$filesystem == "gds") { d <- dr_gds_download( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, - page_size = max_files, dryrun = dryrun, recursive = recursive + page_size = max_files, dryrun = dryrun, recursive = recursive, + list_filter_fun = list_filter_fun ) if (!dryrun) { self$filesystem <- "local" @@ -184,7 +184,8 @@ Wf <- R6::R6Class( } else if (self$filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, - max_objects = max_files, dryrun = dryrun + max_objects = max_files, dryrun = dryrun, + list_filter_fun = list_filter_fun ) if (!dryrun) { self$filesystem <- "local" @@ -196,9 +197,10 @@ Wf <- R6::R6Class( return(d) }, #' @description Tidy given files. - #' @param x Tibble with `fun`ction to parse the file and `localpath` to the file. + #' @param x Tibble with `localpath` to file and the function `type` to parse it. tidy_files = function(x) { - tidy_files(x) + # awesomeness + tidy_files(x, envir = self) } ) # end public ) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 8f36e81..487669d 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -155,21 +155,23 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' @param outdir Local output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). 
+#' @param list_filter_fun Function to filter relevant GDS files. #' @examples #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' outdir <- sub("gds:/", "~/icav1/g", gdsdir) #' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) +#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) #' } #' #' @export dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL) { + regexes = DR_FILE_REGEX, recursive = NULL, + list_filter_fun = gds_list_files_filter_relevant) { e <- emojifont::emoji fs::dir_create(outdir) - d <- gds_list_files_filter_relevant( + d <- list_filter_fun( gdsdir = gdsdir, pattern = pattern, regexes = regexes, token = token, page_size = page_size, include_url = FALSE, no_recurse = FALSE, page_token = NULL, diff --git a/R/fs_s3.R b/R/fs_s3.R index 1bf8f4a..a4ca9ef 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -109,6 +109,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' @param outdir Path to output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). +#' @param list_filter_fun Function to filter relevant S3 files. #' @examples #' \dontrun{ #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" @@ -116,15 +117,16 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' s3dir <- file.path(p1, p2) #' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") #' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) #' } #' @export dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, - regexes = DR_FILE_REGEX, dryrun = FALSE) { + regexes = DR_FILE_REGEX, dryrun = FALSE, + list_filter_fun = s3_list_files_filter_relevant) { s3 <- paws.storage::s3() e <- emojifont::emoji fs::dir_create(outdir) - d <- s3_list_files_filter_relevant( + d <- list_filter_fun( s3dir = s3dir, pattern = NULL, regexes = regexes, max_objects = max_objects, presign = FALSE ) diff --git a/R/regex.R b/R/regex.R index 4ca485b..34f4079 100644 --- a/R/regex.R +++ b/R/regex.R @@ -83,6 +83,8 @@ FILES_DOWNLOAD_BUT_IGNORE <- c( #' @param f Name of function to evaluate. #' @param v Character vector of strings evaluating to functions. By default, #' this points to the functions in the DR_FILE_REGEX dracarys tibble. +#' @param envir the environment in which to evaluate the function e.g. use `self` +#' when using inside R6 classes. #' #' @return Evaluated function. 
#' @examples @@ -94,13 +96,13 @@ FILES_DOWNLOAD_BUT_IGNORE <- c( #' expect_equal(mean_1_to_10, base::mean(1:10)) #' expect_null(dr_func_eval("foo")) #' @export -dr_func_eval <- function(f, v = NULL) { +dr_func_eval <- function(f, v = NULL, envir = parent.frame()) { v <- v %||% DR_FILE_REGEX[["fun"]] if (!f %in% v) { return(NULL) } # evaluate string - eval(parse(text = f)) + eval(parse(text = f), envir = envir) } #' Get dracarys `DR_FILE_REGEX` diff --git a/R/tidy.R b/R/tidy.R index a1b9313..06fac82 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -1,3 +1,30 @@ +#' Tidy Files +#' +#' @param x Tibble with `localpath` to file and the function `type` to parse it. +#' @param envir the environment in which to evaluate the function e.g. use `self` +#' when using inside R6 classes. +#' +#' @return Tibble with parsed data in a `data` list-column. +#' @examples +#' \dontrun{ +#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" +#' p <- file.path(p1, p2) +#' x <- tibble::tibble(type = "readr::read_tsv", localpath = p) +#' tidy_files(x) +#' } +#' +#' @export +tidy_files <- function(x, envir = parent.frame()) { + assertthat::assert_that(is.data.frame(x)) + assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) + x |> + dplyr::rowwise() |> + dplyr::mutate( + data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) + ) +} + #' Tidy UMCCR Results #' #' Tidies UMCCR workflow results into a list of tibbles and writes individual diff --git a/R/umccrise.R b/R/umccrise.R index bfb2902..9c465c9 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -5,12 +5,29 @@ #' #' @examples #' \dontrun{ +#' #' #---- LOCAL ----# +#' SubjectID <- "SBJ01155" +#' SampleID_tumor <- "PRJ211091" #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -#' um1 <- Wf_umccrise$new(path = p) -#' um1$list_files(max_files = 10) -#' um1$list_files_filter_relevant(max_files = 100) +#' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um1$list_files(max_files = 100) +#' +#' #---- GDS ----# +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um2$list_files(max_files = 8) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) +#' d <- um2$download_files( +#' outdir = outdir, ica_token = token, +#' max_files = 1000, dryrun = F +#' ) #' } #' #' @export @@ -18,151 +35,103 @@ Wf_umccrise <- R6::R6Class( "Wf_umccrise", inherit = Wf, public = list( + #' @field SubjectID The SubjectID of the sample (needed for path lookup). + #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + SubjectID = NULL, + SampleID_tumor = NULL, #' @description Create a new Wf_umccrise object. #' @param path Output directory path with results. - #' @param wname Name of workflow. - initialize = function(path = NULL) { + #' @param SubjectID The SubjectID of the sample (needed for path lookup). + #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). 
+ initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { wname <- "umccrise" regexes <- tibble::tribble( ~regex, ~fun, - "-chord\\.tsv\\.gz$", "UmChordTsvFile", - "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", - "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", - "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", - "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", - "-indel\\.tsv\\.gz$", "UmSigsIndelFile", - "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", - "multiqc_conpair.txt", "UmConpairMultiqc", - "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" - ) - super$initialize(path = path, wname = wname, regexes = regexes) - }, - # dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) - # dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) - # dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json - # f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) - # f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) - # gds_files <- dplyr::bind_rows(f1, f2) - # return(gds_files) - #' @description Tidy up the output files from umccrise - #' - #' @param indir Path to the `umccrise` directory. - #' @param out_files Tibble with file path, basename, and size. - tidy = function(indir, out_files) { - obj_canrep <- UmccriseCanRepTables$new(indir) - canrep_parse <- obj_canrep$read() - pcgr_json <- out_files |> - dplyr::filter(.data$type == "PcgrJsonFile") |> - dplyr::pull("out") |> - PcgrJsonFile$new() |> - read() - conpair_tsv <- out_files |> - dplyr::filter(.data$type == "UmConpairMultiqc") |> - dplyr::pull("out") |> - self$read_conpairmultiqc() - d <- canrep_parse - d[["pcgr_json"]] <- pcgr_json[["metrics"]] - d[["conpair"]] <- conpair_tsv - d - }, + "-chord\\.tsv\\.gz$", "chordtsv", + "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", + "-snv_2015\\.tsv\\.gz$", "sigstsv", + "-snv_2020\\.tsv\\.gz$", "sigstsv", + "-dbs\\.tsv\\.gz$", "sigstsv", + "-indel\\.tsv\\.gz$", "sigstsv", + "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", + "multiqc_conpair.txt", "conpairmultiqc", + "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" + ) |> + dplyr::mutate(fun = paste0("read_", .data$fun)) - #' @description Read multiqc_conpair.txt file. - #' @param x Path to file. - read_conpairmultiqc = function(x) { - um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") - um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) - cnames <- list( - old = c( - "Sample", "concordance_concordance", "concordance_used_markers", - "concordance_total_markers", "concordance_marker_threshold", - "concordance_min_mapping_quality", "concordance_min_base_quality", - "contamination" - ), - new = c( - "sampleid", "contamination", "concordance", "markers_used", - "markers_total", "marker_threshold", - "mapq_min", "baseq_min" - ) - ) - ctypes <- list( - old = c("cddddddd"), - new = c("cddddddd") - ) - if (!file.exists(x)) { - return(empty_tbl(cnames$new, ctypes$new)) - } - d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) - assertthat::assert_that(all(colnames(d1) == cnames$old)) - d1 |> - dplyr::filter(!.data$Sample %in% um_ref_samples) |> - dplyr::relocate("contamination", .after = "Sample") |> - rlang::set_names(cnames$new) - } - ) -) - -#' UmccriseCanRepTables R6 Class -#' -#' @description -#' Reads and writes tidy versions of files within the `cancer_report_tables` directory -#' output from the `umccrise` workflow. 
-#' -#' @examples -#' \dontrun{ -#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565" -#' p <- file.path(p1, p2) -#' obj <- UmccriseCanRepTables$new(p) -#' obj$path -#' obj$contents -#' d <- obj$read() -#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") -#' } -#' -#' @export -UmccriseCanRepTables <- R6::R6Class( - "UmccriseCanRepTables", - public = list( - #' @field path Path to the `cancer_report_tables` directory. - #' @field contents Tibble with file path, basename, and size. - path = NULL, - contents = NULL, - #' @description Create a new UmccriseCanRepTables object. - #' @param path Path to the `cancer_report_tables` directory. - initialize = function(path = NULL) { - stopifnot(is.character(path), length(path) == 1) - self$path <- normalizePath(path) - self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> - dplyr::mutate( - bname = basename(.data$path), - size = as.character(trimws(.data$size)) - ) |> - dplyr::select("path", "bname", "size") + super$initialize(path = path, wname = wname, regexes = regexes) + self$SubjectID <- SubjectID + self$SampleID_tumor <- SampleID_tumor }, - #' @description Print details about the cancer_report_tables directory. + #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { - bnames <- self$contents |> - dplyr::mutate( - low = tolower(.data$bname), - ) |> - dplyr::arrange(.data$low) |> - dplyr::mutate( - n = dplyr::row_number(), - bn = glue("{.data$n}. {.data$bname} ({.data$size})") - ) |> - dplyr::pull("bn") - cat("#--- UmccriseCanRepTables ---#\n") - cat(glue("Path: {self$path}"), "\n") - cat("Contents:\n") - cat(bnames, sep = "\n") + res <- tibble::tribble( + ~var, ~value, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem, + "SubjectID", self$SubjectID, + "SampleID_tumor", self$SampleID_tumor + ) + print(res) invisible(self) }, - - #' @description Read `chord.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `chord.tsv.gz` file. + #' @description List dracarys files under given path + #' @param max_files Max number of files to list (for gds/s3 only). + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + path <- self$path + dir_final <- file.path(path, glue("{self$SubjectID}__{self$SampleID_tumor}")) + dir_work <- file.path(path, "work", glue("{self$SubjectID}__{self$SampleID_tumor}")) + dir_work_pcgr <- file.path(dir_work, "pcgr") # for pcgr json + f1 <- super$list_files_filter_relevant(path = dir_final, max_files = 300, ica_token = ica_token) + f2 <- super$list_files_filter_relevant(path = dir_work_pcgr, max_files = 50, ica_token = ica_token) + f_all <- dplyr::bind_rows(f1, f2) + return(f_all) + }, + #' @description Download files from GDS/S3 to local filesystem. + #' @param outdir Path to output directory. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param max_files Max number of files to list. + #' @param dryrun If TRUE, just list the files that will be downloaded (don't + #' download them). 
+ #' @param recursive Should files be returned recursively _in and under_ the specified + #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). + #' @param list_filter_fun Function to filter relevant files. + download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL) { + super$download_files( + outdir = outdir, ica_token = ica_token, max_files = max_files, + dryrun = dryrun, recursive = recursive, + list_filter_fun = self$list_files_filter_relevant + ) + }, + #' @description Read `pcgr.json.gz` file. + #' @param x Path to file. + read_pcgrjson = function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) + }, + #' @description Read `chord.tsv.gz` cancer report file. + #' @param x Path to file. read_chordtsv = function(x) { ct <- readr::cols_only( p_hrd = "d", @@ -173,10 +142,8 @@ UmccriseCanRepTables <- R6::R6Class( ) read_tsvgz(x, col_types = ct) }, - #' @description Read `hrdetect.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `hrdetect.tsv.gz` file. + #' @description Read `hrdetect.tsv.gz` cancer report file. + #' @param x Path to file. read_hrdetecttsv = function(x) { ct <- readr::cols( .default = "d", @@ -185,24 +152,17 @@ UmccriseCanRepTables <- R6::R6Class( read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) }, - - - #' @description Read `snv_20XX.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `snv_20XX.tsv.gz` file. - read_sigs = function(x) { + #' @description Read `snv_20XX.tsv.gz` cancer report file. + #' @param x Path to file. + read_sigstsv = function(x) { ct <- readr::cols( .default = "d", Signature = "c" ) read_tsvgz(x, col_types = ct) }, - - #' @description Read `qc_summary.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `qc_summary.tsv.gz` file. + #' @description Read `qc_summary.tsv.gz` cancer report file. + #' @param x Path to file. read_qcsummarytsv = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) d |> @@ -235,23 +195,51 @@ UmccriseCanRepTables <- R6::R6Class( "hypermutated", "bpi_enabled" ) }, - #' @description - #' Reads contents of `cancer_report_tables` directory output by umccrise. - #' - #' @return A list of tibbles. - #' @export - read = function() { - x <- self$path - # now return all as list elements - list( - chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), - hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), - sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), - sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), - sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), - sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), - qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() + #' @description Read multiqc_conpair.txt file. 
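+      #' @details Rows for the UMCCR reference samples (`Alice`, `Bob`, `Chen`,
+      #' `Elon`, `Dakota` and their `_T`/`_B` variants) are dropped, and the
+      #' MultiQC column names are mapped to the tidier set defined in `cnames`
+      #' in the body.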
+ #' @param x Path to file. + read_conpairmultiqc = function(x) { + um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") + um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) + cnames <- list( + old = c( + "Sample", "concordance_concordance", "concordance_used_markers", + "concordance_total_markers", "concordance_marker_threshold", + "concordance_min_mapping_quality", "concordance_min_base_quality", + "contamination" + ), + new = c( + "sampleid", "contamination", "concordance", "markers_used", + "markers_total", "marker_threshold", + "mapq_min", "baseq_min" + ) ) + ctypes <- list( + old = c("cddddddd"), + new = c("cddddddd") + ) + if (!file.exists(x)) { + return(empty_tbl(cnames$new, ctypes$new)) + } + d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) + assertthat::assert_that(all(colnames(d1) == cnames$old)) + d1 |> + dplyr::filter(!.data$Sample %in% um_ref_samples) |> + dplyr::relocate("contamination", .after = "Sample") |> + rlang::set_names(cnames$new) } - ) + ) # end public ) + +# read = function() { +# x <- self$path +# # now return all as list elements +# list( +# chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), +# hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), +# sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), +# sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), +# sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), +# sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), +# qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() +# ) +# } diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd deleted file mode 100644 index 5c43358..0000000 --- a/man/UmccriseCanRepTables.Rd +++ /dev/null @@ -1,181 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/umccrise.R -\name{UmccriseCanRepTables} -\alias{UmccriseCanRepTables} -\title{UmccriseCanRepTables R6 Class} -\description{ -Reads and writes tidy versions of files within the \code{cancer_report_tables} directory -output from the \code{umccrise} workflow. -} -\examples{ -\dontrun{ -p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565" -p <- file.path(p1, p2) -obj <- UmccriseCanRepTables$new(p) -obj$path -obj$contents -d <- obj$read() -obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") -} - -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{cancer_report_tables} directory.} - -\item{\code{contents}}{Tibble with file path, basename, and size.} -} -\if{html}{\out{
}} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-UmccriseCanRepTables-new}{\code{UmccriseCanRepTables$new()}} -\item \href{#method-UmccriseCanRepTables-print}{\code{UmccriseCanRepTables$print()}} -\item \href{#method-UmccriseCanRepTables-read_chordtsv}{\code{UmccriseCanRepTables$read_chordtsv()}} -\item \href{#method-UmccriseCanRepTables-read_hrdetecttsv}{\code{UmccriseCanRepTables$read_hrdetecttsv()}} -\item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} -\item \href{#method-UmccriseCanRepTables-read_qcsummarytsv}{\code{UmccriseCanRepTables$read_qcsummarytsv()}} -\item \href{#method-UmccriseCanRepTables-read}{\code{UmccriseCanRepTables$read()}} -\item \href{#method-UmccriseCanRepTables-clone}{\code{UmccriseCanRepTables$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-new}{}}} -\subsection{Method \code{new()}}{ -Create a new UmccriseCanRepTables object. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$new(path = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{cancer_report_tables} directory.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-print}{}}} -\subsection{Method \code{print()}}{ -Print details about the cancer_report_tables directory. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$print(...)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{...}}{(ignored).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_chordtsv}{}}} -\subsection{Method \code{read_chordtsv()}}{ -Read \code{chord.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_chordtsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{chord.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_hrdetecttsv}{}}} -\subsection{Method \code{read_hrdetecttsv()}}{ -Read \code{hrdetect.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_hrdetecttsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{hrdetect.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_sigs}{}}} -\subsection{Method \code{read_sigs()}}{ -Read \code{snv_20XX.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_sigs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{snv_20XX.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_qcsummarytsv}{}}} -\subsection{Method \code{read_qcsummarytsv()}}{ -Read \code{qc_summary.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_qcsummarytsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{qc_summary.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read}{}}} -\subsection{Method \code{read()}}{ -Reads contents of \code{cancer_report_tables} directory output by umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -A list of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/Wf.Rd b/man/Wf.Rd index 243f68b..ce29182 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -39,9 +39,8 @@ regexes <- tibble::tribble( p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) -um1$list_files(max_files = 100) -um1$list_files_filter_relevant(max_files = 100) - +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(max_files = 10) #---- GDS ----# p1_gds <- "gds://production/analysis_data" @@ -53,7 +52,7 @@ um2$list_files(max_files = 10) um2$list_files_filter_relevant(ica_token = token, max_files = 500) d <- um2$download_files( outdir = outdir, ica_token = token, - max_files = 1000, dryrun = F + max_files = 1000, dryrun = T ) #---- S3 ----# @@ -71,17 +70,14 @@ d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryr \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{(\code{character(1)})\cr -Path to directory with raw workflow results (from GDS, S3, or local filesystem).} +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} -\item{\code{wname}}{(\code{character(1)})\cr -Name of workflow (e.g. umccrise, sash).} +\item{\code{wname}}{Name of workflow (e.g. umccrise, sash).} -\item{\code{filesystem}}{(\code{character(1)})\cr -Filesystem of \code{path}.} +\item{\code{filesystem}}{Filesystem of \code{path} (gds/s3/local).} -\item{\code{regexes}}{(\code{tibble()})\cr -Tibble with file \code{regex} and \code{fun}ction to parse it.} +\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{
}} } @@ -109,7 +105,7 @@ Create a new Workflow object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Output directory path with results.} +\item{\code{path}}{Path to directory with raw workflow results.} \item{\code{wname}}{Name of workflow.} @@ -142,6 +138,7 @@ Print details about the Workflow. List all files under given path. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files( + path = self$path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -151,6 +148,8 @@ List all files under given path. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ +\item{\code{path}}{Path with raw results.} + \item{\code{max_files}}{Max number of files to list (for gds/s3 only).} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} @@ -167,6 +166,7 @@ List all files under given path. List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( + path = self$path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -176,6 +176,8 @@ List dracarys files under given path \subsection{Arguments}{ \if{html}{\out{
}} \describe{ +\item{\code{path}}{Path with raw results.} + \item{\code{max_files}}{Max number of files to list (for gds/s3 only).} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} @@ -193,28 +195,34 @@ the \code{s3_list_files_filter_relevant} function.} Download files from GDS/S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$download_files( + path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, - recursive = NULL + recursive = NULL, + list_filter_fun = NULL )}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ +\item{\code{path}}{Path with raw results.} + \item{\code{outdir}}{Path to output directory.} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{\code{max_files}}{Maximum number of files to list.} +\item{\code{max_files}}{Max number of files to list.} \item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't download them).} \item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{\code{list_filter_fun}}{Function to filter relevant files.} } \if{html}{\out{
}} } @@ -231,7 +239,7 @@ Tidy given files. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{x}}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.} +\item{\code{x}}{Tibble with \code{localpath} to file and the function \code{type} to parse it.} } \if{html}{\out{
}} } diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 7be8d45..d8eea05 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -8,23 +8,57 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow } \examples{ \dontrun{ + #---- LOCAL ----# +SubjectID <- "SBJ01155" +SampleID_tumor <- "PRJ211091" p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -um1 <- Wf_umccrise$new(path = p) -um1$list_files(max_files = 10) -um1$list_files_filter_relevant(max_files = 100) +um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um1$list_files(max_files = 100) + +#---- GDS ----# +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um2$list_files(max_files = 8) +um2$list_files_filter_relevant(ica_token = token, max_files = 500) +d <- um2$download_files( + outdir = outdir, ica_token = token, + max_files = 1000, dryrun = F +) + } } \section{Super class}{ \code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_umccrise} } +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +} +\if{html}{\out{
}} +} \section{Methods}{ \subsection{Public methods}{ \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} -\item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}} +\item \href{#method-Wf_umccrise-print}{\code{Wf_umccrise$print()}} +\item \href{#method-Wf_umccrise-list_files_filter_relevant}{\code{Wf_umccrise$list_files_filter_relevant()}} +\item \href{#method-Wf_umccrise-download_files}{\code{Wf_umccrise$download_files()}} +\item \href{#method-Wf_umccrise-read_pcgrjson}{\code{Wf_umccrise$read_pcgrjson()}} +\item \href{#method-Wf_umccrise-read_chordtsv}{\code{Wf_umccrise$read_chordtsv()}} +\item \href{#method-Wf_umccrise-read_hrdetecttsv}{\code{Wf_umccrise$read_hrdetecttsv()}} +\item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} +\item \href{#method-Wf_umccrise-read_qcsummarytsv}{\code{Wf_umccrise$read_qcsummarytsv()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } @@ -32,10 +66,7 @@ um1$list_files_filter_relevant(max_files = 100) \if{html}{\out{
Inherited methods
@@ -46,7 +77,7 @@ um1$list_files_filter_relevant(max_files = 100) \subsection{Method \code{new()}}{ Create a new Wf_umccrise object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -54,26 +85,172 @@ Create a new Wf_umccrise object. \describe{ \item{\code{path}}{Output directory path with results.} -\item{\code{wname}}{Name of workflow.} +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} } \if{html}{\out{
}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-tidy}{}}} -\subsection{Method \code{tidy()}}{ -Tidy up the output files from umccrise +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-print}{}}} +\subsection{Method \code{print()}}{ +Print details about the Workflow. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$tidy(indir, out_files)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$print(...)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{indir}}{Path to the \code{umccrise} directory.} +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$list_files_filter_relevant( + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{\code{out_files}}{Tibble with file path, basename, and size.} +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-download_files}{}}} +\subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$download_files( + outdir, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outdir}}{Path to output directory.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Max number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{\code{list_filter_fun}}{Function to filter relevant files.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgrjson}{}}} +\subsection{Method \code{read_pcgrjson()}}{ +Read \code{pcgr.json.gz} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgrjson(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_chordtsv}{}}} +\subsection{Method \code{read_chordtsv()}}{ +Read \code{chord.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_chordtsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdetecttsv}{}}} +\subsection{Method \code{read_hrdetecttsv()}}{ +Read \code{hrdetect.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdetecttsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} +\subsection{Method \code{read_sigstsv()}}{ +Read \code{snv_20XX.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsummarytsv}{}}} +\subsection{Method \code{read_qcsummarytsv()}}{ +Read \code{qc_summary.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsummarytsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} } \if{html}{\out{
}} } diff --git a/man/dr_func_eval.Rd b/man/dr_func_eval.Rd index 2dfedf1..0697c7f 100644 --- a/man/dr_func_eval.Rd +++ b/man/dr_func_eval.Rd @@ -4,13 +4,16 @@ \alias{dr_func_eval} \title{Evaluate dracarys Function} \usage{ -dr_func_eval(f, v = NULL) +dr_func_eval(f, v = NULL, envir = parent.frame()) } \arguments{ \item{f}{Name of function to evaluate.} \item{v}{Character vector of strings evaluating to functions. By default, this points to the functions in the DR_FILE_REGEX dracarys tibble.} + +\item{envir}{the environment in which to evaluate the function e.g. use \code{self} +when using inside R6 classes.} } \value{ Evaluated function. diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index f748163..677a654 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -12,7 +12,8 @@ dr_gds_download( page_size = 100, dryrun = FALSE, regexes = DR_FILE_REGEX, - recursive = NULL + recursive = NULL, + list_filter_fun = gds_list_files_filter_relevant ) } \arguments{ @@ -33,6 +34,8 @@ download them).} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{list_filter_fun}{Function to filter relevant GDS files.} } \description{ Download only GDS files that can be processed by dracarys. @@ -42,7 +45,7 @@ Download only GDS files that can be processed by dracarys. gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" outdir <- sub("gds:/", "~/icav1/g", gdsdir) regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") -dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) +dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) } } diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 86ff3f8..1a6475c 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -10,7 +10,8 @@ dr_s3_download( max_objects = 100, pattern = NULL, regexes = DR_FILE_REGEX, - dryrun = FALSE + dryrun = FALSE, + list_filter_fun = s3_list_files_filter_relevant ) } \arguments{ @@ -26,6 +27,8 @@ dr_s3_download( \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} + +\item{list_filter_fun}{Function to filter relevant S3 files.} } \description{ Download only S3 files that can be processed by dracarys. @@ -37,6 +40,6 @@ p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" s3dir <- file.path(p1, p2) regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) } } diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index de96245..d61b441 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -4,10 +4,13 @@ \alias{tidy_files} \title{Tidy Files} \usage{ -tidy_files(x) +tidy_files(x, envir = parent.frame()) } \arguments{ -\item{x}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.} +\item{x}{Tibble with \code{localpath} to file and the function \code{type} to parse it.} + +\item{envir}{the environment in which to evaluate the function e.g. use \code{self} +when using inside R6 classes.} } \value{ Tibble with parsed data in a \code{data} list-column. 
@@ -20,7 +23,8 @@ Tidy Files p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" p <- file.path(p1, p2) -x <- tibble::tibble(fun = "readr::read_tsv", localpath = p) +x <- tibble::tibble(type = "readr::read_tsv", localpath = p) +tidy_files(x) ) } diff --git a/tests/testthat/test-roxytest-testexamples-regex.R b/tests/testthat/test-roxytest-testexamples-regex.R index 91a7f26..05787ff 100644 --- a/tests/testthat/test-roxytest-testexamples-regex.R +++ b/tests/testthat/test-roxytest-testexamples-regex.R @@ -2,7 +2,7 @@ # File R/regex.R: @testexamples -test_that("Function dr_func_eval() @ L97", { +test_that("Function dr_func_eval() @ L99", { mean_1_to_10 <- dr_func_eval("mean", v = c("mean", "sd"))(1:10) x <- system.file("extdata/tso/sample705.fragment_length_hist.json.gz", package = "dracarys") From a23dbad0ee6f0ebb3edb61d42cdabc0b926294aa Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 20:47:38 +1000 Subject: [PATCH 18/24] mirror gds/s3 subdir when downloading --- R/fs_icav1.R | 14 +++++++++++--- R/fs_s3.R | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 487669d..e38d851 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -160,7 +160,11 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' outdir <- sub("gds:/", "~/icav1/g", gdsdir) -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "multiqc_data\\.json$", "MultiqcJsonFile", +#' "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" +#' ) #' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) #' } #' @@ -179,11 +183,15 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN ) d <- d |> dplyr::mutate( - localpath = file.path(outdir, .data$bname), + gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), + gdspath_minus_gdsdir_outdir = fs::dir_create( + file.path(outdir, dirname(gdspath_minus_gdsdir)) + ), + localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), gdspath = .data$path ) |> dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") - # download recognisable dracarys files to outdir/{bname} + # download recognisable dracarys files to outdir//{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") res <- d |> diff --git a/R/fs_s3.R b/R/fs_s3.R index a4ca9ef..2dac50c 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -115,9 +115,13 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" #' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" #' s3dir <- file.path(p1, p2) -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "multiqc_data\\.json$", "MultiqcJsonFile", +#' "pcgr.*\\.json\\.gz$", "pcgrjson" +#' ) #' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 500, regexes = regexes, dryrun = F) #' } #' @export dr_s3_download <- function(s3dir, outdir, 
max_objects = 100, pattern = NULL, @@ -132,11 +136,15 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, ) d <- d |> dplyr::mutate( - localpath = file.path(outdir, .data$bname), + s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), + s3path_minus_s3dir_outdir = fs::dir_create( + file.path(outdir, dirname(s3path_minus_s3dir)) + ), + localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname), s3path = .data$path ) |> dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") - # download recognisable dracarys files to outdir/{bname} + # download recognisable dracarys files to outdir//{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") d |> From 2e6001434964aa172e8a7b2202d9b2736c027d67 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 22:54:20 +1000 Subject: [PATCH 19/24] wrap sig parsers --- R/Wf.R | 1 + R/umccrise.R | 42 +++++++++++++++++----- man/Wf.Rd | 1 + man/Wf_umccrise.Rd | 79 ++++++++++++++++++++++++++++++++++++++++-- man/dr_gds_download.Rd | 6 +++- man/dr_s3_download.Rd | 8 +++-- man/tidy_files.Rd | 1 - 7 files changed, 122 insertions(+), 16 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index bb18f62..f101975 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -45,6 +45,7 @@ #' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = T #' ) +#' d_tidy <- um2$tidy_files(d) #' #' #---- S3 ----# #' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" diff --git a/R/umccrise.R b/R/umccrise.R index 9c465c9..14f4479 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -7,12 +7,15 @@ #' \dontrun{ #' #' #---- LOCAL ----# -#' SubjectID <- "SBJ01155" -#' SampleID_tumor <- "PRJ211091" +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" #' p1_local <- "~/icav1/g/production/analysis_data" -#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) -#' um1$list_files(max_files = 100) +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant() +#' d <- um1$download_files(max_files = 1000, dryrun = T) +#' d_tidy <- um1$tidy_files(d) #' #' #---- GDS ----# #' SubjectID <- "SBJ03043" @@ -28,6 +31,7 @@ #' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = F #' ) +#' d_tidy <- um2$tidy_files(d) #' } #' #' @export @@ -49,10 +53,10 @@ Wf_umccrise <- R6::R6Class( ~regex, ~fun, "-chord\\.tsv\\.gz$", "chordtsv", "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", - "-snv_2015\\.tsv\\.gz$", "sigstsv", - "-snv_2020\\.tsv\\.gz$", "sigstsv", - "-dbs\\.tsv\\.gz$", "sigstsv", - "-indel\\.tsv\\.gz$", "sigstsv", + "-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", + "-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", + "-dbs\\.tsv\\.gz$", "sigsdbstsv", + "-indel\\.tsv\\.gz$", "sigsindeltsv", "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", "multiqc_conpair.txt", "conpairmultiqc", "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" @@ -152,7 +156,7 @@ Wf_umccrise <- R6::R6Class( read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) }, - #' @description Read `snv_20XX.tsv.gz` cancer report file. + #' @description Read signature cancer report file. #' @param x Path to file. 
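+    #' @details Common reader for the four signature files; assumes a gzipped
+    #' TSV with a character `Signature` column, with all remaining columns
+    #' parsed as doubles.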
read_sigstsv = function(x) { ct <- readr::cols( @@ -161,6 +165,26 @@ Wf_umccrise <- R6::R6Class( ) read_tsvgz(x, col_types = ct) }, + #' @description Read `snv_2015.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2015tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `snv_2020.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2020tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `dbs.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsdbstsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `indel.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsindeltsv = function(x) { + self$read_sigstsv(x) + }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. read_qcsummarytsv = function(x) { diff --git a/man/Wf.Rd b/man/Wf.Rd index ce29182..f3e6767 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -54,6 +54,7 @@ d <- um2$download_files( outdir = outdir, ica_token = token, max_files = 1000, dryrun = T ) +d_tidy <- um2$tidy_files(d) #---- S3 ----# p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index d8eea05..6fc7693 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -15,7 +15,8 @@ SampleID_tumor <- "PRJ211091" p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) -um1$list_files(max_files = 100) +um1$list_files(max_files = 10) +um1$list_files_filter_relevant() #---- GDS ----# SubjectID <- "SBJ03043" @@ -31,7 +32,7 @@ d <- um2$download_files( outdir = outdir, ica_token = token, max_files = 1000, dryrun = F ) - +d_tidy <- um2$tidy_files(d) } } @@ -58,6 +59,10 @@ d <- um2$download_files( \item \href{#method-Wf_umccrise-read_chordtsv}{\code{Wf_umccrise$read_chordtsv()}} \item \href{#method-Wf_umccrise-read_hrdetecttsv}{\code{Wf_umccrise$read_hrdetecttsv()}} \item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} +\item \href{#method-Wf_umccrise-read_sigssnv2015tsv}{\code{Wf_umccrise$read_sigssnv2015tsv()}} +\item \href{#method-Wf_umccrise-read_sigssnv2020tsv}{\code{Wf_umccrise$read_sigssnv2020tsv()}} +\item \href{#method-Wf_umccrise-read_sigsdbstsv}{\code{Wf_umccrise$read_sigsdbstsv()}} +\item \href{#method-Wf_umccrise-read_sigsindeltsv}{\code{Wf_umccrise$read_sigsindeltsv()}} \item \href{#method-Wf_umccrise-read_qcsummarytsv}{\code{Wf_umccrise$read_qcsummarytsv()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} @@ -225,11 +230,79 @@ Read \code{hrdetect.tsv.gz} cancer report file. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} \subsection{Method \code{read_sigstsv()}}{ -Read \code{snv_20XX.tsv.gz} cancer report file. +Read signature cancer report file. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2015tsv}{}}} +\subsection{Method \code{read_sigssnv2015tsv()}}{ +Read \code{snv_2015.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2015tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2020tsv}{}}} +\subsection{Method \code{read_sigssnv2020tsv()}}{ +Read \code{snv_2020.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2020tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsdbstsv}{}}} +\subsection{Method \code{read_sigsdbstsv()}}{ +Read \code{dbs.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsdbstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsindeltsv}{}}} +\subsection{Method \code{read_sigsindeltsv()}}{ +Read \code{indel.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsindeltsv(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 677a654..61aa7f8 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -44,7 +44,11 @@ Download only GDS files that can be processed by dracarys. \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" outdir <- sub("gds:/", "~/icav1/g", gdsdir) -regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +regexes <- tibble::tribble( + ~regex, ~fun, + "multiqc_data\\\\.json$", "MultiqcJsonFile", + "-somatic\\\\.pcgr\\\\.json\\\\.gz$", "pcgrjson" +) dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) } diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 1a6475c..674ed93 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -38,8 +38,12 @@ Download only S3 files that can be processed by dracarys. p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" s3dir <- file.path(p1, p2) -regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +regexes <- tibble::tribble( + ~regex, ~fun, + "multiqc_data\\\\.json$", "MultiqcJsonFile", + "pcgr.*\\\\.json\\\\.gz$", "pcgrjson" +) outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 500, regexes = regexes, dryrun = F) } } diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index d61b441..8d1849b 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -25,7 +25,6 @@ p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" p <- file.path(p1, p2) x <- tibble::tibble(type = "readr::read_tsv", localpath = p) tidy_files(x) -) } } From 3125056ba50b72aa7c37d6ad9d70bc0bb6348843 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 23:34:16 +1000 Subject: [PATCH 20/24] add Wf class write method --- R/Wf.R | 23 +++++++++++++++++++++++ R/umccrise.R | 8 +++++++- man/Wf.Rd | 26 ++++++++++++++++++++++++++ man/Wf_umccrise.Rd | 15 ++++++++++++--- 4 files changed, 68 insertions(+), 4 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index f101975..aa5bfae 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -202,6 +202,29 @@ Wf <- R6::R6Class( tidy_files = function(x) { # awesomeness tidy_files(x, envir = self) + }, + #' @description Write tidy data. + #' @param x Tibble with tidy `data` and file `type`. + #' @param outdir Directory path to output tidy files. + #' @param prefix Prefix of output files. + #' @param format Format of output files. + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). 
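+      #' @details Each tidy tibble is written via `write_dracarys()` under the
+      #' prefix `{prefix}_{section}`, where `section` is the parser `type` with
+      #' its `read_` prefix dropped; the named list of tibbles is returned
+      #' invisibly. A hypothetical call:
+      #' `um1$write(d_tidy, outdir = tempdir(), prefix = "sampleA")`.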
+ write = function(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL) { + assertthat::assert_that(!is.null(prefix)) + if (!is.null(outdir)) { + prefix <- file.path(outdir, prefix) + } + d_write <- x |> + dplyr::rowwise() |> + dplyr::mutate( + section = sub("read_", "", .data$type), + p = glue("{prefix}_{.data$section}"), + out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)) + ) |> + dplyr::ungroup() |> + dplyr::select("section", "data") |> + tibble::deframe() + invisible(d_write) } ) # end public ) diff --git a/R/umccrise.R b/R/umccrise.R index 14f4479..2b962da 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -14,8 +14,14 @@ #' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) #' um1$list_files(max_files = 10) #' um1$list_files_filter_relevant() -#' d <- um1$download_files(max_files = 1000, dryrun = T) +#' d <- um1$download_files(max_files = 1000, dryrun = F) #' d_tidy <- um1$tidy_files(d) +#' d_write <- um1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) #' #' #---- GDS ----# #' SubjectID <- "SBJ03043" diff --git a/man/Wf.Rd b/man/Wf.Rd index f3e6767..f785399 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -91,6 +91,7 @@ local filesystem).} \item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} \item \href{#method-Wf-download_files}{\code{Wf$download_files()}} \item \href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}} +\item \href{#method-Wf-write}{\code{Wf$write()}} \item \href{#method-Wf-clone}{\code{Wf$clone()}} } } @@ -246,6 +247,31 @@ Tidy given files. } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-write}{}}} +\subsection{Method \code{write()}}{ +Write tidy data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$write(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Tibble with tidy \code{data} and file \code{type}.} + +\item{\code{outdir}}{Directory path to output tidy files.} + +\item{\code{prefix}}{Prefix of output files.} + +\item{\code{format}}{Format of output files.} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 6fc7693..1ff1a33 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -10,13 +10,21 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow \dontrun{ #---- LOCAL ----# -SubjectID <- "SBJ01155" -SampleID_tumor <- "PRJ211091" +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" p1_local <- "~/icav1/g/production/analysis_data" -p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) um1$list_files(max_files = 10) um1$list_files_filter_relevant() +d <- um1$download_files(max_files = 1000, dryrun = F) +d_tidy <- um1$tidy_files(d) +d_write <- um1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) #---- GDS ----# SubjectID <- "SBJ03043" @@ -73,6 +81,7 @@ d_tidy <- um2$tidy_files(d) }} From efb827ecb3c0aea0f1e0f6118bc55f81e2db2e81 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 10 Sep 2024 11:03:42 +1000 Subject: [PATCH 21/24] add Wf_umccrise_download_tidy_write --- NAMESPACE | 1 + R/fs_icav1.R | 2 +- R/fs_s3.R | 2 +- R/umccrise.R | 79 +++++++++++++++++++++----- man/Wf_umccrise.Rd | 11 +++- man/Wf_umccrise_download_tidy_write.Rd | 54 ++++++++++++++++++ 6 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 man/Wf_umccrise_download_tidy_write.Rd diff --git a/NAMESPACE b/NAMESPACE index c2f6f66..b10ca1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,7 @@ export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) export(Wf_umccrise) +export(Wf_umccrise_download_tidy_write) export(WgsContigMeanCovFile) export(WgsCoverageMetricsFile) export(WgsFineHistFile) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index e38d851..16df2c9 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -185,7 +185,7 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN dplyr::mutate( gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), gdspath_minus_gdsdir_outdir = fs::dir_create( - file.path(outdir, dirname(gdspath_minus_gdsdir)) + file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) ), localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), gdspath = .data$path diff --git a/R/fs_s3.R b/R/fs_s3.R index 2dac50c..34c437e 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -138,7 +138,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, dplyr::mutate( s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), s3path_minus_s3dir_outdir = fs::dir_create( - file.path(outdir, dirname(s3path_minus_s3dir)) + file.path(outdir, dirname(.data$s3path_minus_s3dir)) ), localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname), s3path = .data$path diff --git a/R/umccrise.R b/R/umccrise.R index 2b962da..8f1deae 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -9,6 +9,7 @@ #' #---- LOCAL ----# #' SubjectID <- "SBJ03043" #' SampleID_tumor <- "PRJ230004" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) @@ -26,6 +27,7 @@ #' #---- GDS ----# #' 
SubjectID <- "SBJ03043" #' SampleID_tumor <- "PRJ230004" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' p1_gds <- "gds://production/analysis_data" #' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) @@ -38,6 +40,12 @@ #' max_files = 1000, dryrun = F #' ) #' d_tidy <- um2$tidy_files(d) +#' d_write <- um2$write( +#' d_tidy, +#' outdir = file.path(outdir, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) #' } #' #' @export @@ -50,7 +58,8 @@ Wf_umccrise <- R6::R6Class( SubjectID = NULL, SampleID_tumor = NULL, #' @description Create a new Wf_umccrise object. - #' @param path Output directory path with results. + #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). #' @param SubjectID The SubjectID of the sample (needed for path lookup). #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { @@ -64,7 +73,7 @@ Wf_umccrise <- R6::R6Class( "-dbs\\.tsv\\.gz$", "sigsdbstsv", "-indel\\.tsv\\.gz$", "sigsindeltsv", "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", - "multiqc_conpair.txt", "conpairmultiqc", + "multiqc_conpair\\.txt$", "conpairmultiqc", "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -260,16 +269,56 @@ Wf_umccrise <- R6::R6Class( ) # end public ) -# read = function() { -# x <- self$path -# # now return all as list elements -# list( -# chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), -# hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), -# sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), -# sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), -# sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), -# sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), -# qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() -# ) -# } +#' umccrise Download Tidy and Write +#' +#' Downloads files from the `umccrise` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' local filesystem). +#' @param SubjectID The SubjectID of the sample (needed for path lookup). +#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param outdir Path to output directory. +#' @param max_files Max number of files to list. +#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @return List where each element is a tidy tibble of a umccrise file. 
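+#' @details A convenience wrapper that instantiates `Wf_umccrise`, runs
+#' `download_files()`, then `tidy_files()` and `write()`; with `dryrun = TRUE`
+#' only the file listing is returned.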
+#' +#' @examples +#' \dontrun{ +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' d <- Wf_umccrise_download_tidy_write( +#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' outdir = outdir, +#' dryrun = F +#' ) +#' } +#' @export +Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor, + outdir, max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE) { + um <- Wf_umccrise$new( + path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + ) + d_dl <- um$download_files( + outdir = outdir, ica_token = ica_token, + max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- um$tidy_files(d_dl) + d_write <- um$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" + ) + return(d_write) + } + return(d_dl) +} diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 1ff1a33..5fd5ade 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -12,6 +12,7 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow #---- LOCAL ----# SubjectID <- "SBJ03043" SampleID_tumor <- "PRJ230004" +prefix <- glue("{SubjectID}__{SampleID_tumor}") p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) @@ -29,6 +30,7 @@ d_write <- um1$write( #---- GDS ----# SubjectID <- "SBJ03043" SampleID_tumor <- "PRJ230004" +prefix <- glue("{SubjectID}__{SampleID_tumor}") p1_gds <- "gds://production/analysis_data" p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) @@ -41,6 +43,12 @@ d <- um2$download_files( max_files = 1000, dryrun = F ) d_tidy <- um2$tidy_files(d) +d_write <- um2$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) } } @@ -97,7 +105,8 @@ Create a new Wf_umccrise object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Output directory path with results.} +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} \item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd new file mode 100644 index 0000000..5364d61 --- /dev/null +++ b/man/Wf_umccrise_download_tidy_write.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/umccrise.R +\name{Wf_umccrise_download_tidy_write} +\alias{Wf_umccrise_download_tidy_write} +\title{umccrise Download Tidy and Write} +\usage{ +Wf_umccrise_download_tidy_write( + path, + SubjectID, + SampleID_tumor, + outdir, + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE +) +} +\arguments{ +\item{path}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} + +\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} + +\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} + +\item{outdir}{Path to output directory.} + +\item{max_files}{Max number of files to list.} + +\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{dryrun}{If TRUE, just list the files that will be downloaded (don't +download them).} +} +\value{ +List where each element is a tidy tibble of a umccrise file. +} +\description{ +Downloads files from the \code{umccrise} workflow and writes them in a tidy format. +} +\examples{ +\dontrun{ +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +d <- Wf_umccrise_download_tidy_write( + path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, + outdir = outdir, + dryrun = F +) +} +} From ea2bb516c14dbca580face3524c8282a3b197e04 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 10 Sep 2024 21:35:44 +1000 Subject: [PATCH 22/24] umccrise: add dl_and_tidy.R script --- R/umccrise.R | 5 +- inst/rmd/umccr_workflows/umccrise/.gitignore | 1 + .../umccr_workflows/umccrise/dl_and_tidy.R | 79 +++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 inst/rmd/umccr_workflows/umccrise/.gitignore create mode 100755 inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R diff --git a/R/umccrise.R b/R/umccrise.R index 8f1deae..ffbe5c5 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -282,6 +282,7 @@ Wf_umccrise <- R6::R6Class( #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). +#' @param format Format of output files. #' @return List where each element is a tidy tibble of a umccrise file. 
#'
#' @examples
#' \dontrun{
#' SubjectID <- "SBJ03043"
#' SampleID_tumor <- "PRJ230004"
#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise")
#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063")
#' outdir <- file.path(sub("gds:/", "~/icav1/g", p))
#' token <- Sys.getenv("ICA_ACCESS_TOKEN")
#' d <- Wf_umccrise_download_tidy_write(
#'   path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor,
#'   outdir = outdir,
#'   dryrun = F
#' )
#' }
#' @export
 Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor,
-                                            outdir, max_files = 1000,
+                                            outdir, format = "rds", max_files = 1000,
                                             ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
                                             dryrun = FALSE) {
   um <- Wf_umccrise$new(
@@ -316,7 +317,7 @@ Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor,
     d_tidy,
     outdir = file.path(outdir, "dracarys_tidy"),
     prefix = glue("{SubjectID}__{SampleID_tumor}"),
-    format = "tsv"
+    format = format
   )
   return(d_write)
 }
diff --git a/inst/rmd/umccr_workflows/umccrise/.gitignore b/inst/rmd/umccr_workflows/umccrise/.gitignore
new file mode 100644
index 0000000..674563b
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/.gitignore
@@ -0,0 +1 @@
+nogit
diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
new file mode 100755
index 0000000..82f9974
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
@@ -0,0 +1,79 @@
+#!/usr/bin/env Rscript
+
+{
+  require(dplyr)
+  require(assertthat, include.only = "assert_that")
+  require(dracarys, include.only = "Wf_umccrise_download_tidy_write")
+  require(glue, include.only = "glue")
+  require(here, include.only = "here")
+  require(rportal, include.only = c("portaldb_query_workflow"))
+  require(tidyr, include.only = "separate_wider_delim")
+}
+
+query_workflow_umccrise <- function(start_date, end_date) {
+  q1 <- glue(
+    "WHERE \"type_name\" = 'umccrise' ",
+    "AND \"start\" >= date(\'{start_date}\') ",
+    "AND \"end\" <= date(\'{end_date}\') ",
+    "ORDER BY \"start\" DESC;"
+  )
+  rportal::portaldb_query_workflow(q1)
+}
+
+query_limsrow_libids <- function(libids) {
+  assertthat::assert_that(!is.null(libids), all(grepl("^L", libids)))
+  libids <- unique(libids) |>
+    paste(collapse = "|")
+  q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');")
+  rportal::portaldb_query_limsrow(q1)
+}
+
+# first read in the workflows table, extract metadata, then join with lims
+start_date <- "2024-08-29"
+end_date <- "2024-09-01"
+meta_raw <- query_workflow_umccrise(start_date, end_date)
+meta <- meta_raw |>
+  rportal::meta_umccrise()
+lims_raw <- query_limsrow_libids(meta$LibraryID_tumor)
+lims <- lims_raw |>
+  tidyr::separate_wider_delim(
+    library_id,
+    delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start"
+  ) |>
+  select(
+    subject_id, library_id, sample_id, sample_name,
+    external_subject_id, external_sample_id,
+    project_name, project_owner,
+    source, quality
+  ) |>
+  distinct()
+table(lims$library_id %in% meta$LibraryID_tumor) # double-check
+
+meta_lims <- meta |>
+  left_join(lims, by = c("LibraryID_tumor" = "library_id")) |>
+  mutate(rownum = row_number()) |>
+  select(
+    rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal,
+    SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id,
+    project_owner, project_name, source, quality
+  )
+meta_lims |>
+  saveRDS(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/meta/{start_date}_{end_date}.rds")))
+
+d <- meta_lims |>
+  rowwise() |>
+  mutate(
+    indir = .data$gds_outdir_umccrise,
+    outdir = file.path(sub("gds://", "", .data$indir)),
+    outdir = file.path(normalizePath("~/icav1/g"), .data$outdir),
+    res = list(
+      dracarys::Wf_umccrise_download_tidy_write(
+        path = .data$indir, SubjectID = .data$SubjectID, SampleID_tumor = .data$SampleID_tumor,
+        outdir = .data$outdir,
From 8baa52c4183419f6c2f680685f2e28224f2bcd33 Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Tue, 10 Sep 2024 23:43:29 +1000
Subject: [PATCH 23/24] add umccrise summary report

---
 inst/rmd/umccr_workflows/umccrise/.gitignore |   2 +
 inst/rmd/umccr_workflows/umccrise/render.sh  |   9 +
 .../umccrise/summary_umccrise.qmd            | 224 ++++++++++++++++++
 man/Wf_umccrise_download_tidy_write.Rd       |   3 +
 4 files changed, 238 insertions(+)
 create mode 100644 inst/rmd/umccr_workflows/umccrise/render.sh
 create mode 100644 inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd

diff --git a/inst/rmd/umccr_workflows/umccrise/.gitignore b/inst/rmd/umccr_workflows/umccrise/.gitignore
index 674563b..0182e8f 100644
--- a/inst/rmd/umccr_workflows/umccrise/.gitignore
+++ b/inst/rmd/umccr_workflows/umccrise/.gitignore
@@ -1 +1,3 @@
 nogit
+
+/.quarto/
diff --git a/inst/rmd/umccr_workflows/umccrise/render.sh b/inst/rmd/umccr_workflows/umccrise/render.sh
new file mode 100644
index 0000000..bfff2c7
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/render.sh
@@ -0,0 +1,9 @@
+date_start="2024-08-29"
+date_end="2024-09-01"
+out="umccrise_${date_start}_${date_end}.html"
+
+quarto render summary_umccrise.qmd \
+  -P date_start:${date_start} \
+  -P date_end:${date_end} \
+  -o ${out} \
+  --output-dir nogit/html
diff --git a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
new file mode 100644
index 0000000..50b7329
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
@@ -0,0 +1,224 @@
+---
+title: "{{< meta params.title >}}"
+subtitle: "Period: `r paste(params$date_start, 'to', params$date_end)`"
+author: "UMCCR - Genomics Platform Group"
+date: now
+date-format: "YYYY-MM-DD HH:mm Z"
+execute:
+  echo: false
+format:
+  html:
+    toc: true
+    toc-expand: 1
+    toc-title: Contents
+    toc-location: body
+    highlight-style: github
+    number-sections: false
+    link-external-icon: true
+    link-external-newwindow: true
+    embed-resources: true
+    code-copy: true
+    code-link: true
+    code-fold: true
+    code-block-border-left: true
+    smooth-scroll: true
+    grid:
+      body-width: 1300px
+params:
+  title: "UMCCR umccrise Workflow Summary"
+  date_start: "XXXX-XX-XX"
+  date_end: "XXXX-XX-XX"
+---
+
+```{r}
+#| label: pkg_load
+#| message: false
+{
+  require(dplyr) # import all dplyr funcs
+  require(readr, include.only = c("read_rds"))
+  require(purrr, include.only = c("map"))
+  require(tidyr, include.only = c("unnest_wider"))
+  require(dracarys, include.only = c("session_info_kable"))
+  require(glue, include.only = "glue")
+  require(here, include.only = "here")
+  require(knitr, include.only = "kable")
+  require(reactable, include.only = "reactable")
+  require(ggplot2, include.only = c("ggplot", "aes"))
+  require(lubridate, include.only = c("as_datetime"))
+  require(plotly, include.only = c("ggplotly"))
+}
+set.seed(42)
+```
+
+```{r}
+#| label: load_data
+date_start <- params$date_start
+date_end <- params$date_end
+d_raw <- readr::read_rds(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/results_{date_start}_{date_end}.rds")))
+```
+
+## Results
+
+```{r}
+#| label: process
+# one row per file type - not all samples have sigsdbstsv
+d <- d_raw |>
+  tidyr::unnest_longer(res, indices_to = "filetype")
+# main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor")
+main_cols <- c("portal_run_id")
+qcsum <- d |>
+  filter(filetype == "qcsummarytsv") |>
+  unnest_wider(res) |>
+  select(all_of(main_cols), qc_status_hmf:bpi_enabled)
+hrd_chord <- d |>
+  filter(filetype == "chordtsv") |>
+  unnest_wider(res) |>
+  select(all_of(main_cols),
+    chord_p_hrd = "p_hrd",
+    chord_hr_status = "hr_status",
+    chord_hrd_type = "hrd_type",
+    chord_p_BRCA1 = "p_BRCA1",
+    chord_p_BRCA2 = "p_BRCA2"
+  )
+hrd_hrdetect <- d |>
+  filter(filetype == "hrdetecttsv") |>
+  unnest_wider(res) |>
+  select(all_of(main_cols), hrdetect_prob = "Probability")
+sigs_snv2015 <- d |>
+  filter(filetype == "sigssnv2015tsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_snv2020 <- d |>
+  filter(filetype == "sigssnv2020tsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_dbs <- d |>
+  filter(filetype == "sigsdbstsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_indel <- d |>
+  filter(filetype == "sigsindeltsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+
+# signatures
+dsig <- bind_rows(
+  list(
+    snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel
+  ),
+  .id = "Sig_group"
+)
+
+# keep top two ranked sigs
+dsig_filt <- dsig |>
+  group_by(Sig_group, portal_run_id) |>
+  mutate(tot_sig_vars = sum(Contribution)) |>
+  arrange(Rank) |>
+  slice_head(n = 2) |>
+  # some sigs have same Rank so use explicit sig_rank
+  mutate(sig_rank = row_number()) |>
+  ungroup() |>
+  mutate(
+    sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})")
+  ) |>
+  select(Sig_group, portal_run_id, sig_rank, sig_summary) |>
+  tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |>
+  mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |>
+  select(portal_run_id, sig_top2)
+
+dall <- d_raw |>
+  select(
+    date_analysed = "start", portal_run_id,
+    SubjectID, LibraryID_tumor, SampleID_tumor,
+    external_subject_id, external_sample_id,
+    project_owner, project_name, source, quality
+  ) |>
+  left_join(qcsum, by = "portal_run_id") |>
+  left_join(hrd_chord, by = "portal_run_id") |>
+  left_join(hrd_hrdetect, by = "portal_run_id")
+```
+
+```{r}
+#| label: hrd_plot
+#| fig-width: 15
+#| fig-height: 15
+p1 <- dall |>
+  mutate(sbj = glue("{SubjectID}_{LibraryID_tumor}")) |>
+  select(date_analysed, sbj, chord = chord_p_hrd, hrdetect = hrdetect_prob) |>
+  tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") |>
+  ggplot2::ggplot(aes(x = date_analysed, y = probability, label = sbj)) +
+  ggplot2::geom_point(aes(colour = method)) +
+  ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) +
+  ggplot2::theme_bw() +
+  ggplot2::ggtitle("CHORD vs. HRDetect per SubjectID")
+
+plotly::ggplotly(p1)
+```
HRDetect per SubjectID") + +plotly::ggplotly(p1) +``` + +### Signature Results + +```{r} +#| label: sig_results +#| fig-width: 15 +#| fig-height: 65 + +sig_order2015 <- paste0("Sig", 1:30) +sig_order2020 <- paste0( + "SBS", + c( + 1:6, + paste0(7, c("a", "b", "c", "d")), + 8:9, + paste0(10, c("a", "b", "c", "d")), + 11:16, + paste0(17, c("a", "b")), + 18:60, + 84:94 + ) +) + +p2_prep <- dsig |> + filter( + Sig_group == "snv2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> + mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> + select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> + mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) +p2 <- p2_prep |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::theme_bw(base_size = 7) +# ggplot2::facet_wrap(~Sig_group, ncol = 1) + +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) +``` + +## Metadata Summary + +::: {.panel-tabset .nav-pills} + +### Project Name/Owner + +```{r} +#| label: project_owner_name +dall |> + count(project_name, project_owner) |> + knitr::kable() +``` + +### Source / Quality + +```{r} +#| label: source_quality +count(dall, source, quality) |> knitr::kable() +``` + +::: + +
diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd
index 5364d61..05ca0bf 100644
--- a/man/Wf_umccrise_download_tidy_write.Rd
+++ b/man/Wf_umccrise_download_tidy_write.Rd
@@ -9,6 +9,7 @@ Wf_umccrise_download_tidy_write(
   SubjectID,
   SampleID_tumor,
   outdir,
+  format = "rds",
   max_files = 1000,
   ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
   dryrun = FALSE
@@ -24,6 +25,8 @@ local filesystem).}
 
 \item{outdir}{Path to output directory.}
 
+\item{format}{Format of output files.}
+
 \item{max_files}{Max number of files to list.}
 
 \item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}

From e91187dfa15806e96e35d78d6094efd43a318a8d Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Wed, 11 Sep 2024 08:44:43 +1000
Subject: [PATCH 24/24] umccrise: add reactable table viewer

---
 .../umccr_workflows/umccrise/dl_and_tidy.R    |   4 +-
 .../umccrise/summary_umccrise.qmd             | 189 ++++++++++++++++--
 2 files changed, 169 insertions(+), 24 deletions(-)

diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
index 82f9974..3b5ede3 100755
--- a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
+++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
@@ -44,7 +44,7 @@ lims <- lims_raw |>
     subject_id, library_id, sample_id, sample_name,
     external_subject_id, external_sample_id,
     project_name, project_owner,
-    source, quality
+    source, quality, workflow
   ) |>
   distinct()
 table(lims$library_id %in% meta$LibraryID_tumor) # double-check
@@ -55,7 +55,7 @@ meta_lims <- meta |>
   select(
     rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal,
     SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id,
-    project_owner, project_name, source, quality
+    project_owner, project_name, source, quality, workflow
   )
 meta_lims |>
   saveRDS(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/meta/{start_date}_{end_date}.rds")))
diff --git a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
index 50b7329..0f9390e 100644
--- a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
+++ b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
@@ -57,6 +57,48 @@ date_end <- params$date_end
 d_raw <- readr::read_rds(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/results_{date_start}_{date_end}.rds")))
 ```
 
+```{r}
+#| label: funcs
+dt_view <- function(x, id, ...) {
+  htmltools::browsable(
+    htmltools::tagList(
+      htmltools::tags$button(
+        htmltools::tagList(fontawesome::fa("download"), "CSV"),
+        onclick = glue("Reactable.downloadDataCSV('{id}', '{id}.csv')")
+      ),
+      x |>
+        reactable::reactable(
+          bordered = TRUE,
+          filterable = TRUE,
+          fullWidth = TRUE,
+          height = 800,
+          highlight = TRUE,
+          pagination = FALSE,
+          resizable = TRUE,
+          searchable = TRUE,
+          sortable = TRUE,
+          striped = TRUE,
+          wrap = FALSE,
+          elementId = id,
+          ...
+ ) + ) + ) +} +``` + +## Metadata + +```{r} +#| label: metadata +meta <- d_raw |> + select( + rownum, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +dt_view(meta, id = "metadata") +``` + ## Results ```{r} @@ -66,10 +108,61 @@ d <- d_raw |> tidyr::unnest_longer(res, indices_to = "filetype") # main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor") main_cols <- c("portal_run_id") +``` + +```{r} +#| label: qcsum qcsum <- d |> filter(filetype == "qcsummarytsv") |> + select(all_of(main_cols), res) |> + unnest_wider(res) +``` + +```{r} +#| label: pcgr +pcgr <- d |> + filter(filetype == "pcgrjson") |> + select(all_of(main_cols), res) |> unnest_wider(res) |> - select(all_of(main_cols), qc_status_hmf:bpi_enabled) + rename( + msi_fraction_indels_pcgr = "fracIndels", + msi_pcgr = "predicted_class", + tmb_pcgr = "tmb_estimate", + n_tmb_pcgr = "n_tmb" + ) |> + mutate(msi_pcgr = sub(" \\(.*\\)", "", msi_pcgr)) +``` + +```{r} +#| label: conpair +sampleids <- d_raw |> + select(SampleID_tumor, SampleID_normal) |> + tidyr::pivot_longer(everything(), values_to = "sampleid") |> + mutate(phenotype = sub("SampleID_", "", .data$name)) |> + select(sampleid, phenotype) |> + distinct() +conpair_raw <- d |> + filter(filetype == "conpairmultiqc") |> + select(all_of(main_cols), res) |> + tidyr::unnest(res) |> + left_join(sampleids, by = "sampleid") |> + select(-sampleid) +conpair_tumor <- conpair_raw |> + filter(phenotype == "tumor") +conpair_normal <- conpair_raw |> + filter(phenotype == "normal") |> + select(portal_run_id, contamination) +conpair <- conpair_tumor |> + left_join(conpair_normal, by = "portal_run_id", suffix = c("_tumor", "_normal")) |> + select(portal_run_id, + contamination_tumor_conpair = "contamination_tumor", + contamination_normal_conpair = "contamination_normal", + concordance_conpair = "concordance" + ) +``` + +```{r} +#| label: hrd hrd_chord <- d |> filter(filetype == "chordtsv") |> unnest_wider(res) |> @@ -84,6 +177,10 @@ hrd_hrdetect <- d |> filter(filetype == "hrdetecttsv") |> unnest_wider(res) |> select(all_of(main_cols), hrdetect_prob = "Probability") +``` + +```{r} +#| label: sigs sigs_snv2015 <- d |> filter(filetype == "sigssnv2015tsv") |> select(all_of(main_cols), res) |> @@ -104,8 +201,6 @@ sigs_indel <- d |> select(all_of(main_cols), res) |> tidyr::unnest_wider(res) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) - -# signatures dsig <- bind_rows( list( snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel @@ -128,20 +223,37 @@ dsig_filt <- dsig |> select(Sig_group, portal_run_id, sig_rank, sig_summary) |> tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> - select(portal_run_id, sig_top2) + select(Sig_group, portal_run_id, sig_top2) |> + tidyr::pivot_wider(names_from = Sig_group, values_from = sig_top2) |> + select(portal_run_id, snv2015, snv2020, dbs, indel) +``` +```{r} +#| label: qc_all dall <- d_raw |> select( + rownum, date_analysed = "start", portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, - project_owner, project_name, source, quality + project_owner, project_name, source, quality, workflow ) |> left_join(qcsum, by = "portal_run_id") |> left_join(hrd_chord, by = "portal_run_id") |> - left_join(hrd_hrdetect, by = "portal_run_id") + 
left_join(hrd_hrdetect, by = "portal_run_id") |> + left_join(pcgr, by = "portal_run_id") |> + left_join(conpair, by = "portal_run_id") +``` + +### Summary Metrics + +```{r} +#| label: summary_metrics +dt_view(dall, "summary_metrics") ``` +### HRD Plot + ```{r} #| label: hrd_plot #| fig-width: 15 @@ -159,7 +271,34 @@ p1 <- dall |> plotly::ggplotly(p1) ``` -### Signature Results +### Signatures + +#### All (SNV, Indel, DBS) + +```{r} +#| label: sig_results_all +dsig |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + dt_view("sig_results_all") +``` + +#### Top 2 + +```{r} +#| label: sig_results_top2 +dsig_filt |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + arrange(rownum) |> + dt_view("sig_results_top2") +``` + +#### Top 3 SNV2015 ```{r} #| label: sig_results @@ -167,19 +306,19 @@ plotly::ggplotly(p1) #| fig-height: 65 sig_order2015 <- paste0("Sig", 1:30) -sig_order2020 <- paste0( - "SBS", - c( - 1:6, - paste0(7, c("a", "b", "c", "d")), - 8:9, - paste0(10, c("a", "b", "c", "d")), - 11:16, - paste0(17, c("a", "b")), - 18:60, - 84:94 - ) -) +# sig_order2020 <- paste0( +# "SBS", +# c( +# 1:6, +# paste0(7, c("a", "b", "c", "d")), +# 8:9, +# paste0(10, c("a", "b", "c", "d")), +# 11:16, +# paste0(17, c("a", "b")), +# 18:60, +# 84:94 +# ) +# ) p2_prep <- dsig |> filter( @@ -189,12 +328,11 @@ p2_prep <- dsig |> left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> - mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) + mutate(Signature = factor(Signature, levels = sig_order2015)) p2 <- p2_prep |> ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + ggplot2::geom_bar(position = "fill", stat = "identity") + ggplot2::theme_bw(base_size = 7) -# ggplot2::facet_wrap(~Sig_group, ncol = 1) plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) ``` @@ -219,6 +357,13 @@ dall |> count(dall, source, quality) |> knitr::kable() ``` +### Workflow + +```{r} +#| label: workflow_summary +count(dall, workflow) |> knitr::kable() +``` + :::
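
For interactive use outside the rendered report, the `dt_view()` helper from the funcs chunk above can be exercised on any tibble. A quick usage sketch with toy data (assumes `dt_view()` has been defined in the session and that the htmltools, fontawesome, glue, and reactable packages are installed):

```r
library(tibble)

# toy input; any data frame works
toy <- tibble(
  portal_run_id = c("pr1", "pr2"),
  SubjectID = c("SBJ00001", "SBJ00002"),
  qc_status_hmf = c("PASS", "FAIL")
)

# renders a searchable/filterable reactable with a CSV download button;
# the button targets the table via its elementId through
# Reactable.downloadDataCSV (shipped with the reactable package)
dt_view(toy, id = "toy_table")
```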