From 6a281fd55882d7ad852f2198233d126141cf1a2f Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Wed, 4 Sep 2024 10:21:05 +1000
Subject: [PATCH 01/24] umccrise accreditation 131 tidy meta

---
 R/regex.R                   |  10 +-
 inst/scripts/umccrise_run.R | 122 ++++++++++++++++--------------------
 2 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/R/regex.R b/R/regex.R
index 25a2f61..4ca485b 100644
--- a/R/regex.R
+++ b/R/regex.R
@@ -61,11 +61,11 @@ DR_FILE_REGEX <- tibble::tribble(
   "multiqc_data\\.json", "MultiqcFile",
   "somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile",
   "somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile",
-  "chord\\.tsv\\.gz$", "UmChordTsvFile",
-  "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
-  "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
-  "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
-  "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
+  # "chord\\.tsv\\.gz$", "UmChordTsvFile",
+  # "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
+  # "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
+  # "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
+  # "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
   "bcftools_stats\\.txt$", "BcftoolsStatsFile"
 )
 
diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R
index 6602d87..ea4d2f4 100644
--- a/inst/scripts/umccrise_run.R
+++ b/inst/scripts/umccrise_run.R
@@ -1,74 +1,62 @@
 require(dracarys)
-require(here)
-require(glue)
+require(rportal, include.only = "portaldb_query_workflow")
+require(here, include.only = "here")
+require(glue, include.only = "glue")
+require(readr, include.only = "read_rds")
 require(dplyr)
-require(readr)
-
-#---- GDS ----#
-# read last 1000 umccrise runs from portal
-# 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded
-date1 <- "2023-09-04"
-pmeta_raw_rds <- here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_raw.rds"))
-# pmeta_raw <- dracarys::portal_meta_read(rows = 1000, params = "&type_name=umccrise")
-# saveRDS(pmeta_raw, file = pmeta_raw_rds)
-pmeta <- readr::read_rds(pmeta_raw_rds) |>
-  dracarys::meta_umccrise(status = "Succeeded")
-lims_raw_rds <- here(glue("nogit/umccrise/rds/lims/{date1}_lims_raw.rds"))
-# lims_raw <- dracarys::glims_read()
+require(tidyr, include.only = "separate_wider_delim")
+
+start_date <- "2024-08-29"
+query_workflow_umccrise <- function(start_date) {
+  q1 <- glue(
+    "WHERE \"type_name\" = 'umccrise' AND \"start\" > date(\'{start_date}\') ",
+    "ORDER BY \"start\" DESC;"
+  )
+  rportal::portaldb_query_workflow(q1)
+}
+
+query_limsrow_libids <- function(libids) {
+  assertthat::assert_that(!is.null(libids), all(grepl("^L", libids)))
+  libids <- unique(libids) |>
+    paste(collapse = "|")
+  q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');")
+  rportal::portaldb_query_limsrow(q1)
+}
+
+# p_raw <- query_workflow_umccrise(start_date)
+p_raw_rds <- here(glue("nogit/data_portal/workflows/{start_date}.rds"))
+# saveRDS(p_raw, file = p_raw_rds)
+p_raw <- readr::read_rds(p_raw_rds)
+
+p <- p_raw |>
+  rportal::meta_umccrise(status = "Succeeded")
+# lims_raw <- query_limsrow_libids(p$LibraryID_tumor)
+lims_raw_rds <- here(glue("nogit/data_portal/lims/{start_date}.rds"))
 # saveRDS(lims_raw, file = lims_raw_rds)
+# L2100192 is L2100192_rerun in the lims, 15 libs are rerun/topup/topup2
 lims_raw <- readr::read_rds(lims_raw_rds)
 lims <- lims_raw |>
-  filter(Type == "WGS") |>
-  filter(LibraryID %in% c(pmeta$LibraryID_normal, pmeta$LibraryID_tumor))
-table(pmeta$LibraryID_tumor %in% lims$LibraryID)
-table(pmeta$LibraryID_normal %in% lims$LibraryID)
-
-# The final results sit under gds_outdir_umccrise/<SubjectID>__<SampleID_tumor>/
-# We need to 
get the SampleID_tumor for runs before 2023-04-07. We can do that -# by using the LibraryID_tumor to match up with the glims. -missing_tumor_sampleid <- pmeta |> - filter(end < "2023-04-07") |> - pull(LibraryID_tumor) - -table(missing_tumor_sampleid %in% lims$LibraryID) -libid2sampid <- lims |> - filter(LibraryID %in% missing_tumor_sampleid) |> - select(LibraryID_tumor = LibraryID, SampleID_tumor = SampleID) - -d <- pmeta |> - left_join(libid2sampid, by = "LibraryID_tumor") |> - mutate(SampleID_tumor = if_else(is.na(SampleID_tumor.x), SampleID_tumor.y, SampleID_tumor.x)) |> - select(-c(SampleID_tumor.x, SampleID_tumor.y)) |> - relocate(SampleID_tumor, .before = SampleID_normal) |> - mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) -d - -# final portal meta for umccrise runs -# columns: -# "id", "wfr_name", "wfr_id", "version", "end_status", "start", "end", "portal_run_id", -# "SubjectID", "LibraryID_tumor", "LibraryID_normal", "SampleID_tumor", "SampleID_normal", -# "gds_outdir_umccrise", "gds_indir_dragen_somatic", "gds_indir_dragen_germline", "gds_infile_genomes_tar" -saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) - -#---- S3 ----# -pat <- "qc_summary.tsv.gz" -rows <- 1000 -d_s3_raw <- dracarys::s3_search(pat = pat, rows = rows) - -d_s3 <- d_s3_raw |> - arrange(desc(date_aest)) |> - mutate( - bname = basename(path), - dir1 = dirname(path), # path/to/dirA/cancer_report_tables - dir2 = basename(dirname(dir1)), # dirA - sbj_samp_lib = sub(".*__(.*)", "\\1", dir2), - SubjectID = sub("(SBJ[0-9]{5})_.*", "\\1", sbj_samp_lib), - SampleID_tumor = sub("SBJ.*?_(.*?)_.*", "\\1", sbj_samp_lib), - LibraryID_tumor = sub("SBJ.*?_.*?_(.*)", "\\1", sbj_samp_lib), - rerun = grepl("rerun", .data$LibraryID_tumor) + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" ) |> - select(dir1, SubjectID, LibraryID_tumor, SampleID_tumor, date = date_aest, rerun) + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, external_sample_id, + project_name, project_owner, + source, quality + ) |> + distinct() +table(lims$library_id %in% p$LibraryID_tumor) # double-check + +d <- p |> + left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> + mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) |> + select( + wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, + SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id, + project_owner, project_name, source, quality + ) +d -date2 <- "2023-09-12" -saveRDS(d_s3, file = here(glue("nogit/umccrise/rds/portal_meta/{date2}_pmeta_s3.rds"))) -# now we have S3 paths and metadata, so all we need is to generate presigned URLs and read the data +saveRDS(d, file = here(glue("nogit/data_portal/workflows/umccrise_tidy_{start_date}.rds"))) From 318c6fcf722e88b0335b78034fd087979ba46347 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 5 Sep 2024 01:09:32 +1000 Subject: [PATCH 02/24] add write_dracarys_list_of_tbls --- NAMESPACE | 2 ++ R/utils.R | 28 +++++++++++++++++++++++++++ man/write_dracarys_list_of_tbls.Rd | 31 ++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 man/write_dracarys_list_of_tbls.Rd diff --git a/NAMESPACE b/NAMESPACE index 4eb4d8d..9d913ec 100644 --- a/NAMESPACE +++ 
b/NAMESPACE @@ -32,6 +32,7 @@ export(UmccriseCanRepTables) export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) +export(Wf_umccrise) export(WgsContigMeanCovFile) export(WgsCoverageMetricsFile) export(WgsFineHistFile) @@ -71,6 +72,7 @@ export(s3_search) export(session_info_kable) export(time_metrics_process) export(umccr_tidy) +export(write_dracarys_list_of_tbls) importFrom(R6,R6Class) importFrom(ggplot2,ggplot) importFrom(ggrepel,geom_text_repel) diff --git a/R/utils.R b/R/utils.R index f61fec9..45cbb54 100644 --- a/R/utils.R +++ b/R/utils.R @@ -117,6 +117,34 @@ write_dracarys <- function(obj, prefix, out_format, drid = NULL) { return(invisible(obj)) } +#' Write List of Tidy Tibbles +#' +#' @param list_of_tbls List of tidy tibbles. +#' @param out_dir Output directory. +#' @param prefix Prefix of output file(s). +#' @param out_format Format of output file(s). +#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). +#' +#' @return Tibble with nested objects that have been written to the output directory. +#' @export +write_dracarys_list_of_tbls <- function(list_of_tbls, out_dir = NULL, prefix = NULL, out_format = "tsv", drid = NULL) { + assertthat::assert_that(!is.null(prefix)) + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + d_write <- list_of_tbls |> + tibble::enframe(name = "section") |> + dplyr::rowwise() |> + dplyr::mutate( + section_low = tolower(.data$section), + p = glue("{prefix}_{.data$section_low}"), + out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) + ) |> + dplyr::ungroup() |> + dplyr::select("section", "value") |> + tibble::deframe() + invisible(d_write) +} #' Create Empty Tibble Given Column Names #' diff --git a/man/write_dracarys_list_of_tbls.Rd b/man/write_dracarys_list_of_tbls.Rd new file mode 100644 index 0000000..e6aa8be --- /dev/null +++ b/man/write_dracarys_list_of_tbls.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{write_dracarys_list_of_tbls} +\alias{write_dracarys_list_of_tbls} +\title{Write List of Tidy Tibbles} +\usage{ +write_dracarys_list_of_tbls( + list_of_tbls, + out_dir = NULL, + prefix = NULL, + out_format = "tsv", + drid = NULL +) +} +\arguments{ +\item{list_of_tbls}{List of tidy tibbles.} + +\item{out_dir}{Output directory.} + +\item{prefix}{Prefix of output file(s).} + +\item{out_format}{Format of output file(s).} + +\item{drid}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} +} +\value{ +Tibble with nested objects that have been written to the output directory. 
+} +\description{ +Write List of Tidy Tibbles +} From 94ad5ab11209552ca6385cdb8325cc2998684ca2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 5 Sep 2024 01:33:24 +1000 Subject: [PATCH 03/24] add Wf_umccrise R6 class --- R/umccrise.R | 142 +++++++++++++++++++++++++++++------- man/BclconvertReports375.Rd | 4 +- man/UmccriseCanRepTables.Rd | 32 -------- man/Wf_umccrise.Rd | 117 +++++++++++++++++++++++++++++ 4 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 man/Wf_umccrise.Rd diff --git a/R/umccrise.R b/R/umccrise.R index e358d2d..a66ccad 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -1,3 +1,119 @@ +#' Wf_umccrise R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `umccrise` workflow +#' +#' @examples +#' \dontrun{ +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() +#' SubjectID <- "SBJ01155" +#' SampleID_tumor <- "PRJ211091" +#' gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' gdsdir <- file.path(gdsdir1, "L2101566__L2101565") +#' obj <- Wf_umccrise$new(gdsdir) +#' gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor) +#' outdir <- file.path(sub("gds://", "", gdsdir)) +#' outdir <- file.path(normalizePath("~/icav1/g"), outdir) +#' out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) +#' +#' +#' +#' +#' +#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' p2 <- "L2101566__L2101565" +#' p <- file.path(p1, p2) +#' obj <- Wf_umccrise$new(p) +#' obj$path +#' obj$contents +#' d <- obj$read() +#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +#' } +#' +#' @export +Wf_umccrise <- R6::R6Class( + "Wf_umccrise", + public = list( + #' @field path Path to the `umccrise` directory. + #' @field contents Tibble with file path, basename, and size. + path = NULL, + contents = NULL, + #' @description Create a new Wf_umccrise object. + #' @param path Path to the `umccrise` directory. + initialize = function(path = NULL) { + stopifnot(is.character(path), length(path) == 1) + # self$path <- normalizePath(path) + self$path <- path + # self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> + # dplyr::mutate( + # bname = basename(.data$path), + # size = as.character(trimws(.data$size)) + # ) |> + # dplyr::select("path", "bname", "size") + }, + #' @description List Relevant Files In umccrise GDS Directory + #' @param gdsdir Path to the `umccrise` directory. + #' @param SubjectID The SubjectID of the sample (used to construct path). + #' @param SampleID_tumor The SampleID of the tumor sample (used to construct path). + #' @param token ICA access token. 
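    # Hedged editorial sketch (not part of this patch; file name invented):
    # a `regexes` tibble maps file-name patterns to parser types via
    # match_regex(), which returns NA when nothing matches:
    #   reg <- tibble::tribble(
    #     ~regex, ~fun,
    #     "-chord\\.tsv\\.gz$", "UmChordTsvFile"
    #   )
    #   match_regex("PRJ211091-chord.tsv.gz", reg) # "UmChordTsvFile"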
+ gds_list = function(gdsdir, SubjectID, SampleID_tumor, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + reg1 <- tibble::tribble( + ~regex, ~fun, + # "-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile", + "-chord\\.tsv\\.gz$", "UmChordTsvFile", + "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", + "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", + "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", + "-indel\\.tsv\\.gz$", "UmSigsIndelFile", + "-qc_summary\\.tsv\\.gz$", "UmQcSumFile" + ) + reg2 <- tibble::tribble( + ~regex, ~fun, + "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" + ) + + dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) + dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) + dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json + f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) + f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) + gds_files <- dplyr::bind_rows(f1, f2) + return(gds_files) + }, + + #' @description GDS File Download via API + #' + #' @param gds_files Tibble with bname and file_id for umccrise files. + #' @param outdir Directory to output files (loosely, not in a structured manner). + #' @param token ICA access token. + gds_download = function(gds_files, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + assertthat::assert_that(all(c("bname", "file_id") %in% colnames(gds_files))) + gds_files |> + dplyr::rowwise() |> + dplyr::mutate( + out = file.path(outdir, .data$bname), + out_dl = gds_file_download_api(.data$file_id, .data$out, token) + ) + }, + #' @description + tidy = function(indir, out_files) { + obj_canrep <- UmccriseCanRepTables$new(indir) + canrep_parse <- obj_canrep$read() + pcgr_json <- out_files |> + dplyr::filter(.data$type == "PcgrJsonFile") |> + dplyr::pull("out") + pcgr_json_parse <- PcgrJsonFile$new(pcgr_json)$read() + d <- canrep_parse + d[["pcgr_json"]] <- pcgr_json_parse[["metrics"]] + d + } + ) +) + + + + #' UmccriseCanRepTables R6 Class #' #' @description @@ -163,32 +279,6 @@ UmccriseCanRepTables <- R6::R6Class( sigsindel = self$grep_file("-indel\\.tsv\\.gz$") |> self$read_sigs(), qcsum = self$grep_file("-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() ) - }, - - #' @description - #' Writes tidied contents of `cancer_report_tables` directory output by umccrise. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). 
- write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - d_write <- d |> - tibble::enframe(name = "section") |> - dplyr::rowwise() |> - dplyr::mutate( - section_low = tolower(.data$section), - p = glue("{prefix}_{.data$section_low}"), - out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) - ) |> - dplyr::ungroup() |> - dplyr::select("section", "value") |> - tibble::deframe() - invisible(d_write) } ) ) diff --git a/man/BclconvertReports375.Rd b/man/BclconvertReports375.Rd index 01aba8b..e644a84 100644 --- a/man/BclconvertReports375.Rd +++ b/man/BclconvertReports375.Rd @@ -10,8 +10,8 @@ https://support-docs.illumina.com/SW/dragen_v42/Content/SW/DRAGEN/OutputFiles.ht } \examples{ \dontrun{ -p1 <- "240816_A01052_0220_AHM7VHDSXC/202408195d4f2fc4/Reports" -b <- here::here("nogit/bcl_convert", p1) |> +p1 <- "nogit/bcl_convert/WGS_TsqNano/Reports" +b <- here::here(p1) |> BclconvertReports375$new() b$path b$contents diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd index 67e20f3..9c29fd2 100644 --- a/man/UmccriseCanRepTables.Rd +++ b/man/UmccriseCanRepTables.Rd @@ -40,7 +40,6 @@ obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") \item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} \item \href{#method-UmccriseCanRepTables-read_qcsummarytsv}{\code{UmccriseCanRepTables$read_qcsummarytsv()}} \item \href{#method-UmccriseCanRepTables-read}{\code{UmccriseCanRepTables$read()}} -\item \href{#method-UmccriseCanRepTables-write}{\code{UmccriseCanRepTables$write()}} \item \href{#method-UmccriseCanRepTables-clone}{\code{UmccriseCanRepTables$clone()}} } } @@ -181,37 +180,6 @@ A list of tibbles. } } \if{html}{\out{
<hr>}}
-\if{html}{\out{<a id="method-UmccriseCanRepTables-write"></a>}}
-\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-write}{}}}
-\subsection{Method \code{write()}}{
-Writes tidied contents of \code{cancer_report_tables} directory output by umccrise.
-\subsection{Usage}{
-\if{html}{\out{
<div class="r">}}\preformatted{UmccriseCanRepTables$write(
-  d,
-  out_dir = NULL,
-  prefix,
-  out_format = "tsv",
-  drid = NULL
-)}\if{html}{\out{
</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{
<div class="arguments">}}
-\describe{
-\item{\code{d}}{Parsed object from \code{self$read()}.}
-
-\item{\code{out_dir}}{Output directory.}
-
-\item{\code{prefix}}{Prefix of output file(s).}
-
-\item{\code{out_format}}{Format of output file(s).}
-
-\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).}
-}
-\if{html}{\out{
</div>}}
-}
-}
-\if{html}{\out{
<hr>}}
 \if{html}{\out{<a id="method-UmccriseCanRepTables-clone"></a>}}
 \if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-clone}{}}}
 \subsection{Method \code{clone()}}{
diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd
new file mode 100644
index 0000000..600cec5
--- /dev/null
+++ b/man/Wf_umccrise.Rd
@@ -0,0 +1,117 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/umccrise.R
+\name{Wf_umccrise}
+\alias{Wf_umccrise}
+\title{Wf_umccrise R6 Class}
+\description{
+Reads and writes tidy versions of files from the \code{umccrise} workflow
+}
+\examples{
+\dontrun{
+token <- Sys.getenv("ICA_ACCESS_TOKEN")
+SubjectID <- "SBJ01155"
+SampleID_tumor <- "PRJ211091"
+gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043"
+gdsdir <- file.path(gdsdir1, "L2101566__L2101565")
+obj <- Wf_umccrise$new(gdsdir)
+gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor)
+outdir <- file.path(sub("gds://", "", gdsdir))
+outdir <- file.path(normalizePath("~/icav1/g"), outdir)
+out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token)
+
+
+
+
+
+p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
+p2 <- "L2101566__L2101565"
+p <- file.path(p1, p2)
+obj <- Wf_umccrise$new(p)
+obj$path
+obj$contents
+d <- obj$read()
+obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv")
+}
+
+}
+\section{Public fields}{
+\if{html}{\out{
<div class="r6-fields">}}
+\describe{
+\item{\code{path}}{Path to the \code{umccrise} directory.}
+
+\item{\code{contents}}{Tibble with file path, basename, and size.}
+}
+\if{html}{\out{
</div>}}
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}}
+\item \href{#method-Wf_umccrise-gds_list}{\code{Wf_umccrise$gds_list()}}
+\item \href{#method-Wf_umccrise-gds_download}{\code{Wf_umccrise$gds_download()}}
+\item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}}
+\item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}}
+}
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-new"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-new}{}}}
+\subsection{Method \code{new()}}{
+Create a new Wf_umccrise object.
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$new(path = NULL)}\if{html}{\out{
</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{path}}{Path to the \code{umccrise} directory.}
+}
+\if{html}{\out{
</div>}}
+}
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-gds_list"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_list}{}}}
+\subsection{Method \code{gds_list()}}{
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list(gdsdir, token, SubjectID, SampleID_tumor)}\if{html}{\out{
</div>}}
+}
+
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-gds_download"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_download}{}}}
+\subsection{Method \code{gds_download()}}{
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download(gds_files, outdir, token)}\if{html}{\out{
</div>}}
+}
+
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-tidy"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-tidy}{}}}
+\subsection{Method \code{tidy()}}{
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$tidy(indir, out_files)}\if{html}{\out{
</div>}}
+}
+
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$clone(deep = FALSE)}\if{html}{\out{
</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{
}} +} +} +} From ac6eb68733bddae7ded204eb687d175469fc76a4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 5 Sep 2024 14:46:40 +1000 Subject: [PATCH 04/24] parse conpair results; add grep_file --- NAMESPACE | 1 + R/umccrise.R | 117 ++++++++++++++++++++---------------- R/utils.R | 18 ++++++ man/UmccriseCanRepTables.Rd | 20 +----- man/Wf_umccrise.Rd | 88 +++++++++++++++++++++------ man/grep_file.Rd | 19 ++++++ 6 files changed, 174 insertions(+), 89 deletions(-) create mode 100644 man/grep_file.Rd diff --git a/NAMESPACE b/NAMESPACE index 9d913ec..80cda13 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ export(gds_files_list) export(gds_files_list_fastq) export(gds_files_list_filter_relevant) export(gds_volumes_list) +export(grep_file) export(ica_token_validate) export(match_regex) export(multiqc_column_map_append) diff --git a/R/umccrise.R b/R/umccrise.R index a66ccad..8903583 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -11,23 +11,13 @@ #' gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" #' gdsdir <- file.path(gdsdir1, "L2101566__L2101565") #' obj <- Wf_umccrise$new(gdsdir) -#' gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor) +#' gds_files <- obj$gds_list( +#' gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor +#' ) #' outdir <- file.path(sub("gds://", "", gdsdir)) #' outdir <- file.path(normalizePath("~/icav1/g"), outdir) #' out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) -#' -#' -#' -#' -#' -#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565" -#' p <- file.path(p1, p2) -#' obj <- Wf_umccrise$new(p) -#' obj$path -#' obj$contents -#' d <- obj$read() -#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +#' tidy1 <- obj$tidy(indir = outdir, out_files = out_files) #' } #' #' @export @@ -42,14 +32,7 @@ Wf_umccrise <- R6::R6Class( #' @param path Path to the `umccrise` directory. initialize = function(path = NULL) { stopifnot(is.character(path), length(path) == 1) - # self$path <- normalizePath(path) self$path <- path - # self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> - # dplyr::mutate( - # bname = basename(.data$path), - # size = as.character(trimws(.data$size)) - # ) |> - # dplyr::select("path", "bname", "size") }, #' @description List Relevant Files In umccrise GDS Directory #' @param gdsdir Path to the `umccrise` directory. @@ -66,13 +49,13 @@ Wf_umccrise <- R6::R6Class( "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", "-indel\\.tsv\\.gz$", "UmSigsIndelFile", - "-qc_summary\\.tsv\\.gz$", "UmQcSumFile" + "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", + "multiqc_conpair.txt", "UmConpairMultiqc" ) reg2 <- tibble::tribble( ~regex, ~fun, "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" ) - dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json @@ -96,24 +79,66 @@ Wf_umccrise <- R6::R6Class( out_dl = gds_file_download_api(.data$file_id, .data$out, token) ) }, - #' @description + + #' @description Tidy up the output files from umccrise + #' + #' @param indir Path to the `umccrise` directory. + #' @param out_files Tibble with file path, basename, and size. 
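  # Hedged editorial sketch (values invented): `out_files` is the tibble
  # returned by gds_download() above, roughly
  #   tibble::tribble(
  #     ~type,          ~bname,                   ~out,
  #     "PcgrJsonFile", "x-somatic.pcgr.json.gz", "/tmp/x-somatic.pcgr.json.gz"
  #   )
  # tidy() filters on `type` and pulls the downloaded `out` paths.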
tidy = function(indir, out_files) { obj_canrep <- UmccriseCanRepTables$new(indir) canrep_parse <- obj_canrep$read() pcgr_json <- out_files |> dplyr::filter(.data$type == "PcgrJsonFile") |> - dplyr::pull("out") - pcgr_json_parse <- PcgrJsonFile$new(pcgr_json)$read() + dplyr::pull("out") |> + PcgrJsonFile$new() |> + read() + conpair_tsv <- out_files |> + dplyr::filter(.data$type == "UmConpairMultiqc") |> + dplyr::pull("out") |> + self$read_conpairmultiqc() d <- canrep_parse - d[["pcgr_json"]] <- pcgr_json_parse[["metrics"]] + d[["pcgr_json"]] <- pcgr_json[["metrics"]] + d[["conpair"]] <- conpair_tsv d + }, + + #' @description Read multiqc_conpair.txt file. + #' + #' @param x (`character(1)`)\cr + #' Path to multiqc_conpair.txt file. + read_conpairmultiqc = function(x) { + um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") + um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) + cnames <- list( + old = c( + "Sample", "concordance_concordance", "concordance_used_markers", + "concordance_total_markers", "concordance_marker_threshold", + "concordance_min_mapping_quality", "concordance_min_base_quality", + "contamination" + ), + new = c( + "sampleid", "contamination", "concordance", "markers_used", + "markers_total", "marker_threshold", + "mapq_min", "baseq_min" + ) + ) + ctypes <- list( + old = c("cddddddd"), + new = c("cddddddd") + ) + if (!file.exists(x)) { + return(empty_tbl(cnames$new, ctypes$new)) + } + d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) + assertthat::assert_that(all(colnames(d1) == cnames$old)) + d1 |> + dplyr::filter(!.data$Sample %in% um_ref_samples) |> + dplyr::relocate("contamination", .after = "Sample") |> + rlang::set_names(cnames$new) } ) ) - - - #' UmccriseCanRepTables R6 Class #' #' @description @@ -123,7 +148,7 @@ Wf_umccrise <- R6::R6Class( #' @examples #' \dontrun{ #' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" +#' p2 <- "L2101566__L2101565" #' p <- file.path(p1, p2) #' obj <- UmccriseCanRepTables$new(p) #' obj$path @@ -171,21 +196,6 @@ UmccriseCanRepTables <- R6::R6Class( cat(bnames, sep = "\n") invisible(self) }, - #' @description Returns file with given pattern from the cancer_report_tables directory. - #' @param pat File pattern to look for. - grep_file = function(pat) { - x <- self$contents |> - dplyr::filter(grepl(pat, .data$path)) |> - dplyr::pull(.data$path) - if (length(x) > 1) { - fnames <- paste(x, collapse = ", ") - cli::cli_abort("More than 1 match found for {pat} ({fnames}). Aborting.") - } - if (length(x) == 0) { - return("") # file.exists("") returns FALSE - } - return(x) - }, #' @description Read `chord.tsv.gz` file output from umccrise. #' @@ -269,15 +279,16 @@ UmccriseCanRepTables <- R6::R6Class( #' @return A list of tibbles. 
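    #' Hedged editorial sketch (not part of this patch): the named list
    #' returned here pairs with write_dracarys_list_of_tbls() from PATCH 02:
    #'   d <- UmccriseCanRepTables$new(p)$read()
    #'   write_dracarys_list_of_tbls(d, out_dir = tempdir(), prefix = "sampleA")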
#' @export read = function() { + x <- self$path # now return all as list elements list( - chord = self$grep_file("-chord\\.tsv\\.gz$") |> self$read_chordtsv(), - hrdetect = self$grep_file("-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), - sigs2015 = self$grep_file("-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), - sigs2020 = self$grep_file("-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), - sigsdbs = self$grep_file("-dbs\\.tsv\\.gz$") |> self$read_sigs(), - sigsindel = self$grep_file("-indel\\.tsv\\.gz$") |> self$read_sigs(), - qcsum = self$grep_file("-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() + chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), + hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), + sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), + sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), + sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), + sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), + qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() ) } ) diff --git a/R/utils.R b/R/utils.R index 45cbb54..912b3c6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -193,6 +193,24 @@ read_jsongz_rjsonio <- function(x, ...) { RJSONIO::fromJSON(x, ...) } +#' Grep File Pattern +#' +#' @param path Path to look for file. +#' @param regexp A regular expression (e.g. [.]csv$) passed on to `grep()` to filter paths. +#' +#' @return The path to the file or an empty string if no match is found. +#' @export +grep_file <- function(path = ".", regexp) { + x <- fs::dir_ls(path, recurse = TRUE, type = "file", regexp = regexp) + if (length(x) > 1) { + fnames <- paste(x, collapse = ", ") + cli::cli_abort("More than 1 match found for {regexp} ({fnames}). Aborting.") + } + if (length(x) == 0) { + return("") # file.exists("") returns FALSE + } + return(x) +} #' @noRd diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd index 9c29fd2..5c43358 100644 --- a/man/UmccriseCanRepTables.Rd +++ b/man/UmccriseCanRepTables.Rd @@ -10,7 +10,7 @@ output from the \code{umccrise} workflow. \examples{ \dontrun{ p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" +p2 <- "L2101566__L2101565" p <- file.path(p1, p2) obj <- UmccriseCanRepTables$new(p) obj$path @@ -34,7 +34,6 @@ obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") \itemize{ \item \href{#method-UmccriseCanRepTables-new}{\code{UmccriseCanRepTables$new()}} \item \href{#method-UmccriseCanRepTables-print}{\code{UmccriseCanRepTables$print()}} -\item \href{#method-UmccriseCanRepTables-grep_file}{\code{UmccriseCanRepTables$grep_file()}} \item \href{#method-UmccriseCanRepTables-read_chordtsv}{\code{UmccriseCanRepTables$read_chordtsv()}} \item \href{#method-UmccriseCanRepTables-read_hrdetecttsv}{\code{UmccriseCanRepTables$read_hrdetecttsv()}} \item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} @@ -78,23 +77,6 @@ Print details about the cancer_report_tables directory. } } \if{html}{\out{
<hr>}}
-\if{html}{\out{<a id="method-UmccriseCanRepTables-grep_file"></a>}}
-\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-grep_file}{}}}
-\subsection{Method \code{grep_file()}}{
-Returns file with given pattern from the cancer_report_tables directory.
-\subsection{Usage}{
-\if{html}{\out{
<div class="r">}}\preformatted{UmccriseCanRepTables$grep_file(pat)}\if{html}{\out{
</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{
<div class="arguments">}}
-\describe{
-\item{\code{pat}}{File pattern to look for.}
-}
-\if{html}{\out{
</div>}}
-}
-}
-\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_chordtsv}{}}} \subsection{Method \code{read_chordtsv()}}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 600cec5..9d6abfb 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -8,29 +8,19 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow } \examples{ \dontrun{ -token <- Sys.getenv("ICA_ACCESS_TOKEN") +token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() SubjectID <- "SBJ01155" SampleID_tumor <- "PRJ211091" gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" gdsdir <- file.path(gdsdir1, "L2101566__L2101565") obj <- Wf_umccrise$new(gdsdir) -gds_files <- obj$gds_list(gdsdir, token, SubjectID, SampleID_tumor) +gds_files <- obj$gds_list( + gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor +) outdir <- file.path(sub("gds://", "", gdsdir)) outdir <- file.path(normalizePath("~/icav1/g"), outdir) out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) - - - - - -p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565" -p <- file.path(p1, p2) -obj <- Wf_umccrise$new(p) -obj$path -obj$contents -d <- obj$read() -obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +tidy1 <- obj$tidy(indir = outdir, out_files = out_files) } } @@ -50,6 +40,7 @@ obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") \item \href{#method-Wf_umccrise-gds_list}{\code{Wf_umccrise$gds_list()}} \item \href{#method-Wf_umccrise-gds_download}{\code{Wf_umccrise$gds_download()}} \item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}} +\item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } } @@ -74,28 +65,91 @@ Create a new Wf_umccrise object. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_list}{}}} \subsection{Method \code{gds_list()}}{ +List Relevant Files In umccrise GDS Directory \subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list(gdsdir, token, SubjectID, SampleID_tumor)}\if{html}{\out{
</div>}}
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list(
+  gdsdir,
+  SubjectID,
+  SampleID_tumor,
+  token = Sys.getenv("ICA_ACCESS_TOKEN")
+)}\if{html}{\out{
</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{gdsdir}}{Path to the \code{umccrise} directory.}
+
+\item{\code{SubjectID}}{The SubjectID of the sample (used to construct path).}
+
+\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (used to construct path).}
+
+\item{\code{token}}{ICA access token.}
+}
+\if{html}{\out{
</div>}}
+}
 }
 \if{html}{\out{
<hr>}}
 \if{html}{\out{<a id="method-Wf_umccrise-gds_download"></a>}}
 \if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_download}{}}}
 \subsection{Method \code{gds_download()}}{
+GDS File Download via API
 \subsection{Usage}{
-\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download(gds_files, outdir, token)}\if{html}{\out{
</div>}}
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download(
+  gds_files,
+  outdir,
+  token = Sys.getenv("ICA_ACCESS_TOKEN")
+)}\if{html}{\out{
</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{gds_files}}{Tibble with bname and file_id for umccrise files.}
+
+\item{\code{outdir}}{Directory to output files (loosely, not in a structured manner).}
+
+\item{\code{token}}{ICA access token.}
+}
+\if{html}{\out{
</div>}}
+}
 }
 \if{html}{\out{
<hr>}}
 \if{html}{\out{<a id="method-Wf_umccrise-tidy"></a>}}
 \if{latex}{\out{\hypertarget{method-Wf_umccrise-tidy}{}}}
 \subsection{Method \code{tidy()}}{
+Tidy up the output files from umccrise
 \subsection{Usage}{
 \if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$tidy(indir, out_files)}\if{html}{\out{
</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{indir}}{Path to the \code{umccrise} directory.}
+
+\item{\code{out_files}}{Tibble with file path, basename, and size.}
+}
+\if{html}{\out{
</div>}}
+}
+}
+\if{html}{\out{
<hr>}}
+\if{html}{\out{<a id="method-Wf_umccrise-read_conpairmultiqc"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpairmultiqc}{}}}
+\subsection{Method \code{read_conpairmultiqc()}}{
+Read multiqc_conpair.txt file.
+\subsection{Usage}{
+\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$read_conpairmultiqc(x)}\if{html}{\out{
</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{
<div class="arguments">}}
+\describe{
+\item{\code{x}}{(\code{character(1)})\cr
+Path to multiqc_conpair.txt file.}
+}
+\if{html}{\out{
</div>}}
+}
+}
 \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/grep_file.Rd b/man/grep_file.Rd new file mode 100644 index 0000000..d0c588e --- /dev/null +++ b/man/grep_file.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{grep_file} +\alias{grep_file} +\title{Grep File Pattern} +\usage{ +grep_file(path = ".", regexp) +} +\arguments{ +\item{path}{Path to look for file.} + +\item{regexp}{A regular expression (e.g. \link{.}csv$) passed on to \code{grep()} to filter paths.} +} +\value{ +The path to the file or an empty string if no match is found. +} +\description{ +Grep File Pattern +} From 4c5b960582ff09d5cd63084b64d6c72a8075076d Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 6 Sep 2024 17:44:50 +1000 Subject: [PATCH 05/24] add local_files_list_filter_relevant --- NAMESPACE | 1 + R/ica.R | 14 ++++++++++--- R/s3.R | 11 ++++++----- R/utils.R | 26 +++++++++++++++++++++++++ man/gds_files_list_filter_relevant.Rd | 11 ++++++++--- man/local_files_list_filter_relevant.Rd | 25 ++++++++++++++++++++++++ man/s3_files_list_filter_relevant.Rd | 9 ++++++--- 7 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 man/local_files_list_filter_relevant.Rd diff --git a/NAMESPACE b/NAMESPACE index 80cda13..5e7ba34 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,6 +55,7 @@ export(gds_files_list_filter_relevant) export(gds_volumes_list) export(grep_file) export(ica_token_validate) +export(local_files_list_filter_relevant) export(match_regex) export(multiqc_column_map_append) export(multiqc_date_fmt) diff --git a/R/ica.R b/R/ica.R index 0838ef0..3e3587b 100644 --- a/R/ica.R +++ b/R/ica.R @@ -7,14 +7,22 @@ #' @param pattern Pattern to further filter the returned file type tibble. #' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). #' @param page_size Page size (def: 100). -#' @param regexes Tibble with regex and function name. +#' @param regexes Tibble with `regex` and `fun`ction name. #' #' @return A tibble with type, bname, size, file_id, path, and presigned URL. +#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' gds_files_list_filter_relevant(gdsdir) +#' } #' @export -gds_files_list_filter_relevant <- function(gdsdir, token, pattern = NULL, include_url = FALSE, page_size = 100, regexes = DR_FILE_REGEX) { +gds_files_list_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, include_url = FALSE, + page_size = 100, regexes = DR_FILE_REGEX, ...) { pattern <- pattern %||% ".*" # keep all recognisable files by default cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") - d <- dracarys::gds_files_list(gdsdir, token, include_url = include_url, page_size = page_size) |> + d <- dracarys::gds_files_list( + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... + ) |> dplyr::rowwise() |> dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> dplyr::ungroup() |> diff --git a/R/s3.R b/R/s3.R index 6b6daaa..a941cfe 100644 --- a/R/s3.R +++ b/R/s3.R @@ -8,15 +8,16 @@ #' @param max_items The total number of items to return in the command’s output (def: 1000). #' @param presign Include presigned URLs (def: FALSE). #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). +#' @param regexes Tibble with `regex` and `fun`ction name. 
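#' Hedged editorial sketch (bucket/prefix invented): restrict the listing to
#' one file type by passing a one-row `regexes` tibble:
#'   s3_files_list_filter_relevant(
#'     s3dir = "s3://bucket1/project/run1",
#'     regexes = tibble::tribble(~regex, ~fun, "-qc_summary\\.tsv\\.gz$", "UmQcSumFile")
#'   )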
#' -#' @return A tibble with path, date, file size, file type, and presigned URL if requested. +#' @return A tibble with file type, basename, file size, date, full path, and presigned URL if requested. #' @examples #' \dontrun{ #' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +#' s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) { +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200, regexes = DR_FILE_REGEX) { assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) pattern <- pattern %||% ".*" # keep all recognisable files by default b <- sub("s3://(.*?)/.*", "\\1", s3dir) @@ -38,11 +39,11 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 dplyr::rowwise() |> dplyr::mutate( bname = basename(.data$path), - type = purrr::map_chr(.data$bname, match_regex) + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("path", "bname", "date_utc", "size", "type") + dplyr::select("type", "bname", "size", "date_utc", "path") if (presign) { d <- d |> diff --git a/R/utils.R b/R/utils.R index 912b3c6..2d39dc7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,29 @@ +#' List Relevant Files In Local Directory +#' +#' Lists relevant files in a local directory. +#' +#' @param path Path to local directory. +#' @param regexes Tibble with `regex` and `fun`ction name. +#' +#' @return A tibble with type, bname, size, file_id, path, and presigned URL. +#' +#' @examples +#' \dontrun{ +#' path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" +#' local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) +#' } +#' @export +local_files_list_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { + fs::dir_ls(path = path, recurse = TRUE, type = "file") |> + tibble::as_tibble_col(column_name = "path") |> + dplyr::mutate( + bname = basename(.data$path), + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + ) |> + dplyr::filter(!is.na(.data$type)) |> + dplyr::select("type", "bname", "path") +} + #' Print current timestamp for logging #' #' @return Current timestamp as character. diff --git a/man/gds_files_list_filter_relevant.Rd b/man/gds_files_list_filter_relevant.Rd index 784a393..151f1e5 100644 --- a/man/gds_files_list_filter_relevant.Rd +++ b/man/gds_files_list_filter_relevant.Rd @@ -6,11 +6,12 @@ \usage{ gds_files_list_filter_relevant( gdsdir, - token, + token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, include_url = FALSE, page_size = 100, - regexes = DR_FILE_REGEX + regexes = DR_FILE_REGEX, + ... ) } \arguments{ @@ -24,10 +25,14 @@ gds_files_list_filter_relevant( \item{page_size}{Page size (def: 100).} -\item{regexes}{Tibble with regex and function name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ A tibble with type, bname, size, file_id, path, and presigned URL. +\dontrun{ +gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +gds_files_list_filter_relevant(gdsdir) +} } \description{ Lists relevant files in a GDS directory. 
diff --git a/man/local_files_list_filter_relevant.Rd b/man/local_files_list_filter_relevant.Rd new file mode 100644 index 0000000..a54407f --- /dev/null +++ b/man/local_files_list_filter_relevant.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{local_files_list_filter_relevant} +\alias{local_files_list_filter_relevant} +\title{List Relevant Files In Local Directory} +\usage{ +local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) +} +\arguments{ +\item{path}{Path to local directory.} + +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +} +\value{ +A tibble with type, bname, size, file_id, path, and presigned URL. +} +\description{ +Lists relevant files in a local directory. +} +\examples{ +\dontrun{ +path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" +local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) +} +} diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 7fc45f3..29c57e3 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -10,7 +10,8 @@ s3_files_list_filter_relevant( page_size = 1000, max_items = 1000, presign = FALSE, - expiry_sec = 43200 + expiry_sec = 43200, + regexes = DR_FILE_REGEX ) } \arguments{ @@ -25,9 +26,11 @@ s3_files_list_filter_relevant( \item{presign}{Include presigned URLs (def: FALSE).} \item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} + +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with path, date, file size, file type, and presigned URL if requested. +A tibble with file type, basename, file size, date, full path, and presigned URL if requested. } \description{ Lists relevant files in an AWS S3 directory. @@ -35,6 +38,6 @@ Lists relevant files in an AWS S3 directory. \examples{ \dontrun{ s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) } } From 4bc19448022dea3b013198e87cca4a6e31cb3b2a Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 7 Sep 2024 13:56:23 +1000 Subject: [PATCH 06/24] add s3_list_objects_dir --- NAMESPACE | 1 + R/s3.R | 50 ++++++++++++++++++++++++++-- man/s3_files_list_filter_relevant.Rd | 4 +-- man/s3_list_objects_dir.Rd | 28 ++++++++++++++++ 4 files changed, 79 insertions(+), 4 deletions(-) create mode 100644 man/s3_list_objects_dir.Rd diff --git a/NAMESPACE b/NAMESPACE index 5e7ba34..abe8a14 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,6 +70,7 @@ export(multiqc_tidy_json) export(rdf2tab) export(read) export(s3_files_list_filter_relevant) +export(s3_list_objects_dir) export(s3_search) export(session_info_kable) export(time_metrics_process) diff --git a/R/s3.R b/R/s3.R index a941cfe..555d7a9 100644 --- a/R/s3.R +++ b/R/s3.R @@ -1,11 +1,57 @@ +#' List Objects in AWS S3 Directory +#' +#' Returns some or all (up to 1,000) of the objects in an S3 directory. +#' +#' @param s3dir S3 directory. +#' @param max_objects Maximum objects returned. +#' +#' +#' @return A tibble with object basename, size, last modified timestamp, and +#' full S3 path. 
+#' @examples +#' \dontrun{ +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") +#' s3_list_objects_dir(s3dir, max_objects = 15) +#' } +#' @export +s3_list_objects_dir <- function(s3dir, max_objects = 1000) { + assertthat::assert_that(grepl("^s3://", s3dir)) + bucket <- sub("s3://(.*?)/.*", "\\1", s3dir) + prefix <- sub("s3://(.*?)/(.*)", "\\2", s3dir) + s3 <- paws.storage::s3() + l <- s3$list_objects_v2(Bucket = bucket, Prefix = prefix, MaxKeys = max_objects) + assertthat::assert_that(all(c("Contents", "KeyCount") %in% names(l))) + cols_sel <- c("bname", "size", "lastmodified", "path") + # handle no results + if (l[["KeyCount"]] == 0) { + return(empty_tbl(cnames = cols_sel, ctypes = "cccc")) + } + d <- l[["Contents"]] |> + purrr::map(\(x) tibble::tibble( + Key = x[["Key"]], + Size = x[["Size"]], + lastmodified = x[["LastModified"]] + )) |> + dplyr::bind_rows() |> + dplyr::mutate( + path = glue("s3://{bucket}/{.data$Key}"), + bname = basename(.data$path), + size = fs::as_fs_bytes(.data$Size) + ) |> + dplyr::select(dplyr::all_of(cols_sel)) + return(d) +} + #' List Relevant Files In AWS S3 Directory #' #' Lists relevant files in an AWS S3 directory. #' #' @param s3dir S3 directory. #' @param pattern Pattern to further filter the returned file type tibble. -#' @param page_size The size of each page to get in the AWS service call (def: 1000). -#' @param max_items The total number of items to return in the command’s output (def: 1000). +#' @param page_size The size of each page to get in the AWS service call. +#' @param max_items The total number of items to return in the command’s output. #' @param presign Include presigned URLs (def: FALSE). #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). #' @param regexes Tibble with `regex` and `fun`ction name. diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 29c57e3..a3ded1f 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -19,9 +19,9 @@ s3_files_list_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{page_size}{The size of each page to get in the AWS service call (def: 1000).} +\item{page_size}{The size of each page to get in the AWS service call.} -\item{max_items}{The total number of items to return in the command’s output (def: 1000).} +\item{max_items}{The total number of items to return in the command’s output.} \item{presign}{Include presigned URLs (def: FALSE).} diff --git a/man/s3_list_objects_dir.Rd b/man/s3_list_objects_dir.Rd new file mode 100644 index 0000000..b3c3b2b --- /dev/null +++ b/man/s3_list_objects_dir.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_list_objects_dir} +\alias{s3_list_objects_dir} +\title{List Objects in AWS S3 Directory} +\usage{ +s3_list_objects_dir(s3dir, max_objects = 1000) +} +\arguments{ +\item{s3dir}{S3 directory.} + +\item{max_objects}{Maximum objects returned.} +} +\value{ +A tibble with object basename, size, last modified timestamp, and +full S3 path. +} +\description{ +Returns some or all (up to 1,000) of the objects in an S3 directory. 
+} +\examples{ +\dontrun{ +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") +s3_list_objects_dir(s3dir, max_objects = 15) +} +} From aa10e2e833cc1a397b4d3b5edcca446d1e3a51be Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 7 Sep 2024 16:09:36 +1000 Subject: [PATCH 07/24] use s3_file_presignedurl with s3v4 sig version --- NAMESPACE | 1 + R/s3.R | 61 +++++++++++++++++----------- man/s3_file_presignedurl.Rd | 29 +++++++++++++ man/s3_files_list_filter_relevant.Rd | 11 +++-- 4 files changed, 72 insertions(+), 30 deletions(-) create mode 100644 man/s3_file_presignedurl.Rd diff --git a/NAMESPACE b/NAMESPACE index abe8a14..c069045 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,6 +69,7 @@ export(multiqc_parse_xyline_plot_contig_cvg) export(multiqc_tidy_json) export(rdf2tab) export(read) +export(s3_file_presignedurl) export(s3_files_list_filter_relevant) export(s3_list_objects_dir) export(s3_search) diff --git a/R/s3.R b/R/s3.R index 555d7a9..520ba2c 100644 --- a/R/s3.R +++ b/R/s3.R @@ -63,46 +63,59 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200, regexes = DR_FILE_REGEX) { - assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, + presign = FALSE, expiry_sec = 43200, + regexes = DR_FILE_REGEX) { + assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) + d_all <- s3_list_objects_dir(s3dir, max_objects = max_objects) + if (nrow(d_all) == 0) { + return(d_all) + } pattern <- pattern %||% ".*" # keep all recognisable files by default - b <- sub("s3://(.*?)/.*", "\\1", s3dir) - p <- sub("s3://(.*?)/(.*)", "\\2", s3dir) - cmd <- glue( - "aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ", - "--max-items {max_items} --page-size {page_size}" - ) - l <- system(cmd, intern = TRUE) - j <- jsonlite::fromJSON(l) - assertthat::assert_that("Contents" %in% names(j)) - d <- j[["Contents"]] |> - tibble::as_tibble() |> - dplyr::mutate( - path = glue("s3://{b}/{.data$Key}"), - date_utc = .data$LastModified, - size = fs::as_fs_bytes(.data$Size) - ) |> + d <- d_all |> dplyr::rowwise() |> dplyr::mutate( - bname = basename(.data$path), type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("type", "bname", "size", "date_utc", "path") + dplyr::select("type", "bname", "size", "lastmodified", "path") if (presign) { + s3_client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) d <- d |> dplyr::rowwise() |> - dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |> + dplyr::mutate(presigned_url = s3_file_presignedurl( + client = s3_client, s3_path = .data$path, expiry_seconds = expiry_sec + )) |> dplyr::ungroup() } d } -s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { - p <- system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE) - p +#' S3 Generate Presigned URL +#' +#' @param client S3 client. Make sure you use `signature_version = "s3v4"` (see example). 
+#' @param s3path Full path to S3 object. +#' @param expiry_seconds Number of seconds the presigned URL is valid for (3600 = 1 hour). +#' +#' @return An S3 presigned URL. +#' @examples +#' \dontrun{ +#' client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) +#' s3path <- "s3://bucket1/path/to/file.tsv" +#' s3_file_presignedurl(client, s3path) +#' } +#' +#' @export +s3_file_presignedurl <- function(client, s3path, expiry_seconds = 3600) { + bucket <- sub("s3://(.*?)/.*", "\\1", s3path) + prefix <- sub("s3://(.*?)/(.*)", "\\2", s3path) + client$generate_presigned_url( + client_method = "get_object", + params = list(Bucket = bucket, Key = prefix), + expires_in = expiry_seconds + ) } #' Search AWS S3 Objects diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd new file mode 100644 index 0000000..b041252 --- /dev/null +++ b/man/s3_file_presignedurl.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_file_presignedurl} +\alias{s3_file_presignedurl} +\title{S3 Generate Presigned URL} +\usage{ +s3_file_presignedurl(client, s3path, expiry_seconds = 3600) +} +\arguments{ +\item{s3path}{Full path to S3 object.} + +\item{expiry_seconds}{Number of seconds the presigned URL is valid for (3600 = 1 hour).} + +\item{svc}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} +} +\value{ +An S3 presigned URL. +} +\description{ +S3 Generate Presigned URL +} +\examples{ +\dontrun{ +client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) +s3path <- "s3://bucket1/path/to/file.tsv" +s3_file_presignedurl(client, s3path) +} + +} diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index a3ded1f..57caf1d 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -7,8 +7,7 @@ s3_files_list_filter_relevant( s3dir, pattern = NULL, - page_size = 1000, - max_items = 1000, + max_objects = 100, presign = FALSE, expiry_sec = 43200, regexes = DR_FILE_REGEX @@ -19,15 +18,15 @@ s3_files_list_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{page_size}{The size of each page to get in the AWS service call.} - -\item{max_items}{The total number of items to return in the command’s output.} - \item{presign}{Include presigned URLs (def: FALSE).} \item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} + +\item{page_size}{The size of each page to get in the AWS service call.} + +\item{max_items}{The total number of items to return in the command’s output.} } \value{ A tibble with file type, basename, file size, date, full path, and presigned URL if requested. From 633fd714c641e136f0410d11889d84f9b913a692 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 7 Sep 2024 18:20:07 +1000 Subject: [PATCH 08/24] s3 wrapper fixes --- R/s3.R | 39 +++++++++++++++------------- man/dr_s3_download.Rd | 8 +++--- man/s3_file_presignedurl.Rd | 4 +-- man/s3_files_list_filter_relevant.Rd | 10 +++---- 4 files changed, 32 insertions(+), 29 deletions(-) diff --git a/R/s3.R b/R/s3.R index 520ba2c..2885615 100644 --- a/R/s3.R +++ b/R/s3.R @@ -50,10 +50,9 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' #' @param s3dir S3 directory. #' @param pattern Pattern to further filter the returned file type tibble. 
-#' @param page_size The size of each page to get in the AWS service call. -#' @param max_items The total number of items to return in the command’s output. +#' @param max_objects The total number of objects to return. #' @param presign Include presigned URLs (def: FALSE). -#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). +#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). #' @param regexes Tibble with `regex` and `fun`ction name. #' #' @return A tibble with file type, basename, file size, date, full path, and presigned URL if requested. @@ -64,10 +63,10 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' } #' @export s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, - presign = FALSE, expiry_sec = 43200, + presign = FALSE, expiry_sec = 3600, regexes = DR_FILE_REGEX) { assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) - d_all <- s3_list_objects_dir(s3dir, max_objects = max_objects) + d_all <- s3_list_objects_dir(s3dir = s3dir, max_objects = max_objects) if (nrow(d_all) == 0) { return(d_all) } @@ -82,11 +81,14 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 dplyr::select("type", "bname", "size", "lastmodified", "path") if (presign) { + if (nrow(d) == 0) { + return(d) + } s3_client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) d <- d |> dplyr::rowwise() |> dplyr::mutate(presigned_url = s3_file_presignedurl( - client = s3_client, s3_path = .data$path, expiry_seconds = expiry_sec + client = s3_client, s3path = .data$path, expiry_seconds = expiry_sec )) |> dplyr::ungroup() } @@ -166,36 +168,37 @@ s3_search <- function(pat, rows) { #' #' @param s3dir Full path to S3 directory. #' @param outdir Path to output directory. -#' @param page_size Page size (def: 100). +#' @param max_objects Maximum objects returned in file listing. #' @param pattern Pattern to further filter the returned file type tibble. #' @param regexes Tibble with regex and function name. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). 
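#' Hedged editorial sketch (paths invented): a dry run just lists the
#' relevant files without downloading:
#'   dr_s3_download(s3dir = "s3://bucket1/run1", outdir = tempdir(), dryrun = TRUE)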
#' @examples +#' \dontrun{ #' s3dir <- file.path( #' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", #' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" #' ) #' outdir <- sub("s3:/", "~/s3", s3dir) -#' +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) +#' } #' @export -dr_s3_download <- function(s3dir, outdir, page_size = 100, pattern = NULL, regexes = DR_FILE_REGEX, dryrun = FALSE) { +dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, + regexes = DR_FILE_REGEX, dryrun = FALSE) { s3 <- paws.storage::s3() e <- emojifont::emoji fs::dir_create(outdir) - d <- s3_files_list_filter_relevant(s3dir, pattern = NULL, page_size = page_size, max_items = 1000, presign = FALSE, expiry_sec = 43200) + d <- s3_files_list_filter_relevant( + s3dir = s3dir, pattern = NULL, max_objects = max_objects, presign = FALSE, regexes = regexes + ) d <- d |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::select("type", "size", "path", "bname") + dplyr::select("type", "size", "path", "bname") |> + dplyr::mutate(out = file.path(outdir, .data$bname)) # download recognisable dracarys files to outdir/{bname} - pattern <- pattern %||% ".*" # keep all recognisable files - d_filt <- d |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::mutate(out = file.path(outdir, .data$bname)) if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") - d_filt |> + d |> dplyr::rowwise() |> dplyr::mutate( s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), @@ -208,7 +211,7 @@ dr_s3_download <- function(s3dir, outdir, page_size = 100, pattern = NULL, regex ) } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") - d_filt |> + d |> dplyr::select("path", "type", "size") |> as.data.frame() |> print() diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 76f0339..985d415 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -7,7 +7,7 @@ dr_s3_download( s3dir, outdir, - page_size = 100, + max_objects = 100, pattern = NULL, regexes = DR_FILE_REGEX, dryrun = FALSE @@ -18,7 +18,7 @@ dr_s3_download( \item{outdir}{Path to output directory.} -\item{page_size}{Page size (def: 100).} +\item{max_objects}{Maximum objects returned in file listing.} \item{pattern}{Pattern to further filter the returned file type tibble.} @@ -31,10 +31,12 @@ download them).} Download only S3 files that can be processed by dracarys. } \examples{ +\dontrun{ s3dir <- file.path( "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" ) outdir <- sub("s3:/", "~/s3", s3dir) - +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun=F) +} } diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd index b041252..1598a92 100644 --- a/man/s3_file_presignedurl.Rd +++ b/man/s3_file_presignedurl.Rd @@ -7,11 +7,11 @@ s3_file_presignedurl(client, s3path, expiry_seconds = 3600) } \arguments{ +\item{client}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} + \item{s3path}{Full path to S3 object.} \item{expiry_seconds}{Number of seconds the presigned URL is valid for (3600 = 1 hour).} - -\item{svc}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} } \value{ An S3 presigned URL. 
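As a usage sketch of the presigned-URL helper above: the bucket and key below are hypothetical, and valid AWS credentials are assumed to be available in the environment.

# generate a presigned URL and read the object directly, without downloading it first
# (hypothetical S3 path; assumes AWS credentials are configured)
require(dracarys)
require(paws.storage)
client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4"))
url <- s3_file_presignedurl(client, s3path = "s3://bucket1/path/to/file.tsv", expiry_seconds = 3600)
dat <- readr::read_tsv(url)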
diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 57caf1d..927e982 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -9,7 +9,7 @@ s3_files_list_filter_relevant( pattern = NULL, max_objects = 100, presign = FALSE, - expiry_sec = 43200, + expiry_sec = 3600, regexes = DR_FILE_REGEX ) } @@ -18,15 +18,13 @@ s3_files_list_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} +\item{max_objects}{The total number of objects to return.} + \item{presign}{Include presigned URLs (def: FALSE).} -\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} +\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated).} \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} - -\item{page_size}{The size of each page to get in the AWS service call.} - -\item{max_items}{The total number of items to return in the command’s output.} } \value{ A tibble with file type, basename, file size, date, full path, and presigned URL if requested. From 6323c627665041f42f462a1bfd10201798b59ed2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 01:04:40 +1000 Subject: [PATCH 09/24] keep gds funcs in sync with s3 --- NAMESPACE | 8 +- R/ica.R | 338 +++++++++--------- R/s3.R | 125 +++---- man/dr_s3_download.Rd | 2 +- ...ds_files_list.Rd => gds_list_files_dir.Rd} | 28 +- ...t.Rd => gds_list_files_filter_relevant.Rd} | 13 +- ...st_objects_dir.Rd => s3_list_files_dir.Rd} | 8 +- ...nt.Rd => s3_list_files_filter_relevant.Rd} | 11 +- 8 files changed, 262 insertions(+), 271 deletions(-) rename man/{gds_files_list.Rd => gds_list_files_dir.Rd} (63%) rename man/{gds_files_list_filter_relevant.Rd => gds_list_files_filter_relevant.Rd} (72%) rename man/{s3_list_objects_dir.Rd => s3_list_files_dir.Rd} (80%) rename man/{s3_files_list_filter_relevant.Rd => s3_list_files_filter_relevant.Rd} (75%) diff --git a/NAMESPACE b/NAMESPACE index c069045..ef0ba80 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,9 +49,9 @@ export(file_regex_getter) export(gds_file_download) export(gds_file_download_api) export(gds_file_presignedurl) -export(gds_files_list) export(gds_files_list_fastq) -export(gds_files_list_filter_relevant) +export(gds_list_files_dir) +export(gds_list_files_filter_relevant) export(gds_volumes_list) export(grep_file) export(ica_token_validate) @@ -70,8 +70,8 @@ export(multiqc_tidy_json) export(rdf2tab) export(read) export(s3_file_presignedurl) -export(s3_files_list_filter_relevant) -export(s3_list_objects_dir) +export(s3_list_files_dir) +export(s3_list_files_filter_relevant) export(s3_search) export(session_info_kable) export(time_metrics_process) diff --git a/R/ica.R b/R/ica.R index 3e3587b..059377e 100644 --- a/R/ica.R +++ b/R/ica.R @@ -1,158 +1,21 @@ -#' List Relevant Files In GDS Directory +#' List files in ICAv1 GDS Directory #' -#' Lists relevant files in a GDS directory. -#' -#' @param gdsdir GDS directory. -#' @param token ICA access token. -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). -#' @param page_size Page size (def: 100). -#' @param regexes Tibble with `regex` and `fun`ction name. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. 
-#' \dontrun{ -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' gds_files_list_filter_relevant(gdsdir) -#' } -#' @export -gds_files_list_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, include_url = FALSE, - page_size = 100, regexes = DR_FILE_REGEX, ...) { - pattern <- pattern %||% ".*" # keep all recognisable files by default - cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") - d <- dracarys::gds_files_list( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... - ) |> - dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(dplyr::any_of(cols_sel)) - d -} - -#' List FASTQs In GDS Directory -#' -#' @param gdsdir GDS directory. -#' @param token ICA access token. -#' @param include_url Include presigned URLs to all files within the GDS directory. -#' @param page_size Page size. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. -#' -#' @examples -#' \dontrun{ -#' prim <- "gds://production/primary_data" -#' run <- "240719_A00130_0323_BHMCYHDSXC/202407205bad380d/BiModal_BM-5L" -#' gdsdir <- file.path(prim, run) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' include_url <- F -#' page_size <- 100 -#' gds_files_list_fastq(gdsdir, token, include_url, page_size) -#' } -#' @export -gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = 100) { - fq_regex <- tibble::tribble( - ~regex, ~fun, - "fastq\\.gz$", "FASTQ" - ) - g <- gds_files_list_filter_relevant( - gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, - page_size = page_size, regexes = fq_regex - ) - assertthat::assert_that( - all(colnames(g) == c("type", "bname", "size", "file_id", "path")) - ) - g |> - dplyr::mutate( - size_chr = as.character(.data$size), - size_num = as.numeric(.data$size) - ) |> - dplyr::select( - "type", "bname", "size", "size_chr", "size_num", "file_id", "path" - ) -} - -#' GDS File Presigned URL -#' -#' Returns presigned URL of given GDS file. -#' -#' @param gds_fileid GDS file ID. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Presigned URL if valid. -#' @export -gds_file_presignedurl <- function(gds_fileid, token) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - url <- glue("{base_url}/files/{gds_fileid}") - res <- httr::GET( - url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] - assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) - presigned_url -} - -#' GDS File Download via API -#' -#' @param gds_fileid GDS file ID. -#' @param out_file Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). 
-#' -#' @examples -#' \dontrun{ -#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -#' out <- tempfile() -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' } -#' @export -gds_file_download_api <- function(gds_fileid, out_file, token) { - presigned_url <- gds_file_presignedurl(gds_fileid, token) - # keep quiet instead of logging presigned urls - status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) - assertthat::assert_that(status_code == 0) - out_file -} - -#' GDS File Download via CLI -#' -#' @param gds Full path to GDS file. -#' @param out Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @export -gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - token <- ica_token_validate(token) - system(glue("ica files download {gds} {out} --access-token {token}")) -} - -#' GDS Files List -#' -#' List files on ICA GDS filesystem. +#' Lists files in a GDS directory. #' #' @param gdsdir Full path to GDS directory. #' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10). -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). +#' @param page_size Page size (def: 10 via ICA API). +#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API). #' @param page_token Page token (def: NULL). Used internally for recursion. #' @param no_recurse Do not recurse through the file list i.e. just give the first items #' without recursing further down the list using . #' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE). +#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). #' -#' @return Tibble with file basename, file size, file full data path, file dir name. +#' @return A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL. 
#' @examples #' \dontrun{ #' gdsdir <- file.path( -#' "gds://production/primary_data", -#' "240322_A00130_0290_BH5HLLDSXC/20240323f56ec5a5/WGS_TsqNano" -#' ) -#' gdsdir <- file.path( -#' "gds://bssh.acddbfda498038ed99fa94fe79523959/Runs", -#' "240322_A00130_0290_BH5HLLDSXC_r.3TbcOsEKZUyetygkqIOXcg/InterOp" -#' ) -#' gdsdir <- file.path( #' "gds://production/analysis_data/SBJ00699/umccrise", #' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" #' ) @@ -162,11 +25,11 @@ gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) #' page_token <- NULL #' no_recurse <- TRUE #' recursive <- NULL -#' gds_files_list(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) #' } #' @export -gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, - no_recurse = TRUE, page_token = NULL, recursive = NULL) { +gds_list_files_dir <- function(gdsdir, token, page_size = NULL, include_url = FALSE, + no_recurse = TRUE, page_token = NULL, recursive = NULL) { assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) assertthat::assert_that(is.null(recursive) || is.logical(recursive)) token <- ica_token_validate(token) @@ -213,7 +76,10 @@ gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, } } # endif d <- j[["items"]] |> - purrr::map(\(x) c(file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], presigned_url = x[["presignedUrl"]])) |> + purrr::map(\(x) c( + file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], + lastmodified = x[["timeModified"]], presigned_url = x[["presignedUrl"]] + )) |> dplyr::bind_rows() if (nrow(d) == 0) { # We've iterated through all available items, and the next page has 0 items. @@ -224,12 +90,11 @@ gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, dplyr::mutate( size = fs::as_fs_bytes(.data$size), bname = basename(.data$path), - path = glue("gds://{volname}{.data$path}"), - dname = basename(dirname(.data$path)) + path = glue("gds://{volname}{.data$path}") ) |> - dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "path", "dname", "presigned_url"))) + dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "lastmodified", "path", "presigned_url"))) if (!is.null(j[["nextPageToken"]]) && !no_recurse) { - res2 <- gds_files_list( + res2 <- gds_list_files_dir( gdsdir = gdsdir, token = token, page_size = NULL, include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], recursive = NULL @@ -239,30 +104,40 @@ gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, res } -#' List GDS Volumes +#' List Relevant Files In ICAv1 GDS Directory #' -#' Lists GDS volumes accessible by the provided ICA token. +#' Lists relevant files in a GDS directory. #' -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10). +#' @param gdsdir GDS directory. +#' @param token ICA access token. +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). +#' @param page_size Page size (def: 100). +#' @param regexes Tibble with `regex` and `fun`ction name. #' -#' @return A tibble with vol name and vol id. +#' @return A tibble with file type, basename, size, file_id, full path, +#' and presigned URL if requested. 
+#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' gds_list_files_filter_relevant(gdsdir) +#' } #' @export -gds_volumes_list <- function(token, page_size = 10) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - query_url <- glue("{base_url}/volumes?pageSize={page_size}") - - res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - purrr::map_df(j[["items"]], function(x) c(name = x[["name"]], id = x[["id"]])) +gds_list_files_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, include_url = FALSE, + page_size = 100, regexes = DR_FILE_REGEX, ...) { + pattern <- pattern %||% ".*" # keep all recognisable files by default + cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") + d <- dracarys::gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... + ) |> + dplyr::rowwise() |> + dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(dplyr::any_of(cols_sel)) + d } - #' dracarys GDS Download #' #' Download only GDS files that can be processed by dracarys. @@ -283,12 +158,12 @@ dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NU dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL) { e <- emojifont::emoji fs::dir_create(outdir) - d <- gds_files_list( + d <- gds_list_files_dir( gdsdir = gdsdir, token = token, page_size = page_size, no_recurse = FALSE, recursive = recursive ) |> dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::select("file_id", "dname", "type", "size", "path", "bname") + dplyr::select("file_id", "type", "size", "path", "bname") # download recognisable dracarys files to outdir/{bname} pattern <- pattern %||% ".*" # keep all recognisable files @@ -309,6 +184,127 @@ dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NU } } + +#' GDS File Presigned URL +#' +#' Returns presigned URL of given GDS file. +#' +#' @param gds_fileid GDS file ID. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @return Presigned URL if valid. +#' @export +gds_file_presignedurl <- function(gds_fileid, token) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + url <- glue("{base_url}/files/{gds_fileid}") + res <- httr::GET( + url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] + assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) + presigned_url +} + +#' List FASTQs In GDS Directory +#' +#' @param gdsdir GDS directory. +#' @param token ICA access token. +#' @param include_url Include presigned URLs to all files within the GDS directory. +#' @param page_size Page size. +#' +#' @return A tibble with type, bname, size, file_id, path, and presigned URL. 
+#' +#' @examples +#' \dontrun{ +#' prim <- "gds://production/primary_data" +#' run <- "240719_A00130_0323_BHMCYHDSXC/202407205bad380d/BiModal_BM-5L" +#' gdsdir <- file.path(prim, run) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' include_url <- F +#' page_size <- 100 +#' gds_files_list_fastq(gdsdir, token, include_url, page_size) +#' } +#' @export +gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = 100) { + fq_regex <- tibble::tribble( + ~regex, ~fun, + "fastq\\.gz$", "FASTQ" + ) + g <- gds_list_files_filter_relevant( + gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, + page_size = page_size, regexes = fq_regex + ) + assertthat::assert_that( + all(colnames(g) == c("type", "bname", "size", "file_id", "path")) + ) + g |> + dplyr::mutate( + size_chr = as.character(.data$size), + size_num = as.numeric(.data$size) + ) |> + dplyr::select( + "type", "bname", "size", "size_chr", "size_num", "file_id", "path" + ) +} + +#' GDS File Download via API +#' +#' @param gds_fileid GDS file ID. +#' @param out_file Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' +#' @examples +#' \dontrun{ +#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" +#' out <- tempfile() +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' } +#' @export +gds_file_download_api <- function(gds_fileid, out_file, token) { + presigned_url <- gds_file_presignedurl(gds_fileid, token) + # keep quiet instead of logging presigned urls + status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) + assertthat::assert_that(status_code == 0) + out_file +} + +#' GDS File Download via CLI +#' +#' @param gds Full path to GDS file. +#' @param out Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @export +gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + token <- ica_token_validate(token) + system(glue("ica files download {gds} {out} --access-token {token}")) +} + +#' List GDS Volumes +#' +#' Lists GDS volumes accessible by the provided ICA token. +#' +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param page_size Page size (def: 10). +#' +#' @return A tibble with vol name and vol id. +#' @export +gds_volumes_list <- function(token, page_size = 10) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + query_url <- glue("{base_url}/volumes?pageSize={page_size}") + + res <- httr::GET( + query_url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + j <- jsonlite::fromJSON(httr::content(x = res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) + purrr::map_df(j[["items"]], function(x) c(name = x[["name"]], id = x[["id"]])) +} + + #' Validate ICA access token #' #' Validates ICA access token by parsing it and checking its expiration date. 
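As a usage sketch of the GDS listing helpers above, a custom regex tibble can restrict the listing to FASTQs, mirroring what gds_files_list_fastq does internally. The GDS path below is hypothetical and a valid ICA_ACCESS_TOKEN is assumed.

require(dracarys)
# keep only FASTQs; `fun` is just the label written into the `type` column here
fq_regex <- tibble::tribble(
  ~regex, ~fun,
  "fastq\\.gz$", "FASTQ"
)
gds_list_files_filter_relevant(
  gdsdir = "gds://production/primary_data/run1/fastqs", # hypothetical
  regexes = fq_regex,
  page_size = 100,
  include_url = FALSE
)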
diff --git a/R/s3.R b/R/s3.R index 2885615..fcd7802 100644 --- a/R/s3.R +++ b/R/s3.R @@ -13,10 +13,10 @@ #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" #' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" #' s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") -#' s3_list_objects_dir(s3dir, max_objects = 15) +#' s3_list_files_dir(s3dir, max_objects = 15) #' } #' @export -s3_list_objects_dir <- function(s3dir, max_objects = 1000) { +s3_list_files_dir <- function(s3dir, max_objects = 1000) { assertthat::assert_that(grepl("^s3://", s3dir)) bucket <- sub("s3://(.*?)/.*", "\\1", s3dir) prefix <- sub("s3://(.*?)/(.*)", "\\2", s3dir) @@ -55,18 +55,19 @@ s3_list_objects_dir <- function(s3dir, max_objects = 1000) { #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). #' @param regexes Tibble with `regex` and `fun`ction name. #' -#' @return A tibble with file type, basename, file size, date, full path, and presigned URL if requested. +#' @return A tibble with file type, basename, size, date, full path, +#' and presigned URL if requested. #' @examples #' \dontrun{ #' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) +#' s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, +s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, presign = FALSE, expiry_sec = 3600, regexes = DR_FILE_REGEX) { assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) - d_all <- s3_list_objects_dir(s3dir = s3dir, max_objects = max_objects) + d_all <- s3_list_files_dir(s3dir = s3dir, max_objects = max_objects) if (nrow(d_all) == 0) { return(d_all) } @@ -95,6 +96,62 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 d } +#' dracarys S3 Download +#' +#' Download only S3 files that can be processed by dracarys. +#' +#' @param s3dir Full path to S3 directory. +#' @param outdir Path to output directory. +#' @param max_objects Maximum objects returned in file listing. +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param regexes Tibble with regex and function name. +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). 
+#' @examples
+#' \dontrun{
+#' s3dir <- file.path(
+#'   "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596",
+#'   "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067"
+#' )
+#' outdir <- sub("s3:/", "~/s3", s3dir)
+#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F)
+#' }
+#' @export
+dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL,
+                           regexes = DR_FILE_REGEX, dryrun = FALSE) {
+  s3 <- paws.storage::s3()
+  e <- emojifont::emoji
+  fs::dir_create(outdir)
+  d <- s3_list_files_filter_relevant(
+    s3dir = s3dir, pattern = pattern, max_objects = max_objects, presign = FALSE, regexes = regexes
+  )
+  d <- d |>
+    dplyr::select("type", "size", "path", "bname") |>
+    dplyr::mutate(out = file.path(outdir, .data$bname))
+
+  # download recognisable dracarys files to outdir/{bname}
+  if (!dryrun) {
+    cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}")
+    d |>
+      dplyr::rowwise() |>
+      dplyr::mutate(
+        s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path),
+        s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path),
+        dl = list(
+          s3$download_file(
+            Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out
+          )
+        )
+      )
+  } else {
+    cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}")
+    d |>
+      dplyr::select("path", "type", "size") |>
+      as.data.frame() |>
+      print()
+  }
+}
+
 #' S3 Generate Presigned URL
 #'
 #' @param client S3 client. Make sure you use `signature_version = "s3v4"` (see example).
@@ -161,59 +218,3 @@ s3_search <- function(pat, rows) {
   ) |>
     dplyr::select("path", "size", "date_aest", "id", "unique_hash")
 }
-
-#' dracarys S3 Download
-#'
-#' Download only S3 files that can be processed by dracarys.
-#'
-#' @param s3dir Full path to S3 directory.
-#' @param outdir Path to output directory.
-#' @param max_objects Maximum objects returned in file listing.
-#' @param pattern Pattern to further filter the returned file type tibble.
-#' @param regexes Tibble with regex and function name.
-#' @param dryrun If TRUE, just list the files that will be downloaded (don't
-#' download them).
-#' @examples -#' \dontrun{ -#' s3dir <- file.path( -#' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", -#' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -#' ) -#' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) -#' } -#' @export -dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, - regexes = DR_FILE_REGEX, dryrun = FALSE) { - s3 <- paws.storage::s3() - e <- emojifont::emoji - fs::dir_create(outdir) - d <- s3_files_list_filter_relevant( - s3dir = s3dir, pattern = NULL, max_objects = max_objects, presign = FALSE, regexes = regexes - ) - d <- d |> - dplyr::select("type", "size", "path", "bname") |> - dplyr::mutate(out = file.path(outdir, .data$bname)) - - # download recognisable dracarys files to outdir/{bname} - if (!dryrun) { - cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") - d |> - dplyr::rowwise() |> - dplyr::mutate( - s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), - s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path), - dl = list( - s3$download_file( - Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out - ) - ) - ) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") - d |> - dplyr::select("path", "type", "size") |> - as.data.frame() |> - print() - } -} diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 985d415..d572c10 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -37,6 +37,6 @@ s3dir <- file.path( "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" ) outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun=F) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) } } diff --git a/man/gds_files_list.Rd b/man/gds_list_files_dir.Rd similarity index 63% rename from man/gds_files_list.Rd rename to man/gds_list_files_dir.Rd index 30e3bd9..2a80f47 100644 --- a/man/gds_files_list.Rd +++ b/man/gds_list_files_dir.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ica.R -\name{gds_files_list} -\alias{gds_files_list} -\title{GDS Files List} +\name{gds_list_files_dir} +\alias{gds_list_files_dir} +\title{List files in ICAv1 GDS Directory} \usage{ -gds_files_list( +gds_list_files_dir( gdsdir, token, page_size = NULL, @@ -19,9 +19,9 @@ gds_files_list( \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{page_size}{Page size (def: 10).} +\item{page_size}{Page size (def: 10 via ICA API).} -\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE).} +\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).} \item{no_recurse}{Do not recurse through the file list i.e. just give the first items without recursing further down the list using .} @@ -29,24 +29,16 @@ without recursing further down the list using .} \item{page_token}{Page token (def: NULL). Used internally for recursion.} \item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE).} +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \value{ -Tibble with file basename, file size, file full data path, file dir name. +A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL. 
} \description{ -List files on ICA GDS filesystem. +Lists files in a GDS directory. } \examples{ \dontrun{ -gdsdir <- file.path( - "gds://production/primary_data", - "240322_A00130_0290_BH5HLLDSXC/20240323f56ec5a5/WGS_TsqNano" -) -gdsdir <- file.path( - "gds://bssh.acddbfda498038ed99fa94fe79523959/Runs", - "240322_A00130_0290_BH5HLLDSXC_r.3TbcOsEKZUyetygkqIOXcg/InterOp" -) gdsdir <- file.path( "gds://production/analysis_data/SBJ00699/umccrise", "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" @@ -57,6 +49,6 @@ include_url <- TRUE page_token <- NULL no_recurse <- TRUE recursive <- NULL -gds_files_list(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) } } diff --git a/man/gds_files_list_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd similarity index 72% rename from man/gds_files_list_filter_relevant.Rd rename to man/gds_list_files_filter_relevant.Rd index 151f1e5..4ae0612 100644 --- a/man/gds_files_list_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ica.R -\name{gds_files_list_filter_relevant} -\alias{gds_files_list_filter_relevant} -\title{List Relevant Files In GDS Directory} +\name{gds_list_files_filter_relevant} +\alias{gds_list_files_filter_relevant} +\title{List Relevant Files In ICAv1 GDS Directory} \usage{ -gds_files_list_filter_relevant( +gds_list_files_filter_relevant( gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, @@ -28,10 +28,11 @@ gds_files_list_filter_relevant( \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with type, bname, size, file_id, path, and presigned URL. +A tibble with file type, basename, size, file_id, full path, +and presigned URL if requested. \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -gds_files_list_filter_relevant(gdsdir) +gds_list_files_filter_relevant(gdsdir) } } \description{ diff --git a/man/s3_list_objects_dir.Rd b/man/s3_list_files_dir.Rd similarity index 80% rename from man/s3_list_objects_dir.Rd rename to man/s3_list_files_dir.Rd index b3c3b2b..055692f 100644 --- a/man/s3_list_objects_dir.Rd +++ b/man/s3_list_files_dir.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/s3.R -\name{s3_list_objects_dir} -\alias{s3_list_objects_dir} +\name{s3_list_files_dir} +\alias{s3_list_files_dir} \title{List Objects in AWS S3 Directory} \usage{ -s3_list_objects_dir(s3dir, max_objects = 1000) +s3_list_files_dir(s3dir, max_objects = 1000) } \arguments{ \item{s3dir}{S3 directory.} @@ -23,6 +23,6 @@ Returns some or all (up to 1,000) of the objects in an S3 directory. 
p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") -s3_list_objects_dir(s3dir, max_objects = 15) +s3_list_files_dir(s3dir, max_objects = 15) } } diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd similarity index 75% rename from man/s3_files_list_filter_relevant.Rd rename to man/s3_list_files_filter_relevant.Rd index 927e982..1904ea5 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/s3.R -\name{s3_files_list_filter_relevant} -\alias{s3_files_list_filter_relevant} +\name{s3_list_files_filter_relevant} +\alias{s3_list_files_filter_relevant} \title{List Relevant Files In AWS S3 Directory} \usage{ -s3_files_list_filter_relevant( +s3_list_files_filter_relevant( s3dir, pattern = NULL, max_objects = 100, @@ -27,7 +27,8 @@ s3_files_list_filter_relevant( \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with file type, basename, file size, date, full path, and presigned URL if requested. +A tibble with file type, basename, size, date, full path, +and presigned URL if requested. } \description{ Lists relevant files in an AWS S3 directory. @@ -35,6 +36,6 @@ Lists relevant files in an AWS S3 directory. \examples{ \dontrun{ s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -s3_files_list_filter_relevant(s3dir = s3dir, presign = FALSE) +s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) } } From c28d8fa0bdd90375ea1b8e4de2f48e3dc3862fae Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 14:14:42 +1000 Subject: [PATCH 10/24] more gds/s3/local filesystem tweaks --- NAMESPACE | 3 +- R/Wf.R | 123 ++++++++++++++--------- R/ica.R | 91 +++++++++-------- R/s3.R | 12 ++- R/utils.R | 47 ++++++--- man/Wf.Rd | 128 ++++++++++++++++-------- man/dr_gds_download.Rd | 22 ++-- man/gds_list_files_dir.Rd | 9 +- man/gds_list_files_filter_relevant.Rd | 42 +++++--- man/local_files_list_filter_relevant.Rd | 25 ----- man/local_list_files_dir.Rd | 22 ++++ man/local_list_files_filter_relevant.Rd | 25 +++++ man/s3_list_files_filter_relevant.Rd | 4 +- 13 files changed, 355 insertions(+), 198 deletions(-) delete mode 100644 man/local_files_list_filter_relevant.Rd create mode 100644 man/local_list_files_dir.Rd create mode 100644 man/local_list_files_filter_relevant.Rd diff --git a/NAMESPACE b/NAMESPACE index ef0ba80..8c6d3ab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,7 +55,8 @@ export(gds_list_files_filter_relevant) export(gds_volumes_list) export(grep_file) export(ica_token_validate) -export(local_files_list_filter_relevant) +export(local_list_files_dir) +export(local_list_files_filter_relevant) export(match_regex) export(multiqc_column_map_append) export(multiqc_date_fmt) diff --git a/R/Wf.R b/R/Wf.R index 317100f..80f6c08 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -1,46 +1,42 @@ -#' Workflow R6 Class +#' @title Workflow #' #' @description Workflow is a base R6 class representing a bioinformatic #' workflow run from a UMCCR workflow manager. 
#' +#' A workflow has: +#' +#' - an output directory path with all the result output files (either on GDS, S3 or +#' local filesystem) +#' - a subset of files that are of interest for ingestion +#' - tibble with full path and basename columns +#' - a set of parsers that can parse and tidy those files +#' - each parser takes a path and returns a tidy tibble +#' - a list of tidy tibbles (or a tibble with nested tibbles) +#' #' @examples -#' p1 <- system.file("extdata/portaldb_workflow_top4.rds", package = "rportal") |> -#' readRDS() |> -#' dplyr::filter(type_name == "umccrise") |> -#' dplyr::slice(1) -#' w <- Wf$new( -#' prid = p1$portal_run_id, type = p1$type_name, start = p1$start, end = p1$end, -#' status = p1$end_status, input = p1$input, output = p1$output -#' ) -#' w +#' \dontrun{ +#' p1 <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' um <- Wf$new(p, "umccrise") +#' } +#' #' @export Wf <- R6::R6Class( "Wf", public = list( - #' @field prid Portal run ID. - #' @field type Workflow type. - #' @field start Workflow start datetime. - #' @field end Workflow end datetime. - #' @field status Workflow end status. - #' @field input Workflow input JSON string. - #' @field output Workflow output JSON string. - prid = NULL, + #' @field path (`character(1)`)\cr + #' Output directory path with results. + #' @field type (`character(1)`)\cr + #' Type of workflow (e.g. umccrise, sash). + #' @field filesystem (`character(1)`)\cr + #' Filesystem of `path`. + path = NULL, type = NULL, - start = NULL, - end = NULL, - status = NULL, - input = NULL, - output = NULL, + filesystem = NULL, #' @description Create a new Workflow object. - #' @param prid Portal run ID. - #' @param type Workflow type. - #' @param start Workflow start datetime. - #' @param end Workflow end datetime. - #' @param status Workflow end status. - #' @param input Workflow input JSON string. - #' @param output Workflow output JSON string. - initialize = function(prid = NULL, type = NULL, start = NULL, end = NULL, - status = NULL, input = NULL, output = NULL) { + #' @param path Output directory path with results. + #' @param type Type of workflow. + initialize = function(path = NULL, type = NULL) { types <- c( "bcl_convert", "tso_ctdna_tumor_only", @@ -56,30 +52,67 @@ Wf <- R6::R6Class( "oncoanalyser_wgts_existing_both", "sash" ) - assertthat::assert_that( - type %in% types - ) - self$prid <- prid + assertthat::assert_that(type %in% types) + self$path <- path self$type <- type - self$start <- start - self$end <- end - self$status <- status - self$input <- input - self$output <- output + self$filesystem <- dplyr::case_when( + grepl("^gds://", path) ~ "gds", + grepl("^s3://", path) ~ "s3", + .default = "local" + ) }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "prid", self$prid, + "path", self$path, "type", self$type, - "start", as.character(self$start), - "end", as.character(self$end), - "status", self$status, + "filesystem", self$filesystem ) print(res) invisible(self) + }, + #' @description List all files under given path. + #' @param max_objects Maximum number of objects to list. 
+    #' @param ica_token ICA access token.
+    list_files = function(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) {
+      path <- self$path
+      if (self$filesystem == "gds") {
+        d <- gds_list_files_dir(gdsdir = path, page_size = max_objects, token = ica_token)
+      } else if (self$filesystem == "s3") {
+        d <- s3_list_files_dir(s3dir = path, max_objects = max_objects)
+      } else {
+        d <- local_list_files_dir(localdir = path)
+      }
+      return(d)
+    },
+    #' @description List dracarys files under given path.
+    #' @param regexes Tibble with `regex` and `fun`ction name.
+    #' @param max_objects Maximum number of objects to list.
+    #' @param ica_token ICA access token.
+    #' @param ... Passed on to the filesystem-specific listing function.
+    list_files_filter_relevant = function(regexes = NULL, max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) {
+      assertthat::assert_that(!is.null(regexes))
+      path <- self$path
+      if (self$filesystem == "gds") {
+        d <- gds_list_files_filter_relevant(gdsdir = path, token = ica_token, page_size = max_objects, regexes = regexes, ...)
+      } else if (self$filesystem == "s3") {
+        # "type", "bname", "size", "date_utc", "path"
+        d <- s3_list_files_filter_relevant(s3dir = path, max_objects = max_objects, regexes = regexes, ...)
+      } else {
+        # "type", "bname", "path"
+        d <- local_list_files_filter_relevant(path = path, regexes = regexes)
+      }
+      d
+    },
+    download_files = function(ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) {
+      # TODO: add envvar checker
+      path <- self$path
+      if (self$filesystem == "gds") {
+        d <- dr_gds_download(gdsdir = path, token = ica_token)
+      } else if (self$filesystem == "s3") {
+        d <- s3_download_files(s3dir = path)
+      } else {
+        d <- local_download_files(localdir = path)
+      }
+    }
   ) # end public
 )
diff --git a/R/ica.R b/R/ica.R
index 059377e..84b9350 100644
--- a/R/ica.R
+++ b/R/ica.R
@@ -1,4 +1,4 @@
-#' List files in ICAv1 GDS Directory
+#' List Files in ICAv1 GDS Directory
 #'
 #' Lists files in a GDS directory.
 #'
@@ -12,7 +12,8 @@
 #' @param recursive Should files be returned recursively _in and under_ the specified
 #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
 #'
-#' @return A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL.
+#' @return A tibble with file ID, basename, size, last modified timestamp,
+#' full GDS path, and presigned URL if requested.
 #' @examples
 #' \dontrun{
 #' gdsdir <- file.path(
 #'   "gds://production/analysis_data/SBJ00699/umccrise",
 #'   "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage"
 #' )
 #' token <- ica_token_validate()
 #' page_size <- 11
-#' include_url <- TRUE
+#' include_url <- F
 #' page_token <- NULL
 #' no_recurse <- TRUE
 #' recursive <- NULL
 #' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive)
 #' }
 #' @export
-gds_list_files_dir <- function(gdsdir, token, page_size = NULL, include_url = FALSE,
-                               no_recurse = TRUE, page_token = NULL, recursive = NULL) {
+gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL,
+                               include_url = FALSE, no_recurse = TRUE, page_token = NULL,
+                               recursive = NULL) {
   assertthat::assert_that(is.logical(no_recurse), is.logical(include_url))
   assertthat::assert_that(is.null(recursive) || is.logical(recursive))
   token <- ica_token_validate(token)
@@ -108,27 +110,30 @@ gds_list_files_dir <- function(gdsdir, token, page_size = NULL, include_url = FA
 #'
 #' Lists relevant files in a GDS directory.
 #'
-#' @param gdsdir GDS directory.
-#' @param token ICA access token.
+#' @inheritParams gds_list_files_dir
 #' @param pattern Pattern to further filter the returned file type tibble.
-#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE).
-#' @param page_size Page size (def: 100). #' @param regexes Tibble with `regex` and `fun`ction name. +#' @param ... Passed into `gds_list_files_dir`. #' -#' @return A tibble with file type, basename, size, file_id, full path, +#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, #' and presigned URL if requested. +#' @examples #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' gds_list_files_filter_relevant(gdsdir) #' } #' @export -gds_list_files_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, include_url = FALSE, - page_size = 100, regexes = DR_FILE_REGEX, ...) { +gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, + token = Sys.getenv("ICA_ACCESS_TOKEN"), + page_size = 100, include_url = FALSE, + no_recurse = TRUE, page_token = NULL, + recursive = NULL) { pattern <- pattern %||% ".*" # keep all recognisable files by default - cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") + assertthat::assert_that(all(colnames(regexes) == c("regex", "fun"))) + cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") d <- dracarys::gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, ... + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, + no_recurse = no_recurse, page_token = page_token, recursive = recursive ) |> dplyr::rowwise() |> dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> @@ -142,43 +147,49 @@ gds_list_files_filter_relevant <- function(gdsdir, token = Sys.getenv("ICA_ACCES #' #' Download only GDS files that can be processed by dracarys. #' -#' @param gdsdir Full path to GDS directory. -#' @param outdir Path to output directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 100). -#' @param pattern Pattern to further filter the returned file type tibble. +#' @inheritParams gds_list_files_dir +#' @inheritParams gds_list_files_filter_relevant +#' @param outdir Local output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param regexes Tibble with regex and function name. -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory (TRUE), or _only directly in_ the specified GDS directory (FALSE) (def: TRUE). 
+#' @examples
+#' \dontrun{
+#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
+#' outdir <- sub("gds:/", "~/icav1/g", gdsdir)
+#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
+#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F)
+#' }
 #'
 #' @export
-dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NULL,
-                            dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL) {
+dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"),
+                            pattern = NULL, page_size = 100, dryrun = FALSE,
+                            regexes = DR_FILE_REGEX, recursive = NULL) {
   e <- emojifont::emoji
   fs::dir_create(outdir)
-  d <- gds_list_files_dir(
-    gdsdir = gdsdir, token = token, page_size = page_size,
-    no_recurse = FALSE, recursive = recursive
-  ) |>
-    dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |>
-    dplyr::select("file_id", "type", "size", "path", "bname")
+  d <- gds_list_files_filter_relevant(
+    gdsdir = gdsdir, pattern = pattern, regexes = regexes,
+    token = token, page_size = page_size, include_url = FALSE,
+    no_recurse = FALSE, page_token = NULL,
+    recursive = recursive
+  )
 
-  # download recognisable dracarys files to outdir/{bname}
-  pattern <- pattern %||% ".*" # keep all recognisable files
-  d_filt <- d |>
-    dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
-    dplyr::mutate(out = file.path(outdir, .data$bname))
+  d <- d |>
+    dplyr::mutate(
+      localpath = file.path(outdir, .data$bname),
+      gdspath = .data$path
+    ) |>
+    dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath")
   if (!dryrun) {
     cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}")
-    d_filt |>
+    d |>
       dplyr::rowwise() |>
-      dplyr::mutate(out_dl = gds_file_download_api(.data$file_id, .data$out, token))
+      dplyr::mutate(
+        dl = gds_file_download_api(gds_fileid = .data$file_id, out_file = .data$localpath, token = token)
+      )
   } else {
     cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}")
-    d_filt |>
-      dplyr::select("path", "type", "size") |>
+    d |>
+      dplyr::select("type", "bname", "size", "gdspath", localpath2be = "localpath") |>
       as.data.frame() |>
       print()
   }
 }
diff --git a/R/s3.R b/R/s3.R
index fcd7802..fd67d88 100644
--- a/R/s3.R
+++ b/R/s3.R
@@ -55,8 +55,8 @@
 #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated).
 #' @param regexes Tibble with `regex` and `fun`ction name.
 #'
-#' @return A tibble with file type, basename, size, date, full path,
-#' and presigned URL if requested.
+#' @return A tibble with file type, basename, size, last modified timestamp,
+#' full path, and presigned URL if requested.
#' @examples #' \dontrun{ #' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" @@ -72,6 +72,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 return(d_all) } pattern <- pattern %||% ".*" # keep all recognisable files by default + cols_sel <- c("type", "bname", "size", "lastmodified", "path") d <- d_all |> dplyr::rowwise() |> dplyr::mutate( @@ -79,7 +80,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("type", "bname", "size", "lastmodified", "path") + dplyr::select(dplyr::all_of(cols_sel)) if (presign) { if (nrow(d) == 0) { @@ -91,7 +92,8 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 dplyr::mutate(presigned_url = s3_file_presignedurl( client = s3_client, s3path = .data$path, expiry_seconds = expiry_sec )) |> - dplyr::ungroup() + dplyr::ungroup() |> + dplyr::select(dplyr::all_of(c(cols_sel, "presigned_url"))) } d } @@ -146,7 +148,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") d |> - dplyr::select("path", "type", "size") |> + dplyr::select("type", "bname", "size", "path") |> as.data.frame() |> print() } diff --git a/R/utils.R b/R/utils.R index 2d39dc7..99c6550 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,27 +1,49 @@ +#' List Files in Local Directory +#' +#' Lists files in a local directory. +#' +#' @param localdir Path to local directory. +#' +#' @return A tibble with file basename, size, last modification timestamp +#' and full path. +#' @examples +#' localdir <- system.file("R", package = "dracarys") +#' x <- local_list_files_dir(localdir) +#' @testexamples +#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +#' @export +local_list_files_dir <- function(localdir) { + fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> + dplyr::mutate( + bname = basename(.data$path), + lastmodified = .data$modification_time + ) |> + dplyr::select("bname", "size", "lastmodified", "path") +} + #' List Relevant Files In Local Directory #' #' Lists relevant files in a local directory. #' #' @param path Path to local directory. -#' @param regexes Tibble with `regex` and `fun`ction name. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' @return A tibble with file type, basename, size, last modified timestamp, and +#' path. 
#' #' @examples -#' \dontrun{ -#' path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" -#' local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) -#' } +#' path <- system.file("extdata/tso", package = "dracarys") +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") +#' x <- local_list_files_filter_relevant(path, regexes) +#' @testexamples +#' expect_equal(nrow(x), 1) #' @export -local_files_list_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { - fs::dir_ls(path = path, recurse = TRUE, type = "file") |> - tibble::as_tibble_col(column_name = "path") |> +local_list_files_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { + local_list_files_dir(localdir = path) |> dplyr::mutate( - bname = basename(.data$path), type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) ) |> dplyr::filter(!is.na(.data$type)) |> - dplyr::select("type", "bname", "path") + dplyr::select("type", "bname", "size", "lastmodified", "path") } #' Print current timestamp for logging @@ -238,7 +260,6 @@ grep_file <- function(path = ".", regexp) { return(x) } - #' @noRd dummy1 <- function() { # Solves R CMD check: Namespaces in Imports field not imported from diff --git a/man/Wf.Rd b/man/Wf.Rd index d5c99ae..ce44791 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -2,38 +2,47 @@ % Please edit documentation in R/Wf.R \name{Wf} \alias{Wf} -\title{Workflow R6 Class} +\title{Workflow} \description{ Workflow is a base R6 class representing a bioinformatic workflow run from a UMCCR workflow manager. + +A workflow has: +\itemize{ +\item an output directory path with all the result output files (either on GDS, S3 or +local filesystem) +\item a subset of files that are of interest for ingestion +\itemize{ +\item tibble with full path and basename columns +} +\item a set of parsers that can parse and tidy those files +\itemize{ +\item each parser takes a path and returns a tidy tibble +} +\item a list of tidy tibbles (or a tibble with nested tibbles) +} } \examples{ -p1 <- system.file("extdata/portaldb_workflow_top4.rds", package = "rportal") |> - readRDS() |> - dplyr::filter(type_name == "umccrise") |> - dplyr::slice(1) -w <- Wf$new( - prid = p1$portal_run_id, type = p1$type_name, start = p1$start, end = p1$end, - status = p1$end_status, input = p1$input, output = p1$output -) -w +\dontrun{ +p1 <- "~/icav1/g/production/analysis_data" +p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um <- Wf$new(p, "umccrise") + +} + + } \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{prid}}{Portal run ID.} - -\item{\code{type}}{Workflow type.} - -\item{\code{start}}{Workflow start datetime.} +\item{\code{path}}{(\code{character(1)})\cr +Output directory path with results.} -\item{\code{end}}{Workflow end datetime.} +\item{\code{type}}{(\code{character(1)})\cr +Type of workflow (e.g. umccrise, sash).} -\item{\code{status}}{Workflow end status.} - -\item{\code{input}}{Workflow input JSON string.} - -\item{\code{output}}{Workflow output JSON string.} +\item{\code{filesystem}}{(\code{character(1)})\cr +Filesystem of \code{path}.} } \if{html}{\out{
}} } @@ -42,6 +51,9 @@ w \itemize{ \item \href{#method-Wf-new}{\code{Wf$new()}} \item \href{#method-Wf-print}{\code{Wf$print()}} +\item \href{#method-Wf-list_files}{\code{Wf$list_files()}} +\item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} +\item \href{#method-Wf-download_files}{\code{Wf$download_files()}} \item \href{#method-Wf-clone}{\code{Wf$clone()}} } } @@ -51,33 +63,15 @@ w \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$new( - prid = NULL, - type = NULL, - start = NULL, - end = NULL, - status = NULL, - input = NULL, - output = NULL -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, type = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{prid}}{Portal run ID.} - -\item{\code{type}}{Workflow type.} - -\item{\code{start}}{Workflow start datetime.} - -\item{\code{end}}{Workflow end datetime.} +\item{\code{path}}{Output directory path with results.} -\item{\code{status}}{Workflow end status.} - -\item{\code{input}}{Workflow input JSON string.} - -\item{\code{output}}{Workflow output JSON string.} +\item{\code{type}}{Type of workflow.} } \if{html}{\out{
}} } @@ -98,6 +92,56 @@ Print details about the Workflow. } \if{html}{\out{}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-list_files}{}}} +\subsection{Method \code{list_files()}}{ +List all files under given path. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$list_files(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}}
+\describe{
+\item{\code{max_objects}}{Maximum number of objects to list.}
+
+\item{\code{ica_token}}{ICA access token.}
+}
+\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( + regexes = NULL, + max_objects = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}}
+\describe{
+\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.}
+
+\item{\code{max_objects}}{Maximum number of objects to list.}
+
+\item{\code{ica_token}}{ICA access token.}
+
+\item{\code{...}}{Passed on to the filesystem-specific listing function.}
+}
+\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} +\subsection{Method \code{download_files()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$download_files(ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 087a7ed..0d09aa7 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -7,9 +7,9 @@ dr_gds_download( gdsdir, outdir, - token, - page_size = 100, + token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, + page_size = 100, dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL @@ -18,22 +18,30 @@ dr_gds_download( \arguments{ \item{gdsdir}{Full path to GDS directory.} -\item{outdir}{Path to output directory.} +\item{outdir}{Local output directory.} \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{page_size}{Page size (def: 100).} - \item{pattern}{Pattern to further filter the returned file type tibble.} +\item{page_size}{Page size (def: 10 via ICA API).} + \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} -\item{regexes}{Tibble with regex and function name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} \item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory (TRUE), or \emph{only directly in} the specified GDS directory (FALSE) (def: TRUE).} +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \description{ Download only GDS files that can be processed by dracarys. } +\examples{ +\dontrun{ +gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +outdir <- sub("gds:/", "~/icav1/g", gdsdir) +dr_gds_download(gdsdir, outdir) +} + +} diff --git a/man/gds_list_files_dir.Rd b/man/gds_list_files_dir.Rd index 2a80f47..7d52c1d 100644 --- a/man/gds_list_files_dir.Rd +++ b/man/gds_list_files_dir.Rd @@ -2,11 +2,11 @@ % Please edit documentation in R/ica.R \name{gds_list_files_dir} \alias{gds_list_files_dir} -\title{List files in ICAv1 GDS Directory} +\title{List Files in ICAv1 GDS Directory} \usage{ gds_list_files_dir( gdsdir, - token, + token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, include_url = FALSE, no_recurse = TRUE, @@ -32,7 +32,8 @@ without recursing further down the list using .} GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \value{ -A tibble with file ID, basename, size, last modified timestamp, full GDS path, presigned URL. +A tibble with file ID, basename, size, last modified timestamp, +full GDS path, and presigned URL if requested. } \description{ Lists files in a GDS directory. @@ -45,7 +46,7 @@ gdsdir <- file.path( ) token <- ica_token_validate() page_size <- 11 -include_url <- TRUE +include_url <- F page_token <- NULL no_recurse <- TRUE recursive <- NULL diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd index 4ae0612..ec57909 100644 --- a/man/gds_list_files_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -6,35 +6,49 @@ \usage{ gds_list_files_filter_relevant( gdsdir, - token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, - include_url = FALSE, - page_size = 100, regexes = DR_FILE_REGEX, - ... 
+ token = Sys.getenv("ICA_ACCESS_TOKEN"), + page_size = 100, + include_url = FALSE, + no_recurse = TRUE, + page_token = NULL, + recursive = NULL ) } \arguments{ -\item{gdsdir}{GDS directory.} - -\item{token}{ICA access token.} +\item{gdsdir}{Full path to GDS directory.} \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE).} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} -\item{page_size}{Page size (def: 100).} +\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +\item{page_size}{Page size (def: 10 via ICA API).} + +\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).} + +\item{no_recurse}{Do not recurse through the file list i.e. just give the first items +without recursing further down the list using .} + +\item{page_token}{Page token (def: NULL). Used internally for recursion.} + +\item{recursive}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{...}{Passed into \code{gds_list_files_dir}.} } \value{ -A tibble with file type, basename, size, file_id, full path, +A tibble with file type, basename, size, last modified timestamp, file_id, full path, and presigned URL if requested. +} +\description{ +Lists relevant files in a GDS directory. +} +\examples{ \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" gds_list_files_filter_relevant(gdsdir) } } -\description{ -Lists relevant files in a GDS directory. -} diff --git a/man/local_files_list_filter_relevant.Rd b/man/local_files_list_filter_relevant.Rd deleted file mode 100644 index a54407f..0000000 --- a/man/local_files_list_filter_relevant.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{local_files_list_filter_relevant} -\alias{local_files_list_filter_relevant} -\title{List Relevant Files In Local Directory} -\usage{ -local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) -} -\arguments{ -\item{path}{Path to local directory.} - -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} -} -\value{ -A tibble with type, bname, size, file_id, path, and presigned URL. -} -\description{ -Lists relevant files in a local directory. -} -\examples{ -\dontrun{ -path <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise" -local_files_list_filter_relevant(path, regexes = DR_FILE_REGEX) -} -} diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd new file mode 100644 index 0000000..771aa9b --- /dev/null +++ b/man/local_list_files_dir.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{local_list_files_dir} +\alias{local_list_files_dir} +\title{List Files in Local Directory} +\usage{ +local_list_files_dir(localdir) +} +\arguments{ +\item{localdir}{Path to local directory.} +} +\value{ +A tibble with file basename, size, last modification timestamp +and full path. +} +\description{ +Lists files in a local directory. 
+} +\examples{ +localdir <- system.file("R", package = "dracarys") +x <- local_list_files_dir(localdir) +} diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd new file mode 100644 index 0000000..4e17351 --- /dev/null +++ b/man/local_list_files_filter_relevant.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{local_list_files_filter_relevant} +\alias{local_list_files_filter_relevant} +\title{List Relevant Files In Local Directory} +\usage{ +local_list_files_filter_relevant(path, regexes = DR_FILE_REGEX) +} +\arguments{ +\item{path}{Path to local directory.} + +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} +} +\value{ +A tibble with file type, basename, size, last modified timestamp, and +path. +} +\description{ +Lists relevant files in a local directory. +} +\examples{ +path <- system.file("extdata/tso", package = "dracarys") +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcFile") +x <- local_list_files_filter_relevant(path, regexes) +} diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd index 1904ea5..8839bde 100644 --- a/man/s3_list_files_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -27,8 +27,8 @@ s3_list_files_filter_relevant( \item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ -A tibble with file type, basename, size, date, full path, -and presigned URL if requested. +A tibble with file type, basename, size, last modified timestamp, +full path, and presigned URL if requested. } \description{ Lists relevant files in an AWS S3 directory. From c0d57a1c4878beb2a679ea1412374c3d664ceb87 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 15:02:32 +1000 Subject: [PATCH 11/24] more gds/s3 filesystem tweaks --- R/ica.R | 21 ++++++---- R/s3.R | 59 +++++++++++++++------------ man/Wf.Rd | 2 - man/dr_gds_download.Rd | 3 +- man/dr_s3_download.Rd | 16 ++++---- man/gds_list_files_filter_relevant.Rd | 3 +- man/s3_list_files_filter_relevant.Rd | 17 ++++---- 7 files changed, 65 insertions(+), 56 deletions(-) diff --git a/R/ica.R b/R/ica.R index 84b9350..3589688 100644 --- a/R/ica.R +++ b/R/ica.R @@ -113,12 +113,11 @@ gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), p #' @inheritParams gds_list_files_dir #' @param pattern Pattern to further filter the returned file type tibble. #' @param regexes Tibble with `regex` and `fun`ction name. -#' @param ... Passed into `gds_list_files_dir`. -#' #' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, #' and presigned URL if requested. 
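As context for the `regexes` argument threaded through all of these listers: the tibble maps a filename regex to the name of the parser (`fun`) that can ingest it, and `match_regex` returns that name for a matching basename, or `NA` otherwise. A minimal sketch of that step (the basenames are made up, and `match_regex` may be unexported, hence the `:::`):

```r
regexes <- tibble::tribble(
  ~regex, ~fun,
  "multiqc_data\\.json$", "MultiqcFile",
  "-qc_summary\\.tsv\\.gz$", "UmQcSumFile"
)
bnames <- c("multiqc_data.json", "PRJ240001-qc_summary.tsv.gz", "notes.txt")
# each basename gets the matching parser name, or NA if nothing matches;
# the filter_relevant functions then drop the NA rows
purrr::map_chr(bnames, \(x) dracarys:::match_regex(x, regexes))
#> [1] "MultiqcFile" "UmQcSumFile" NA
```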
#' @examples #' \dontrun{ +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' gds_list_files_filter_relevant(gdsdir) #' } @@ -172,30 +171,34 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN no_recurse = FALSE, page_token = NULL, recursive = recursive ) - d <- d |> dplyr::mutate( localpath = file.path(outdir, .data$bname), gdspath = .data$path ) |> dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") + # download recognisable dracarys files to outdir/{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") - d |> + res <- d |> dplyr::rowwise() |> dplyr::mutate( - dl = gds_file_download_api(gds_fileid = .data$file_id, out_file = .data$out, token = token) - ) + dl = gds_file_download_api( + gds_fileid = .data$file_id, out_file = .data$localpath, token = token + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") + return(res) } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") d |> - dplyr::select("type", "bname", "size", "gdspath", localpath2be = "localpath") |> + dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> as.data.frame() |> print() } } - #' GDS File Presigned URL #' #' Returns presigned URL of given GDS file. @@ -278,7 +281,7 @@ gds_file_download_api <- function(gds_fileid, out_file, token) { # keep quiet instead of logging presigned urls status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) assertthat::assert_that(status_code == 0) - out_file + normalizePath(out_file) } #' GDS File Download via CLI diff --git a/R/s3.R b/R/s3.R index fd67d88..1bf8f4a 100644 --- a/R/s3.R +++ b/R/s3.R @@ -48,24 +48,26 @@ s3_list_files_dir <- function(s3dir, max_objects = 1000) { #' #' Lists relevant files in an AWS S3 directory. #' -#' @param s3dir S3 directory. +#' @inheritParams s3_list_files_dir #' @param pattern Pattern to further filter the returned file type tibble. -#' @param max_objects The total number of objects to return. +#' @param regexes Tibble with `regex` and `fun`ction name. #' @param presign Include presigned URLs (def: FALSE). #' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). -#' @param regexes Tibble with `regex` and `fun`ction name. #' #' @return A tibble with file type, basename, size, last modified timestamp, #' full path, and presigned URL if requested. 
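A quick sketch of the reordered S3 lister signature in use (the paths reuse the sash example from the docs below; note the `max_objects <= 1000` assertion, and that presigned URLs are only generated when `presign = TRUE`):

```r
p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220"
s3dir <- file.path(p1, p2)
regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
d <- s3_list_files_filter_relevant(
  s3dir = s3dir, regexes = regexes, max_objects = 300,
  presign = TRUE, expiry_sec = 3600 # presigned URLs valid for one hour
)
```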
#' @examples #' \dontrun{ -#' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' s3_list_files_filter_relevant(s3dir = s3dir, regexes = regexes, max_objects = 300) #' } #' @export -s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 100, - presign = FALSE, expiry_sec = 3600, - regexes = DR_FILE_REGEX) { +s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, + regexes = DR_FILE_REGEX, max_objects = 100, + presign = FALSE, expiry_sec = 3600) { assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) d_all <- s3_list_files_dir(s3dir = s3dir, max_objects = max_objects) if (nrow(d_all) == 0) { @@ -102,21 +104,19 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, max_objects = 1 #' #' Download only S3 files that can be processed by dracarys. #' -#' @param s3dir Full path to S3 directory. +#' @inheritParams s3_list_files_dir +#' @inheritParams s3_list_files_filter_relevant #' @param outdir Path to output directory. -#' @param max_objects Maximum objects returned in file listing. -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param regexes Tibble with regex and function name. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @examples #' \dontrun{ -#' s3dir <- file.path( -#' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", -#' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -#' ) +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") #' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) #' } #' @export dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, @@ -125,30 +125,35 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, e <- emojifont::emoji fs::dir_create(outdir) d <- s3_list_files_filter_relevant( - s3dir = s3dir, pattern = NULL, max_objects = max_objects, presign = FALSE, regexes = regexes + s3dir = s3dir, pattern = NULL, regexes = regexes, + max_objects = max_objects, presign = FALSE ) d <- d |> - dplyr::select("type", "size", "path", "bname") |> - dplyr::mutate(out = file.path(outdir, .data$bname)) - + dplyr::mutate( + localpath = file.path(outdir, .data$bname), + s3path = .data$path + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") # download recognisable dracarys files to outdir/{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") d |> dplyr::rowwise() |> dplyr::mutate( - s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), - s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path), + s3bucket = sub("s3://(.*?)/.*", "\\1", .data$s3path), + s3key = sub("s3://(.*?)/(.*)", "\\2", .data$s3path), dl = list( s3$download_file( - Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out + 
Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$localpath ) - ) - ) + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") } else { cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") d |> - dplyr::select("type", "bname", "size", "path") |> + dplyr::select("type", "bname", "size", "lastmodified", "s3path", localpath2be = "localpath") |> as.data.frame() |> print() } diff --git a/man/Wf.Rd b/man/Wf.Rd index ce44791..a99ffbb 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -27,10 +27,8 @@ local filesystem) p1 <- "~/icav1/g/production/analysis_data" p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um <- Wf$new(p, "umccrise") - } - } \section{Public fields}{ \if{html}{\out{
}} diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 0d09aa7..14a25fd 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -41,7 +41,8 @@ Download only GDS files that can be processed by dracarys. \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" outdir <- sub("gds:/", "~/icav1/g", gdsdir) -dr_gds_download(gdsdir, outdir) +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) } } diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index d572c10..6619188 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -14,15 +14,15 @@ dr_s3_download( ) } \arguments{ -\item{s3dir}{Full path to S3 directory.} +\item{s3dir}{S3 directory.} \item{outdir}{Path to output directory.} -\item{max_objects}{Maximum objects returned in file listing.} +\item{max_objects}{Maximum objects returned.} \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{regexes}{Tibble with regex and function name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} @@ -32,11 +32,11 @@ Download only S3 files that can be processed by dracarys. } \examples{ \dontrun{ -s3dir <- file.path( - "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", - "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -) +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +s3dir <- file.path(p1, p2) +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 1000, dryrun = F) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) } } diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd index ec57909..88a6748 100644 --- a/man/gds_list_files_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -36,8 +36,6 @@ without recursing further down the list using .} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} - -\item{...}{Passed into \code{gds_list_files_dir}.} } \value{ A tibble with file type, basename, size, last modified timestamp, file_id, full path, @@ -48,6 +46,7 @@ Lists relevant files in a GDS directory. 
} \examples{ \dontrun{ +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" gds_list_files_filter_relevant(gdsdir) } diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd index 8839bde..bc55db3 100644 --- a/man/s3_list_files_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -7,10 +7,10 @@ s3_list_files_filter_relevant( s3dir, pattern = NULL, + regexes = DR_FILE_REGEX, max_objects = 100, presign = FALSE, - expiry_sec = 3600, - regexes = DR_FILE_REGEX + expiry_sec = 3600 ) } \arguments{ @@ -18,13 +18,13 @@ s3_list_files_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{max_objects}{The total number of objects to return.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} + +\item{max_objects}{Maximum objects returned.} \item{presign}{Include presigned URLs (def: FALSE).} \item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated).} - -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} } \value{ A tibble with file type, basename, size, last modified timestamp, @@ -35,7 +35,10 @@ Lists relevant files in an AWS S3 directory. } \examples{ \dontrun{ -s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -s3_list_files_filter_relevant(s3dir = s3dir, presign = FALSE) +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +s3dir <- file.path(p1, p2) +regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +s3_list_files_filter_relevant(s3dir = s3dir, regexes = regexes, max_objects = 300) } } From 61b4bf2747158a29d6433c9544da01b7f54e9559 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 17:54:22 +1000 Subject: [PATCH 12/24] more gds/s3/local filesystem tweaks --- NAMESPACE | 2 +- R/Wf.R | 84 ++++-- R/fs_icav1.R | 258 +++++++++++++++++ R/fs_local.R | 46 +++ R/{s3.R => fs_s3.R} | 0 R/ica.R | 266 +----------------- R/utils.R | 48 ---- man/Wf.Rd | 60 +++- man/dr_gds_download.Rd | 4 +- man/dr_s3_download.Rd | 2 +- man/gds_file_download_api.Rd | 2 +- ...e_download.Rd => gds_file_download_cli.Rd} | 8 +- man/gds_file_presignedurl.Rd | 2 +- man/gds_list_files_dir.Rd | 2 +- man/gds_list_files_filter_relevant.Rd | 4 +- man/local_list_files_dir.Rd | 2 +- man/local_list_files_filter_relevant.Rd | 10 +- man/s3_file_presignedurl.Rd | 2 +- man/s3_list_files_dir.Rd | 2 +- man/s3_list_files_filter_relevant.Rd | 2 +- man/s3_search.Rd | 2 +- 21 files changed, 440 insertions(+), 368 deletions(-) create mode 100644 R/fs_icav1.R create mode 100644 R/fs_local.R rename R/{s3.R => fs_s3.R} (100%) rename man/{gds_file_download.Rd => gds_file_download_cli.Rd} (61%) diff --git a/NAMESPACE b/NAMESPACE index 8c6d3ab..aa3957e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,8 +46,8 @@ export(dr_output_format_valid) export(dr_s3_download) export(empty_tbl) export(file_regex_getter) -export(gds_file_download) export(gds_file_download_api) +export(gds_file_download_cli) export(gds_file_presignedurl) export(gds_files_list_fastq) export(gds_list_files_dir) diff --git a/R/Wf.R b/R/Wf.R index 80f6c08..0ddc2ee 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -25,19 +25,19 @@ Wf <- R6::R6Class( "Wf", public = list( #' @field path (`character(1)`)\cr - #' Output directory path with results. 
- #' @field type (`character(1)`)\cr - #' Type of workflow (e.g. umccrise, sash). + #' Path to directory with raw workflow results (from GDS, S3, or local filesystem). + #' @field wname (`character(1)`)\cr + #' Name of workflow (e.g. umccrise, sash). #' @field filesystem (`character(1)`)\cr #' Filesystem of `path`. path = NULL, - type = NULL, + wname = NULL, filesystem = NULL, #' @description Create a new Workflow object. #' @param path Output directory path with results. - #' @param type Type of workflow. - initialize = function(path = NULL, type = NULL) { - types <- c( + #' @param wname Name of workflow. + initialize = function(path = NULL, wname = NULL) { + wnames <- c( "bcl_convert", "tso_ctdna_tumor_only", "wgs_alignment_qc", @@ -52,9 +52,9 @@ Wf <- R6::R6Class( "oncoanalyser_wgts_existing_both", "sash" ) - assertthat::assert_that(type %in% types) + assertthat::assert_that(wname %in% wnames) self$path <- path - self$type <- type + self$wname <- wname self$filesystem <- dplyr::case_when( grepl("^gds://", path) ~ "gds", grepl("^s3://", path) ~ "s3", @@ -67,52 +67,86 @@ Wf <- R6::R6Class( res <- tibble::tribble( ~var, ~value, "path", self$path, - "type", self$type, + "wname", self$wname, "filesystem", self$filesystem ) print(res) invisible(self) }, #' @description List all files under given path. - #' @param max_objects Maximum number of objects to list. - list_files = function(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) { + #' @param max_files Maximum number of files to list. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to `gds_list_files_dir` function. + list_files = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { path <- self$path if (self$filesystem == "gds") { - d <- gds_list_files_dir(gdsdir = path, page_size = max_objects, token = ica_token) + d <- gds_list_files_dir( + gdsdir = path, token = ica_token, page_size = max_files, ... + ) } else if (self$filesystem == "s3") { - d <- s3_list_files_dir(s3dir = path, max_objects = max_objects) + d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { d <- local_list_files_dir(localdir = path) } return(d) }, #' @description List dracarys files under given path - #' @param page_size Page size #' @param regexes Tibble with `regex` and `fun`ction name. - list_files_filter_relevant = function(regexes = NULL, max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + #' @param max_files Maximum number of files to list. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(regexes = NULL, + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { assertthat::assert_that(!is.null(regexes)) path <- self$path if (self$filesystem == "gds") { - d <- gds_list_files_filter_relevant(gdsdir = path, token = ica_token, page_size = max_objects, regexes = regexes, ...) + d <- gds_list_files_filter_relevant( + gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... + ) } else if (self$filesystem == "s3") { - # "type", "bname", "size", "date_utc", "path" - d <- s3_list_files_filter_relevant(s3dir = path, max_objects = max_objects, regexes = regexes, ...) + d <- s3_list_files_filter_relevant( + s3dir = path, regexes = regexes, max_objects = max_files, ... 
+        )
       } else {
-        # "type", "bname", "path"
-        d <- local_list_files_filter_relevant(path = path, regexes = regexes)
+        d <- local_list_files_filter_relevant(localdir = path, regexes = regexes)
       }
       d
     },
-    download_files = function(ica_token = Sys.getenv("ICA_ACCESS_TOKEN")) {
+    #' @description Download files from GDS/S3 to local filesystem.
+    #' @param outdir Path to output directory.
+    #' @param regexes Tibble with `regex` and `fun`ction name.
+    #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var).
+    #' @param max_files Maximum number of files to list.
+    #' @param dryrun If TRUE, just list the files that will be downloaded (don't
+    #' download them).
+    #' @param recursive Should files be returned recursively _in and under_ the specified
+    #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
+    download_files = function(outdir, regexes = NULL,
+                              ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
+                              max_files = 1000, dryrun = FALSE, recursive = NULL) {
       # TODO: add envvar checker
       path <- self$path
+      assertthat::assert_that(!is.null(regexes))
       if (self$filesystem == "gds") {
-        d <- dr_gds_download(gdsdir = path, token = ica_token)
+        d <- dr_gds_download(
+          gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token,
+          page_size = max_files, dryrun = dryrun, recursive = recursive
+        )
+        self$filesystem <- "local"
+        self$path <- outdir
       } else if (self$filesystem == "s3") {
-        d <- s3_download_files(s3dir = path)
+        d <- dr_s3_download(
+          s3dir = path, outdir = outdir, regexes = regexes,
+          max_objects = max_files, dryrun = dryrun
+        )
+        self$filesystem <- "local"
+        self$path <- outdir
       } else {
-        d <- local_download_files(localdir = path)
+        d <- self$list_files_filter_relevant(regexes = regexes)
       }
+      return(d)
     }
   ) # end public
)
diff --git a/R/fs_icav1.R b/R/fs_icav1.R
new file mode 100644
index 0000000..8f36e81
--- /dev/null
+++ b/R/fs_icav1.R
@@ -0,0 +1,258 @@
+#' List Files in ICAv1 GDS Directory
+#'
+#' Lists files in a GDS directory.
+#'
+#' @param gdsdir Full path to GDS directory.
+#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var).
+#' @param page_size Page size (def: 10 via ICA API).
+#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).
+#' @param page_token Page token (def: NULL). Used internally for recursion.
+#' @param no_recurse Do not recurse through the file list i.e. just give the first items
+#' without recursing further down the list.
+#' @param recursive Should files be returned recursively _in and under_ the specified
+#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
+#'
+#' @return A tibble with file ID, basename, size, last modified timestamp,
+#' full GDS path, and presigned URL if requested.
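One note on the lister defined here: results come back from the ICA v1 files API one page at a time, and when `no_recurse = FALSE` the function follows each response's `nextPageToken` and row-binds the pages, so the full listing is returned. A sketch, reusing the umccrise GDS path from the docs:

```r
d <- gds_list_files_dir(
  gdsdir = "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565",
  token = Sys.getenv("ICA_ACCESS_TOKEN"),
  page_size = 100,
  no_recurse = FALSE # keep following nextPageToken until the listing is exhausted
)
nrow(d) # all files under the directory, not just the first page
```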
+#' @examples +#' \dontrun{ +#' gdsdir <- file.path( +#' "gds://production/analysis_data/SBJ00699/umccrise", +#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" +#' ) +#' token <- ica_token_validate() +#' page_size <- 11 +#' include_url <- F +#' page_token <- NULL +#' no_recurse <- TRUE +#' recursive <- NULL +#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +#' } +#' @export +gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, + include_url = FALSE, no_recurse = TRUE, page_token = NULL, + recursive = NULL) { + assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) + assertthat::assert_that(is.null(recursive) || is.logical(recursive)) + token <- ica_token_validate(token) + assertthat::assert_that(grepl("^gds://", gdsdir)) + gdsdir_original <- gdsdir + if (!grepl("/$", gdsdir)) { + gdsdir <- glue("{gdsdir}/") + } + base_url <- "https://aps2.platform.illumina.com/v1" + volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) + path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) + page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) + query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") + if (include_url) { + query_url <- glue("{query_url}&include=PresignedUrl") + } + if (!is.null(page_token)) { + query_url <- glue("{query_url}&pageToken={page_token}") + } + if (!is.null(recursive)) { + # without specifying recursive, it's true by default + recursive <- ifelse(recursive, "true", "false") + query_url <- glue("{query_url}&recursive={recursive}") + } + query_res <- httr::GET( + query_url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) + if (j[["itemCount"]] == 0) { + if (gds_likely_file(gdsdir_original)) { + cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") + } + # if there is a nextPageToken then abort, else continue + if (!is.null(j[["nextPageToken"]])) { + msg <- paste0( + "{date_log()} ERROR: ", + "No GDS files listed in the input directory. Please confirm you can ", + "access the following GDS input directory with your token: ", + "{.file {gdsdir_original}}" + ) + cli::cli_abort(msg) + } + } # endif + d <- j[["items"]] |> + purrr::map(\(x) c( + file_id = x[["id"]], + path = x[["path"]], + size = x[["sizeInBytes"]], + lastmodified = x[["timeModified"]], + presigned_url = x[["presignedUrl"]] + )) |> + dplyr::bind_rows() + if (nrow(d) == 0) { + # We've iterated through all available items, and the next page has 0 items. + # So dplyr::bind_rows(d, NULL) will return d. + return(NULL) + } + res <- d |> + dplyr::mutate( + size = fs::as_fs_bytes(.data$size), + bname = basename(.data$path), + path = glue("gds://{volname}{.data$path}") + ) |> + dplyr::select(dplyr::any_of(c("bname", "size", "lastmodified", "file_id", "path", "presigned_url"))) + + if (!is.null(j[["nextPageToken"]]) && !no_recurse) { + res2 <- gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = NULL, + include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], + recursive = NULL + ) + res <- dplyr::bind_rows(res, res2) + } + res +} + +#' List Relevant Files In ICAv1 GDS Directory +#' +#' Lists relevant files in a GDS directory. 
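Worth noting for the example that follows: since `regexes` defaults to `DR_FILE_REGEX`, a custom tibble only takes effect if it is actually passed in, and `pattern` further filters the resulting `type` column. A sketch:

```r
regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
# pass the custom regexes explicitly; optionally post-filter on the type name
d <- gds_list_files_filter_relevant(gdsdir, regexes = regexes, pattern = "Multiqc")
```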
+#' +#' @inheritParams gds_list_files_dir +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, +#' and presigned URL if requested. +#' @examples +#' \dontrun{ +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' gds_list_files_filter_relevant(gdsdir) +#' } +#' @export +gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, + token = Sys.getenv("ICA_ACCESS_TOKEN"), + page_size = 100, include_url = FALSE, + no_recurse = TRUE, page_token = NULL, + recursive = NULL) { + pattern <- pattern %||% ".*" # keep all recognisable files by default + assertthat::assert_that(all(colnames(regexes) == c("regex", "fun"))) + cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") + d <- dracarys::gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, + no_recurse = no_recurse, page_token = page_token, recursive = recursive + ) |> + dplyr::rowwise() |> + dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(dplyr::any_of(cols_sel)) + d +} + +#' dracarys GDS Download +#' +#' Download only GDS files that can be processed by dracarys. +#' +#' @inheritParams gds_list_files_dir +#' @inheritParams gds_list_files_filter_relevant +#' @param outdir Local output directory. +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). 
+#' @examples +#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' outdir <- sub("gds:/", "~/icav1/g", gdsdir) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) +#' } +#' +#' @export +dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, page_size = 100, dryrun = FALSE, + regexes = DR_FILE_REGEX, recursive = NULL) { + e <- emojifont::emoji + fs::dir_create(outdir) + d <- gds_list_files_filter_relevant( + gdsdir = gdsdir, pattern = pattern, regexes = regexes, + token = token, page_size = page_size, include_url = FALSE, + no_recurse = FALSE, page_token = NULL, + recursive = recursive + ) + d <- d |> + dplyr::mutate( + localpath = file.path(outdir, .data$bname), + gdspath = .data$path + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") + # download recognisable dracarys files to outdir/{bname} + if (!dryrun) { + cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") + res <- d |> + dplyr::rowwise() |> + dplyr::mutate( + dl = gds_file_download_api( + gds_fileid = .data$file_id, out_file = .data$localpath, token = token + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") + return(res) + } else { + cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") + d |> + dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> + as.data.frame() |> + print() + } +} + +#' GDS File Presigned URL +#' +#' Returns presigned URL of given GDS file. +#' +#' @param gds_fileid GDS file ID. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @return Presigned URL if valid. +#' @export +gds_file_presignedurl <- function(gds_fileid, token) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + url <- glue("{base_url}/files/{gds_fileid}") + res <- httr::GET( + url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] + assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) + presigned_url +} + +#' GDS File Download via API +#' +#' @param gds_fileid GDS file ID. +#' @param out_file Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' +#' @examples +#' \dontrun{ +#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" +#' out <- tempfile() +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' } +#' @export +gds_file_download_api <- function(gds_fileid, out_file, token) { + presigned_url <- gds_file_presignedurl(gds_fileid, token) + # keep quiet instead of logging presigned urls + status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) + assertthat::assert_that(status_code == 0) + normalizePath(out_file) +} + +#' GDS File Download via CLI +#' +#' @param gds Full path to GDS file. +#' @param out Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). 
+#' @export +gds_file_download_cli <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + token <- ica_token_validate(token) + system(glue("ica files download {gds} {out} --access-token {token}")) +} diff --git a/R/fs_local.R b/R/fs_local.R new file mode 100644 index 0000000..b90d94c --- /dev/null +++ b/R/fs_local.R @@ -0,0 +1,46 @@ +#' List Files in Local Directory +#' +#' Lists files in a local directory. +#' +#' @param localdir Path to local directory. +#' @return A tibble with file basename, size, last modification timestamp +#' and full path. +#' @examples +#' localdir <- system.file("R", package = "dracarys") +#' x <- local_list_files_dir(localdir) +#' @testexamples +#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +#' @export +local_list_files_dir <- function(localdir) { + fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> + dplyr::mutate( + bname = basename(.data$path), + lastmodified = .data$modification_time + ) |> + dplyr::select("bname", "size", "lastmodified", "path") +} + +#' List Relevant Files In Local Directory +#' +#' Lists relevant files in a local directory. +#' +#' @inheritParams local_list_files_dir +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' @return A tibble with file type, basename, size, last modified timestamp, and +#' path. +#' +#' @examples +#' localdir <- system.file("extdata/tso", package = "dracarys") +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") +#' x <- local_list_files_filter_relevant(localdir, regexes) +#' @testexamples +#' expect_equal(nrow(x), 1) +#' @export +local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX) { + local_list_files_dir(localdir = localdir) |> + dplyr::mutate( + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + ) |> + dplyr::filter(!is.na(.data$type)) |> + dplyr::select("type", "bname", "size", "lastmodified", localpath = "path") +} diff --git a/R/s3.R b/R/fs_s3.R similarity index 100% rename from R/s3.R rename to R/fs_s3.R diff --git a/R/ica.R b/R/ica.R index 3589688..951095b 100644 --- a/R/ica.R +++ b/R/ica.R @@ -1,226 +1,3 @@ -#' List Files in ICAv1 GDS Directory -#' -#' Lists files in a GDS directory. -#' -#' @param gdsdir Full path to GDS directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10 via ICA API). -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API). -#' @param page_token Page token (def: NULL). Used internally for recursion. -#' @param no_recurse Do not recurse through the file list i.e. just give the first items -#' without recursing further down the list using . -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). -#' -#' @return A tibble with file ID, basename, size, last modified timestamp, -#' full GDS path, and presigned URL if requested. 
-#' @examples -#' \dontrun{ -#' gdsdir <- file.path( -#' "gds://production/analysis_data/SBJ00699/umccrise", -#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" -#' ) -#' token <- ica_token_validate() -#' page_size <- 11 -#' include_url <- F -#' page_token <- NULL -#' no_recurse <- TRUE -#' recursive <- NULL -#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) -#' } -#' @export -gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, - include_url = FALSE, no_recurse = TRUE, page_token = NULL, - recursive = NULL) { - assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) - assertthat::assert_that(is.null(recursive) || is.logical(recursive)) - token <- ica_token_validate(token) - assertthat::assert_that(grepl("^gds://", gdsdir)) - gdsdir_original <- gdsdir - if (!grepl("/$", gdsdir)) { - gdsdir <- glue("{gdsdir}/") - } - base_url <- "https://aps2.platform.illumina.com/v1" - volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) - path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) - page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) - query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") - if (include_url) { - query_url <- glue("{query_url}&include=PresignedUrl") - } - if (!is.null(page_token)) { - query_url <- glue("{query_url}&pageToken={page_token}") - } - if (!is.null(recursive)) { - # without specifying recursive, it's true by default - recursive <- ifelse(recursive, "true", "false") - query_url <- glue("{query_url}&recursive={recursive}") - } - query_res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - if (j[["itemCount"]] == 0) { - if (likely_file(gdsdir_original)) { - cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") - } - # if there is a nextPageToken then abort, else continue - if (!is.null(j[["nextPageToken"]])) { - msg <- paste0( - "{date_log()} ERROR: ", - "No GDS files listed in the input directory. Please confirm you can ", - "access the following GDS input directory with your token: ", - "{.file {gdsdir_original}}" - ) - cli::cli_abort(msg) - } - } # endif - d <- j[["items"]] |> - purrr::map(\(x) c( - file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], - lastmodified = x[["timeModified"]], presigned_url = x[["presignedUrl"]] - )) |> - dplyr::bind_rows() - if (nrow(d) == 0) { - # We've iterated through all available items, and the next page has 0 items. - # So dplyr::bind_rows(d, NULL) will return d. - return(NULL) - } - res <- d |> - dplyr::mutate( - size = fs::as_fs_bytes(.data$size), - bname = basename(.data$path), - path = glue("gds://{volname}{.data$path}") - ) |> - dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "lastmodified", "path", "presigned_url"))) - if (!is.null(j[["nextPageToken"]]) && !no_recurse) { - res2 <- gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = NULL, - include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], - recursive = NULL - ) - res <- dplyr::bind_rows(res, res2) - } - res -} - -#' List Relevant Files In ICAv1 GDS Directory -#' -#' Lists relevant files in a GDS directory. 
-#' -#' @inheritParams gds_list_files_dir -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param regexes Tibble with `regex` and `fun`ction name. -#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, -#' and presigned URL if requested. -#' @examples -#' \dontrun{ -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' gds_list_files_filter_relevant(gdsdir) -#' } -#' @export -gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, - token = Sys.getenv("ICA_ACCESS_TOKEN"), - page_size = 100, include_url = FALSE, - no_recurse = TRUE, page_token = NULL, - recursive = NULL) { - pattern <- pattern %||% ".*" # keep all recognisable files by default - assertthat::assert_that(all(colnames(regexes) == c("regex", "fun"))) - cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") - d <- dracarys::gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, - no_recurse = no_recurse, page_token = page_token, recursive = recursive - ) |> - dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(dplyr::any_of(cols_sel)) - d -} - -#' dracarys GDS Download -#' -#' Download only GDS files that can be processed by dracarys. -#' -#' @inheritParams gds_list_files_dir -#' @inheritParams gds_list_files_filter_relevant -#' @param outdir Local output directory. -#' @param dryrun If TRUE, just list the files that will be downloaded (don't -#' download them). 
-#' @examples -#' \dontrun{ -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' outdir <- sub("gds:/", "~/icav1/g", gdsdir) -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) -#' } -#' -#' @export -dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL) { - e <- emojifont::emoji - fs::dir_create(outdir) - d <- gds_list_files_filter_relevant( - gdsdir = gdsdir, pattern = pattern, regexes = regexes, - token = token, page_size = page_size, include_url = FALSE, - no_recurse = FALSE, page_token = NULL, - recursive = recursive - ) - d <- d |> - dplyr::mutate( - localpath = file.path(outdir, .data$bname), - gdspath = .data$path - ) |> - dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") - # download recognisable dracarys files to outdir/{bname} - if (!dryrun) { - cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") - res <- d |> - dplyr::rowwise() |> - dplyr::mutate( - dl = gds_file_download_api( - gds_fileid = .data$file_id, out_file = .data$localpath, token = token - ), - localpath = normalizePath(.data$localpath) - ) |> - dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") - return(res) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") - d |> - dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> - as.data.frame() |> - print() - } -} - -#' GDS File Presigned URL -#' -#' Returns presigned URL of given GDS file. -#' -#' @param gds_fileid GDS file ID. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Presigned URL if valid. -#' @export -gds_file_presignedurl <- function(gds_fileid, token) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - url <- glue("{base_url}/files/{gds_fileid}") - res <- httr::GET( - url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] - assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) - presigned_url -} - #' List FASTQs In GDS Directory #' #' @param gdsdir GDS directory. 
@@ -247,11 +24,8 @@ gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = "fastq\\.gz$", "FASTQ" ) g <- gds_list_files_filter_relevant( - gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, - page_size = page_size, regexes = fq_regex - ) - assertthat::assert_that( - all(colnames(g) == c("type", "bname", "size", "file_id", "path")) + gdsdir = gdsdir, pattern = NULL, regexes = fq_regex, + token = token, page_size = page_size, include_url = include_url ) g |> dplyr::mutate( @@ -259,42 +33,10 @@ gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = size_num = as.numeric(.data$size) ) |> dplyr::select( - "type", "bname", "size", "size_chr", "size_num", "file_id", "path" + "type", "bname", "size", "lastmodified", "size_chr", "size_num", "file_id", "path" ) } -#' GDS File Download via API -#' -#' @param gds_fileid GDS file ID. -#' @param out_file Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' -#' @examples -#' \dontrun{ -#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -#' out <- tempfile() -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' } -#' @export -gds_file_download_api <- function(gds_fileid, out_file, token) { - presigned_url <- gds_file_presignedurl(gds_fileid, token) - # keep quiet instead of logging presigned urls - status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) - assertthat::assert_that(status_code == 0) - normalizePath(out_file) -} - -#' GDS File Download via CLI -#' -#' @param gds Full path to GDS file. -#' @param out Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @export -gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - token <- ica_token_validate(token) - system(glue("ica files download {gds} {out} --access-token {token}")) -} - #' List GDS Volumes #' #' Lists GDS volumes accessible by the provided ICA token. @@ -357,7 +99,7 @@ ica_token_exp <- function(token = Sys.getenv("ICA_ACCESS_TOKEN")) { structure(l$payload$exp, class = c("POSIXct", "POSIXt")) } -likely_file <- function(x) { +gds_likely_file <- function(x) { e <- c( "txt", "tsv", "csv", "html", "json", "stdout", "stderr", "stdouterr", "log", "vcf", "gz", "bam", "bai" diff --git a/R/utils.R b/R/utils.R index 99c6550..378588b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,51 +1,3 @@ -#' List Files in Local Directory -#' -#' Lists files in a local directory. -#' -#' @param localdir Path to local directory. -#' -#' @return A tibble with file basename, size, last modification timestamp -#' and full path. -#' @examples -#' localdir <- system.file("R", package = "dracarys") -#' x <- local_list_files_dir(localdir) -#' @testexamples -#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) -#' @export -local_list_files_dir <- function(localdir) { - fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> - dplyr::mutate( - bname = basename(.data$path), - lastmodified = .data$modification_time - ) |> - dplyr::select("bname", "size", "lastmodified", "path") -} - -#' List Relevant Files In Local Directory -#' -#' Lists relevant files in a local directory. -#' -#' @param path Path to local directory. -#' @param regexes Tibble with `regex` and `fun`ction name (see example). -#' @return A tibble with file type, basename, size, last modified timestamp, and -#' path. 
-#' -#' @examples -#' path <- system.file("extdata/tso", package = "dracarys") -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") -#' x <- local_list_files_filter_relevant(path, regexes) -#' @testexamples -#' expect_equal(nrow(x), 1) -#' @export -local_list_files_filter_relevant <- function(path, regexes = DR_FILE_REGEX) { - local_list_files_dir(localdir = path) |> - dplyr::mutate( - type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) - ) |> - dplyr::filter(!is.na(.data$type)) |> - dplyr::select("type", "bname", "size", "lastmodified", "path") -} - #' Print current timestamp for logging #' #' @return Current timestamp as character. diff --git a/man/Wf.Rd b/man/Wf.Rd index a99ffbb..7efeb70 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -34,10 +34,10 @@ um <- Wf$new(p, "umccrise") \if{html}{\out{
}} \describe{ \item{\code{path}}{(\code{character(1)})\cr -Output directory path with results.} +Path to directory with raw workflow results (from GDS, S3, or local filesystem).} -\item{\code{type}}{(\code{character(1)})\cr -Type of workflow (e.g. umccrise, sash).} +\item{\code{wname}}{(\code{character(1)})\cr +Name of workflow (e.g. umccrise, sash).} \item{\code{filesystem}}{(\code{character(1)})\cr Filesystem of \code{path}.} @@ -61,7 +61,7 @@ Filesystem of \code{path}.} \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, type = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, wname = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -69,7 +69,7 @@ Create a new Workflow object. \describe{ \item{\code{path}}{Output directory path with results.} -\item{\code{type}}{Type of workflow.} +\item{\code{wname}}{Name of workflow.} } \if{html}{\out{
}} } @@ -97,13 +97,21 @@ Print details about the Workflow. \subsection{Method \code{list_files()}}{ List all files under given path. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$list_files(max_objects = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$list_files( + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{max_objects}}{Maximum number of objects to list.} +\item{\code{max_files}}{Maximum number of files to list.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to \code{gds_list_files_dir} function.} } \if{html}{\out{
}} } @@ -116,7 +124,7 @@ List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( regexes = NULL, - max_objects = 1000, + max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... )}\if{html}{\out{
}} @@ -127,7 +135,12 @@ List dracarys files under given path \describe{ \item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} -\item{\code{page_size}}{Page size} +\item{\code{max_files}}{Maximum number of files to list.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} } \if{html}{\out{
}} } @@ -136,10 +149,37 @@ List dracarys files under given path \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} \subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$download_files(ica_token = Sys.getenv("ICA_ACCESS_TOKEN"))}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$download_files( + outdir, + regexes = NULL, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL +)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outdir}}{Path to output directory.} + +\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Maximum number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 14a25fd..f748163 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{dr_gds_download} \alias{dr_gds_download} \title{dracarys GDS Download} @@ -29,7 +29,7 @@ dr_gds_download( \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 6619188..86ff3f8 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{dr_s3_download} \alias{dr_s3_download} \title{dracarys S3 Download} diff --git a/man/gds_file_download_api.Rd b/man/gds_file_download_api.Rd index 1397860..b10a438 100644 --- a/man/gds_file_download_api.Rd +++ b/man/gds_file_download_api.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{gds_file_download_api} \alias{gds_file_download_api} \title{GDS File Download via API} diff --git a/man/gds_file_download.Rd b/man/gds_file_download_cli.Rd similarity index 61% rename from man/gds_file_download.Rd rename to man/gds_file_download_cli.Rd index 10ab7b5..f5946fb 100644 --- a/man/gds_file_download.Rd +++ b/man/gds_file_download_cli.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R -\name{gds_file_download} -\alias{gds_file_download} +% Please edit documentation in R/fs_icav1.R +\name{gds_file_download_cli} +\alias{gds_file_download_cli} \title{GDS File Download via CLI} \usage{ -gds_file_download(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) +gds_file_download_cli(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) } \arguments{ \item{gds}{Full path to GDS file.} diff --git a/man/gds_file_presignedurl.Rd b/man/gds_file_presignedurl.Rd index 064adb5..841db4a 100644 --- a/man/gds_file_presignedurl.Rd +++ b/man/gds_file_presignedurl.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{gds_file_presignedurl} \alias{gds_file_presignedurl} \title{GDS File Presigned URL} diff --git a/man/gds_list_files_dir.Rd b/man/gds_list_files_dir.Rd index 7d52c1d..d60b546 100644 --- a/man/gds_list_files_dir.Rd +++ b/man/gds_list_files_dir.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R \name{gds_list_files_dir} \alias{gds_list_files_dir} \title{List Files in ICAv1 GDS Directory} diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd index 88a6748..1e280b4 100644 --- a/man/gds_list_files_filter_relevant.Rd +++ b/man/gds_list_files_filter_relevant.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R +% Please edit documentation in R/fs_icav1.R 
\name{gds_list_files_filter_relevant} \alias{gds_list_files_filter_relevant} \title{List Relevant Files In ICAv1 GDS Directory} @@ -21,7 +21,7 @@ gds_list_files_filter_relevant( \item{pattern}{Pattern to further filter the returned file type tibble.} -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.} +\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd index 771aa9b..c66a6c3 100644 --- a/man/local_list_files_dir.Rd +++ b/man/local_list_files_dir.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/fs_local.R \name{local_list_files_dir} \alias{local_list_files_dir} \title{List Files in Local Directory} diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd index 4e17351..caf0832 100644 --- a/man/local_list_files_filter_relevant.Rd +++ b/man/local_list_files_filter_relevant.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/fs_local.R \name{local_list_files_filter_relevant} \alias{local_list_files_filter_relevant} \title{List Relevant Files In Local Directory} \usage{ -local_list_files_filter_relevant(path, regexes = DR_FILE_REGEX) +local_list_files_filter_relevant(localdir, regexes = DR_FILE_REGEX) } \arguments{ -\item{path}{Path to local directory.} +\item{localdir}{Path to local directory.} \item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} } @@ -19,7 +19,7 @@ path. Lists relevant files in a local directory. } \examples{ -path <- system.file("extdata/tso", package = "dracarys") +localdir <- system.file("extdata/tso", package = "dracarys") regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcFile") -x <- local_list_files_filter_relevant(path, regexes) +x <- local_list_files_filter_relevant(localdir, regexes) } diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd index 1598a92..79eb7d6 100644 --- a/man/s3_file_presignedurl.Rd +++ b/man/s3_file_presignedurl.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_file_presignedurl} \alias{s3_file_presignedurl} \title{S3 Generate Presigned URL} diff --git a/man/s3_list_files_dir.Rd b/man/s3_list_files_dir.Rd index 055692f..820da1d 100644 --- a/man/s3_list_files_dir.Rd +++ b/man/s3_list_files_dir.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_list_files_dir} \alias{s3_list_files_dir} \title{List Objects in AWS S3 Directory} diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd index bc55db3..1716825 100644 --- a/man/s3_list_files_filter_relevant.Rd +++ b/man/s3_list_files_filter_relevant.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_list_files_filter_relevant} \alias{s3_list_files_filter_relevant} \title{List Relevant Files In AWS S3 Directory} diff --git a/man/s3_search.Rd b/man/s3_search.Rd index c0d9f64..c4a1db2 100644 --- a/man/s3_search.Rd +++ b/man/s3_search.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit 
documentation in R/s3.R +% Please edit documentation in R/fs_s3.R \name{s3_search} \alias{s3_search} \title{Search AWS S3 Objects} From b8cc5cde6ee839f6d62d831668f2cfd14f8eaa90 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 20:12:27 +1000 Subject: [PATCH 13/24] Wf.R: downloading works across gds/s3 --- R/Wf.R | 61 ++++++++++++++++++++++++++++++++++++++++++++++--------- man/Wf.Rd | 56 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 101 insertions(+), 16 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 0ddc2ee..0dce589 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -5,7 +5,7 @@ #' #' A workflow has: #' -#' - an output directory path with all the result output files (either on GDS, S3 or +#' - a directory path with all the raw output files (either on GDS, S3 or #' local filesystem) #' - a subset of files that are of interest for ingestion #' - tibble with full path and basename columns @@ -15,9 +15,43 @@ #' #' @examples #' \dontrun{ -#' p1 <- "~/icav1/g/production/analysis_data" -#' p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -#' um <- Wf$new(p, "umccrise") +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "-chord\\.tsv\\.gz$", "UmChordTsvFile", +#' "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", +#' "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", +#' "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", +#' "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", +#' "-indel\\.tsv\\.gz$", "UmSigsIndelFile", +#' "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", +#' ) +#' +#' #---- LOCAL ----# +#' p1_local <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' um1 <- Wf$new(path = p, wname = "umccrise") +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant(regexes = regexes) +#' +#' #---- GDS ----# +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' um2 <- Wf$new(path = p, wname = "umccrise") +#' um2$list_files(max_files = 10) +#' um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +#' d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) +#' +#' #---- S3 ----# +#' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" +#' p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" +#' p <- file.path(p1_s3, p2_s3) +#' outdir <- sub("s3:/", "~/s3", p) +#' um3 <- Wf$new(path = p, wname = "sash") +#' um3$list_files(max_files = 10) +#' um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +#' d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) #' } #' #' @export @@ -74,7 +108,7 @@ Wf <- R6::R6Class( invisible(self) }, #' @description List all files under given path. - #' @param max_files Maximum number of files to list. + #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to `gds_list_files_dir` function. list_files = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { @@ -92,7 +126,7 @@ Wf <- R6::R6Class( }, #' @description List dracarys files under given path #' @param regexes Tibble with `regex` and `fun`ction name. - #' @param max_files Maximum number of files to list. 
+ #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. @@ -134,19 +168,26 @@ Wf <- R6::R6Class( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, page_size = max_files, dryrun = dryrun, recursive = recursive ) - self$filesystem <- "local" - self$path <- outdir + if (!dryrun) { + self$filesystem <- "local" + self$path <- outdir + } } else if (self$filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, max_objects = max_files, dryrun = dryrun ) - self$filesystem <- "local" - self$path <- outdir + if (!dryrun) { + self$filesystem <- "local" + self$path <- outdir + } } else { d <- self$list_files_filter_relevant(regexes = regexes) } return(d) + }, + tidy_files = function() { + } ) # end public ) diff --git a/man/Wf.Rd b/man/Wf.Rd index 7efeb70..19dd472 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -9,7 +9,7 @@ workflow run from a UMCCR workflow manager. A workflow has: \itemize{ -\item an output directory path with all the result output files (either on GDS, S3 or +\item a directory path with all the raw output files (either on GDS, S3 or local filesystem) \item a subset of files that are of interest for ingestion \itemize{ @@ -24,9 +24,43 @@ local filesystem) } \examples{ \dontrun{ -p1 <- "~/icav1/g/production/analysis_data" -p <- file.path(p1, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -um <- Wf$new(p, "umccrise") +regexes <- tibble::tribble( + ~regex, ~fun, + "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", + "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", + "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", + "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", + ) + +#---- LOCAL ----# +p1_local <- "~/icav1/g/production/analysis_data" +p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um1 <- Wf$new(path = p, wname = "umccrise") +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(regexes = regexes) + +#---- GDS ----# +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +um2 <- Wf$new(path = p, wname = "umccrise") +um2$list_files(max_files = 10) +um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) + +#---- S3 ----# +p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" +p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" +p <- file.path(p1_s3, p2_s3) +outdir <- sub("s3:/", "~/s3", p) +um3 <- Wf$new(path = p, wname = "sash") +um3$list_files(max_files = 10) +um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) } } @@ -52,6 +86,7 @@ Filesystem of \code{path}.} \item \href{#method-Wf-list_files}{\code{Wf$list_files()}} \item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} \item \href{#method-Wf-download_files}{\code{Wf$download_files()}} +\item 
\href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}}
 \item \href{#method-Wf-clone}{\code{Wf$clone()}}
 }
 }
@@ -107,7 +142,7 @@ List all files under given path.
 \subsection{Arguments}{
 \if{html}{\out{<div class="arguments">}}
 \describe{
-\item{\code{max_files}}{Maximum number of files to list.}
+\item{\code{max_files}}{Max number of files to list (for gds/s3 only).}
 
 \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}
 
@@ -135,7 +170,7 @@ List dracarys files under given path
 \describe{
 \item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.}
 
-\item{\code{max_files}}{Maximum number of files to list.}
+\item{\code{max_files}}{Max number of files to list (for gds/s3 only).}
 
 \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}
 
@@ -180,6 +215,15 @@ GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE
 }
 \if{html}{\out{</div>}}
 }
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-Wf-tidy_files"></a>}}
+\if{latex}{\out{\hypertarget{method-Wf-tidy_files}{}}}
+\subsection{Method \code{tidy_files()}}{
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{Wf$tidy_files()}\if{html}{\out{</div>}}
+}
+
 }
 \if{html}{\out{<hr>
}} \if{html}{\out{}} From 7074df8122f0bca05d0f010cbff9c56db4b84360 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 21:31:34 +1000 Subject: [PATCH 14/24] add tidy_files function --- NAMESPACE | 1 + R/Wf.R | 24 ++++++++---- R/fs_local.R | 16 ++++++-- man/Wf.Rd | 38 ++++++++++++------- man/local_list_files_dir.Rd | 4 +- man/local_list_files_filter_relevant.Rd | 8 +++- man/tidy_files.Rd | 27 +++++++++++++ .../test-roxytest-testexamples-fs_local.R | 20 ++++++++++ 8 files changed, 111 insertions(+), 27 deletions(-) create mode 100644 man/tidy_files.Rd create mode 100644 tests/testthat/test-roxytest-testexamples-fs_local.R diff --git a/NAMESPACE b/NAMESPACE index aa3957e..416306c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,7 @@ export(s3_list_files_dir) export(s3_list_files_filter_relevant) export(s3_search) export(session_info_kable) +export(tidy_files) export(time_metrics_process) export(umccr_tidy) export(write_dracarys_list_of_tbls) diff --git a/R/Wf.R b/R/Wf.R index 0dce589..3c2c3bd 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -30,8 +30,9 @@ #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") #' um1 <- Wf$new(path = p, wname = "umccrise") -#' um1$list_files(max_files = 10) -#' um1$list_files_filter_relevant(regexes = regexes) +#' um1$list_files(max_files = 100) +#' um1$list_files_filter_relevant(regexes = regexes, max_files = 100) +#' #' #' #---- GDS ----# #' p1_gds <- "gds://production/analysis_data" @@ -41,7 +42,10 @@ #' um2 <- Wf$new(path = p, wname = "umccrise") #' um2$list_files(max_files = 10) #' um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) -#' d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) +#' d <- um2$download_files( +#' outdir = outdir, regexes = regexes, ica_token = token, +#' max_files = 1000, dryrun = F +#' ) #' #' #---- S3 ----# #' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" @@ -120,7 +124,7 @@ Wf <- R6::R6Class( } else if (self$filesystem == "s3") { d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { - d <- local_list_files_dir(localdir = path) + d <- local_list_files_dir(localdir = path, max_files = max_files) } return(d) }, @@ -144,7 +148,9 @@ Wf <- R6::R6Class( s3dir = path, regexes = regexes, max_objects = max_files, ... ) } else { - d <- local_list_files_filter_relevant(localdir = path, regexes = regexes) + d <- local_list_files_filter_relevant( + localdir = path, regexes = regexes, max_files = max_files + ) } d }, @@ -182,12 +188,14 @@ Wf <- R6::R6Class( self$path <- outdir } } else { - d <- self$list_files_filter_relevant(regexes = regexes) + d <- self$list_files_filter_relevant(regexes = regexes, max_files = max_files) } return(d) }, - tidy_files = function() { - + #' @description Tidy given files. + #' @param x Tibble with `fun`ction to parse the file and `localpath` to the file. + tidy_files = function(x) { + tidy_files(x) } ) # end public ) diff --git a/R/fs_local.R b/R/fs_local.R index b90d94c..07397aa 100644 --- a/R/fs_local.R +++ b/R/fs_local.R @@ -3,6 +3,8 @@ #' Lists files in a local directory. #' #' @param localdir Path to local directory. +#' @param max_files Max files returned. +#' #' @return A tibble with file basename, size, last modification timestamp #' and full path. 
#' @examples @@ -11,13 +13,18 @@ #' @testexamples #' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) #' @export -local_list_files_dir <- function(localdir) { - fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> +local_list_files_dir <- function(localdir, max_files = NULL) { + d <- fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> dplyr::mutate( bname = basename(.data$path), lastmodified = .data$modification_time ) |> dplyr::select("bname", "size", "lastmodified", "path") + if (!is.null(max_files)) { + d <- d |> + dplyr::slice_head(n = max_files) + } + d } #' List Relevant Files In Local Directory @@ -26,6 +33,7 @@ local_list_files_dir <- function(localdir) { #' #' @inheritParams local_list_files_dir #' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' #' @return A tibble with file type, basename, size, last modified timestamp, and #' path. #' @@ -36,8 +44,8 @@ local_list_files_dir <- function(localdir) { #' @testexamples #' expect_equal(nrow(x), 1) #' @export -local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX) { - local_list_files_dir(localdir = localdir) |> +local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX, max_files = NULL) { + local_list_files_dir(localdir = localdir, max_files = max_files) |> dplyr::mutate( type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) ) |> diff --git a/man/Wf.Rd b/man/Wf.Rd index 19dd472..bc9b4b3 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -25,22 +25,23 @@ local filesystem) \examples{ \dontrun{ regexes <- tibble::tribble( - ~regex, ~fun, - "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", - "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", - "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", - "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", - "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", - "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", - "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", - ) + ~regex, ~fun, + "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", + "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", + "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", + "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", +) #---- LOCAL ----# p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um1 <- Wf$new(path = p, wname = "umccrise") -um1$list_files(max_files = 10) -um1$list_files_filter_relevant(regexes = regexes) +um1$list_files(max_files = 100) +um1$list_files_filter_relevant(regexes = regexes, max_files = 100) + #---- GDS ----# p1_gds <- "gds://production/analysis_data" @@ -50,7 +51,10 @@ token <- Sys.getenv("ICA_ACCESS_TOKEN") um2 <- Wf$new(path = p, wname = "umccrise") um2$list_files(max_files = 10) um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) -d <- um2$download_files(outdir = outdir, regexes = regexes, ica_token = token, max_files = 1000, dryrun = F) +d <- um2$download_files( + outdir = outdir, regexes = regexes, ica_token = token, + max_files = 1000, dryrun = F +) #---- S3 ----# p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" @@ -220,10 +224,18 @@ GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-tidy_files}{}}} \subsection{Method \code{tidy_files()}}{ +Tidy given files. 
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{Wf$tidy_files()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{Wf$tidy_files(x)}\if{html}{\out{</div>}}
 }
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{x}}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.}
+}
+\if{html}{\out{</div>}}
+}
 }
 \if{html}{\out{<hr>}}
 \if{html}{\out{<a id="method-Wf-clone"></a>}}
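
The tidy_files() contract documented above takes one row per file, pairing a parser name (`fun`) with a `localpath`, and returns the parsed tibbles in a `data` list-column. A minimal sketch of the mechanics (illustrative only: the path below is made up, and dracarys resolves the `fun` string via dr_func_eval() rather than a bare eval()):

x <- tibble::tibble(
  fun = "readr::read_tsv",
  localpath = "SBJ01155__PRJ211091-qc_summary.tsv.gz"
)
x |>
  dplyr::rowwise() |>
  dplyr::mutate(data = list(eval(parse(text = fun))(localpath)))
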
diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd
index c66a6c3..d77ca63 100644
--- a/man/local_list_files_dir.Rd
+++ b/man/local_list_files_dir.Rd
@@ -4,10 +4,12 @@
 \alias{local_list_files_dir}
 \title{List Files in Local Directory}
 \usage{
-local_list_files_dir(localdir)
+local_list_files_dir(localdir, max_files = NULL)
 }
 \arguments{
 \item{localdir}{Path to local directory.}
+
+\item{max_files}{Max files returned.}
 }
 \value{
 A tibble with file basename, size, last modification timestamp
diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd
index 4e17351..ac62065 100644
--- a/man/local_list_files_filter_relevant.Rd
+++ b/man/local_list_files_filter_relevant.Rd
@@ -4,12 +4,18 @@
 \alias{local_list_files_filter_relevant}
 \title{List Relevant Files In Local Directory}
 \usage{
-local_list_files_filter_relevant(localdir, regexes = DR_FILE_REGEX)
+local_list_files_filter_relevant(
+  localdir,
+  regexes = DR_FILE_REGEX,
+  max_files = NULL
+)
 }
 \arguments{
 \item{localdir}{Path to local directory.}
 
 \item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).}
+
+\item{max_files}{Max files returned.}
 }
 \value{
 A tibble with file type, basename, size, last modified timestamp, and
diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd
new file mode 100644
index 0000000..de96245
--- /dev/null
+++ b/man/tidy_files.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tidy.R
+\name{tidy_files}
+\alias{tidy_files}
+\title{Tidy Files}
+\usage{
+tidy_files(x)
+}
+\arguments{
+\item{x}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.}
+}
+\value{
+Tibble with parsed data in a \code{data} list-column.
+}
+\description{
+Tidy Files
+}
+\examples{
+\dontrun{
+p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
+p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz"
+p <- file.path(p1, p2)
+x <- tibble::tibble(fun = "readr::read_tsv", localpath = p)
+tidy_files(x)
+}
+
+}
diff --git a/tests/testthat/test-roxytest-testexamples-fs_local.R b/tests/testthat/test-roxytest-testexamples-fs_local.R
new file mode 100644
index 0000000..764c4aa
--- /dev/null
+++ b/tests/testthat/test-roxytest-testexamples-fs_local.R
@@ -0,0 +1,20 @@
+# Generated by roxytest: do not edit by hand!
+ +# File R/fs_local.R: @testexamples + +test_that("Function local_list_files_dir() @ L16", { + + localdir <- system.file("R", package = "dracarys") + x <- local_list_files_dir(localdir) + expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +}) + + +test_that("Function local_list_files_filter_relevant() @ L47", { + + localdir <- system.file("extdata/tso", package = "dracarys") + regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") + x <- local_list_files_filter_relevant(localdir, regexes) + expect_equal(nrow(x), 1) +}) + From 3be8ad574e7eb14c09b8152571b6b610a8d921ca Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 22:33:13 +1000 Subject: [PATCH 15/24] initialise Wf superclass with regexes --- R/Wf.R | 31 +++++++++++++++++-------------- man/Wf.Rd | 27 +++++++++++++-------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index 3c2c3bd..634c6ad 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -29,9 +29,9 @@ #' #---- LOCAL ----# #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -#' um1 <- Wf$new(path = p, wname = "umccrise") +#' um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) #' um1$list_files(max_files = 100) -#' um1$list_files_filter_relevant(regexes = regexes, max_files = 100) +#' um1$list_files_filter_relevant(max_files = 100) #' #' #' #---- GDS ----# @@ -39,11 +39,11 @@ #' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) #' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' um2 <- Wf$new(path = p, wname = "umccrise") +#' um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) #' um2$list_files(max_files = 10) -#' um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) #' d <- um2$download_files( -#' outdir = outdir, regexes = regexes, ica_token = token, +#' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = F #' ) #' @@ -52,9 +52,9 @@ #' p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" #' p <- file.path(p1_s3, p2_s3) #' outdir <- sub("s3:/", "~/s3", p) -#' um3 <- Wf$new(path = p, wname = "sash") +#' um3 <- Wf$new(path = p, wname = "sash", regexes = regexes) #' um3$list_files(max_files = 10) -#' um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +#' um3$list_files_filter_relevant(max_files = 50) #' d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) #' } #' @@ -68,13 +68,17 @@ Wf <- R6::R6Class( #' Name of workflow (e.g. umccrise, sash). #' @field filesystem (`character(1)`)\cr #' Filesystem of `path`. + #' @field regexes (`tibble()`)\cr + #' Tibble with file `regex` and `fun`ction to parse it. path = NULL, wname = NULL, filesystem = NULL, + regexes = NULL, #' @description Create a new Workflow object. #' @param path Output directory path with results. #' @param wname Name of workflow. - initialize = function(path = NULL, wname = NULL) { + #' @param regexes Tibble with file `regex` and `fun`ction to parse it. + initialize = function(path = NULL, wname = NULL, regexes = NULL) { wnames <- c( "bcl_convert", "tso_ctdna_tumor_only", @@ -98,6 +102,7 @@ Wf <- R6::R6Class( grepl("^s3://", path) ~ "s3", .default = "local" ) + self$regexes <- regexes }, #' @description Print details about the Workflow. #' @param ... (ignored). 
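# Not part of the patch: a quick illustration of how the `filesystem` field
# set in the hunk above is derived purely from the path prefix (the paths
# below and the helper name `fs_of` are made up for illustration).
fs_of <- function(path) {
  dplyr::case_when(
    grepl("^gds://", path) ~ "gds",
    grepl("^s3://", path) ~ "s3",
    .default = "local"
  )
}
fs_of(c("gds://production/x", "s3://bucket/x", "~/local/x"))
# [1] "gds"   "s3"    "local"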
@@ -129,14 +134,13 @@ Wf <- R6::R6Class( return(d) }, #' @description List dracarys files under given path - #' @param regexes Tibble with `regex` and `fun`ction name. #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(regexes = NULL, - max_files = 1000, + list_files_filter_relevant = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + regexes <- self$regexes assertthat::assert_that(!is.null(regexes)) path <- self$path if (self$filesystem == "gds") { @@ -156,18 +160,17 @@ Wf <- R6::R6Class( }, #' @description Download files from GDS/S3 to local filesystem. #' @param outdir Path to output directory. - #' @param regexes Tibble with `regex` and `fun`ction name. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param max_files Maximum number of files to list. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - download_files = function(outdir, regexes = NULL, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, recursive = NULL) { # TODO: add envvar checker path <- self$path + regexes <- self$regexes assertthat::assert_that(!is.null(regexes)) if (self$filesystem == "gds") { d <- dr_gds_download( diff --git a/man/Wf.Rd b/man/Wf.Rd index bc9b4b3..243f68b 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -38,9 +38,9 @@ regexes <- tibble::tribble( #---- LOCAL ----# p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -um1 <- Wf$new(path = p, wname = "umccrise") +um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) um1$list_files(max_files = 100) -um1$list_files_filter_relevant(regexes = regexes, max_files = 100) +um1$list_files_filter_relevant(max_files = 100) #---- GDS ----# @@ -48,11 +48,11 @@ p1_gds <- "gds://production/analysis_data" p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) token <- Sys.getenv("ICA_ACCESS_TOKEN") -um2 <- Wf$new(path = p, wname = "umccrise") +um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) um2$list_files(max_files = 10) -um2$list_files_filter_relevant(regexes = regexes, ica_token = token, max_files = 500) +um2$list_files_filter_relevant(ica_token = token, max_files = 500) d <- um2$download_files( - outdir = outdir, regexes = regexes, ica_token = token, + outdir = outdir, ica_token = token, max_files = 1000, dryrun = F ) @@ -61,9 +61,9 @@ p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275 p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" p <- file.path(p1_s3, p2_s3) outdir <- sub("s3:/", "~/s3", p) -um3 <- Wf$new(path = p, wname = "sash") +um3 <- Wf$new(path = p, wname = "sash", regexes = regexes) um3$list_files(max_files = 10) -um3$list_files_filter_relevant(regexes = regexes, max_files = 50) +um3$list_files_filter_relevant(max_files = 50) d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, 
dryrun = F) } @@ -79,6 +79,9 @@ Name of workflow (e.g. umccrise, sash).} \item{\code{filesystem}}{(\code{character(1)})\cr Filesystem of \code{path}.} + +\item{\code{regexes}}{(\code{tibble()})\cr +Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{}} } @@ -100,7 +103,7 @@ Filesystem of \code{path}.} \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf$new(path = NULL, wname = NULL)}\if{html}{\out{</div>
}} +\if{html}{\out{
<div class="r">}}\preformatted{Wf$new(path = NULL, wname = NULL, regexes = NULL)}\if{html}{\out{</div>
}} } \subsection{Arguments}{ @@ -109,6 +112,8 @@ Create a new Workflow object. \item{\code{path}}{Output directory path with results.} \item{\code{wname}}{Name of workflow.} + +\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{}} } @@ -162,7 +167,6 @@ List all files under given path. List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
<div class="r">}}\preformatted{Wf$list_files_filter_relevant( - regexes = NULL, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... )}\if{html}{\out{</div>
}} \describe{ -\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} - \item{\code{max_files}}{Max number of files to list (for gds/s3 only).} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} @@ -192,7 +194,6 @@ Download files from GDS/S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$download_files( outdir, - regexes = NULL, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, @@ -205,8 +206,6 @@ Download files from GDS/S3 to local filesystem. \describe{ \item{\code{outdir}}{Path to output directory.} -\item{\code{regexes}}{Tibble with \code{regex} and \code{fun}ction name.} - \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} \item{\code{max_files}}{Maximum number of files to list.} From f57f1012d3546f884c001db50377d0c3cf30b859 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 8 Sep 2024 22:34:17 +1000 Subject: [PATCH 16/24] Wf_umccrise class cleanup --- R/umccrise.R | 80 ++++++++++-------------------------- man/Wf_umccrise.Rd | 100 ++++++++++----------------------------------- 2 files changed, 43 insertions(+), 137 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index 8903583..bfb2902 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -5,44 +5,26 @@ #' #' @examples #' \dontrun{ -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() -#' SubjectID <- "SBJ01155" -#' SampleID_tumor <- "PRJ211091" -#' gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' gdsdir <- file.path(gdsdir1, "L2101566__L2101565") -#' obj <- Wf_umccrise$new(gdsdir) -#' gds_files <- obj$gds_list( -#' gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor -#' ) -#' outdir <- file.path(sub("gds://", "", gdsdir)) -#' outdir <- file.path(normalizePath("~/icav1/g"), outdir) -#' out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) -#' tidy1 <- obj$tidy(indir = outdir, out_files = out_files) +#' #---- LOCAL ----# +#' p1_local <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' um1 <- Wf_umccrise$new(path = p) +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant(max_files = 100) #' } #' #' @export Wf_umccrise <- R6::R6Class( "Wf_umccrise", + inherit = Wf, public = list( - #' @field path Path to the `umccrise` directory. - #' @field contents Tibble with file path, basename, and size. - path = NULL, - contents = NULL, #' @description Create a new Wf_umccrise object. - #' @param path Path to the `umccrise` directory. + #' @param path Output directory path with results. + #' @param wname Name of workflow. initialize = function(path = NULL) { - stopifnot(is.character(path), length(path) == 1) - self$path <- path - }, - #' @description List Relevant Files In umccrise GDS Directory - #' @param gdsdir Path to the `umccrise` directory. - #' @param SubjectID The SubjectID of the sample (used to construct path). - #' @param SampleID_tumor The SampleID of the tumor sample (used to construct path). - #' @param token ICA access token. 
- gds_list = function(gdsdir, SubjectID, SampleID_tumor, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - reg1 <- tibble::tribble( + wname <- "umccrise" + regexes <- tibble::tribble( ~regex, ~fun, - # "-somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile", "-chord\\.tsv\\.gz$", "UmChordTsvFile", "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", @@ -50,36 +32,18 @@ Wf_umccrise <- R6::R6Class( "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", "-indel\\.tsv\\.gz$", "UmSigsIndelFile", "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", - "multiqc_conpair.txt", "UmConpairMultiqc" - ) - reg2 <- tibble::tribble( - ~regex, ~fun, + "multiqc_conpair.txt", "UmConpairMultiqc", "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" ) - dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) - dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) - dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json - f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) - f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) - gds_files <- dplyr::bind_rows(f1, f2) - return(gds_files) + super$initialize(path = path, wname = wname, regexes = regexes) }, - - #' @description GDS File Download via API - #' - #' @param gds_files Tibble with bname and file_id for umccrise files. - #' @param outdir Directory to output files (loosely, not in a structured manner). - #' @param token ICA access token. - gds_download = function(gds_files, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - assertthat::assert_that(all(c("bname", "file_id") %in% colnames(gds_files))) - gds_files |> - dplyr::rowwise() |> - dplyr::mutate( - out = file.path(outdir, .data$bname), - out_dl = gds_file_download_api(.data$file_id, .data$out, token) - ) - }, - + # dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) + # dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) + # dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json + # f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) + # f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) + # gds_files <- dplyr::bind_rows(f1, f2) + # return(gds_files) #' @description Tidy up the output files from umccrise #' #' @param indir Path to the `umccrise` directory. @@ -103,9 +67,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read multiqc_conpair.txt file. - #' - #' @param x (`character(1)`)\cr - #' Path to multiqc_conpair.txt file. + #' @param x Path to file. 
read_conpairmultiqc = function(x) { um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 9d6abfb..7be8d45 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -8,42 +8,38 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow } \examples{ \dontrun{ -token <- Sys.getenv("ICA_ACCESS_TOKEN") |> ica_token_validate() -SubjectID <- "SBJ01155" -SampleID_tumor <- "PRJ211091" -gdsdir1 <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043" -gdsdir <- file.path(gdsdir1, "L2101566__L2101565") -obj <- Wf_umccrise$new(gdsdir) -gds_files <- obj$gds_list( - gdsdir = gdsdir, token = token, SubjectID = SubjectID, SampleID_tumor -) -outdir <- file.path(sub("gds://", "", gdsdir)) -outdir <- file.path(normalizePath("~/icav1/g"), outdir) -out_files <- obj$gds_download(gds_files = gds_files, outdir = outdir, token = token) -tidy1 <- obj$tidy(indir = outdir, out_files = out_files) +#---- LOCAL ----# +p1_local <- "~/icav1/g/production/analysis_data" +p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um1 <- Wf_umccrise$new(path = p) +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(max_files = 100) } } -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{umccrise} directory.} - -\item{\code{contents}}{Tibble with file path, basename, and size.} -} -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_umccrise} } \section{Methods}{ \subsection{Public methods}{ \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} -\item \href{#method-Wf_umccrise-gds_list}{\code{Wf_umccrise$gds_list()}} -\item \href{#method-Wf_umccrise-gds_download}{\code{Wf_umccrise$gds_download()}} \item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } } +\if{html}{\out{ +
Inherited methods + +
+}} \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_umccrise-new}{}}} @@ -56,60 +52,9 @@ Create a new Wf_umccrise object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to the \code{umccrise} directory.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_list}{}}} -\subsection{Method \code{gds_list()}}{ -List Relevant Files In umccrise GDS Directory -\subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_list( gdsdir, SubjectID, SampleID_tumor, token = Sys.getenv("ICA_ACCESS_TOKEN") )}\if{html}{\out{</div>
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{gdsdir}}{Path to the \code{umccrise} directory.} - -\item{\code{SubjectID}}{The SubjectID of the sample (used to construct path).} - -\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (used to construct path).} - -\item{\code{token}}{ICA access token.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-gds_download}{}}} -\subsection{Method \code{gds_download()}}{ -GDS File Download via API -\subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf_umccrise$gds_download( gds_files, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN") )}\if{html}{\out{</div>
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{gds_files}}{Tibble with bname and file_id for umccrise files.} - -\item{\code{outdir}}{Directory to output files (loosely, not in a structured manner).} +\item{\code{path}}{Output directory path with results.} -\item{\code{token}}{ICA access token.} +\item{\code{wname}}{Name of workflow.} } \if{html}{\out{
}} } @@ -145,8 +90,7 @@ Read multiqc_conpair.txt file. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to multiqc_conpair.txt file.} +\item{\code{x}}{Path to file.} } \if{html}{\out{
}} } From d7230feb6fd024d4a89e32b455ffa1d0ecf1c6fe Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 01:35:08 +1000 Subject: [PATCH 17/24] evaluate tidiers inside R6 env --- NAMESPACE | 1 - R/Wf.R | 54 +-- R/fs_icav1.R | 8 +- R/fs_s3.R | 8 +- R/regex.R | 6 +- R/tidy.R | 27 ++ R/umccrise.R | 328 +++++++++--------- man/UmccriseCanRepTables.Rd | 181 ---------- man/Wf.Rd | 40 ++- man/Wf_umccrise.Rd | 209 ++++++++++- man/dr_func_eval.Rd | 5 +- man/dr_gds_download.Rd | 7 +- man/dr_s3_download.Rd | 7 +- man/tidy_files.Rd | 10 +- .../test-roxytest-testexamples-regex.R | 2 +- 15 files changed, 466 insertions(+), 427 deletions(-) delete mode 100644 man/UmccriseCanRepTables.Rd diff --git a/NAMESPACE b/NAMESPACE index 416306c..c2f6f66 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,7 +28,6 @@ export(TsoSampleAnalysisResultsFile) export(TsoTargetRegionCoverageFile) export(TsoTmbFile) export(TsoTmbTraceTsvFile) -export(UmccriseCanRepTables) export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) diff --git a/R/Wf.R b/R/Wf.R index 634c6ad..bb18f62 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -30,9 +30,8 @@ #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") #' um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) -#' um1$list_files(max_files = 100) -#' um1$list_files_filter_relevant(max_files = 100) -#' +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant(max_files = 10) #' #' #---- GDS ----# #' p1_gds <- "gds://production/analysis_data" @@ -44,7 +43,7 @@ #' um2$list_files_filter_relevant(ica_token = token, max_files = 500) #' d <- um2$download_files( #' outdir = outdir, ica_token = token, -#' max_files = 1000, dryrun = F +#' max_files = 1000, dryrun = T #' ) #' #' #---- S3 ----# @@ -62,20 +61,17 @@ Wf <- R6::R6Class( "Wf", public = list( - #' @field path (`character(1)`)\cr - #' Path to directory with raw workflow results (from GDS, S3, or local filesystem). - #' @field wname (`character(1)`)\cr - #' Name of workflow (e.g. umccrise, sash). - #' @field filesystem (`character(1)`)\cr - #' Filesystem of `path`. - #' @field regexes (`tibble()`)\cr - #' Tibble with file `regex` and `fun`ction to parse it. + #' @field path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). + #' @field wname Name of workflow (e.g. umccrise, sash). + #' @field filesystem Filesystem of `path` (gds/s3/local). + #' @field regexes Tibble with file `regex` and `fun`ction to parse it. path = NULL, wname = NULL, filesystem = NULL, regexes = NULL, #' @description Create a new Workflow object. - #' @param path Output directory path with results. + #' @param path Path to directory with raw workflow results. #' @param wname Name of workflow. #' @param regexes Tibble with file `regex` and `fun`ction to parse it. initialize = function(path = NULL, wname = NULL, regexes = NULL) { @@ -117,11 +113,12 @@ Wf <- R6::R6Class( invisible(self) }, #' @description List all files under given path. + #' @param path Path with raw results. #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to `gds_list_files_dir` function. - list_files = function(max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - path <- self$path + list_files = function(path = self$path, max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) 
{ if (self$filesystem == "gds") { d <- gds_list_files_dir( gdsdir = path, token = ica_token, page_size = max_files, ... @@ -134,15 +131,15 @@ Wf <- R6::R6Class( return(d) }, #' @description List dracarys files under given path + #' @param path Path with raw results. #' @param max_files Max number of files to list (for gds/s3 only). #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param ... Passed on to the `gds_list_files_filter_relevant` or #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(max_files = 1000, + list_files_filter_relevant = function(path = self$path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { regexes <- self$regexes assertthat::assert_that(!is.null(regexes)) - path <- self$path if (self$filesystem == "gds") { d <- gds_list_files_filter_relevant( gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... @@ -159,23 +156,26 @@ Wf <- R6::R6Class( d }, #' @description Download files from GDS/S3 to local filesystem. + #' @param path Path with raw results. #' @param outdir Path to output directory. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param max_files Maximum number of files to list. + #' @param max_files Max number of files to list. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL) { + #' @param list_filter_fun Function to filter relevant files. + download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL, + list_filter_fun = NULL) { # TODO: add envvar checker - path <- self$path regexes <- self$regexes - assertthat::assert_that(!is.null(regexes)) + assertthat::assert_that(!is.null(regexes), !is.null(list_filter_fun)) if (self$filesystem == "gds") { d <- dr_gds_download( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, - page_size = max_files, dryrun = dryrun, recursive = recursive + page_size = max_files, dryrun = dryrun, recursive = recursive, + list_filter_fun = list_filter_fun ) if (!dryrun) { self$filesystem <- "local" @@ -184,7 +184,8 @@ Wf <- R6::R6Class( } else if (self$filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, - max_objects = max_files, dryrun = dryrun + max_objects = max_files, dryrun = dryrun, + list_filter_fun = list_filter_fun ) if (!dryrun) { self$filesystem <- "local" @@ -196,9 +197,10 @@ Wf <- R6::R6Class( return(d) }, #' @description Tidy given files. - #' @param x Tibble with `fun`ction to parse the file and `localpath` to the file. + #' @param x Tibble with `localpath` to file and the function `type` to parse it. tidy_files = function(x) { - tidy_files(x) + # awesomeness + tidy_files(x, envir = self) } ) # end public ) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 8f36e81..487669d 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -155,21 +155,23 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' @param outdir Local output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). 
+#' @param list_filter_fun Function to filter relevant GDS files. #' @examples #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' outdir <- sub("gds:/", "~/icav1/g", gdsdir) #' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) +#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) #' } #' #' @export dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL) { + regexes = DR_FILE_REGEX, recursive = NULL, + list_filter_fun = gds_list_files_filter_relevant) { e <- emojifont::emoji fs::dir_create(outdir) - d <- gds_list_files_filter_relevant( + d <- list_filter_fun( gdsdir = gdsdir, pattern = pattern, regexes = regexes, token = token, page_size = page_size, include_url = FALSE, no_recurse = FALSE, page_token = NULL, diff --git a/R/fs_s3.R b/R/fs_s3.R index 1bf8f4a..a4ca9ef 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -109,6 +109,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' @param outdir Path to output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). +#' @param list_filter_fun Function to filter relevant S3 files. #' @examples #' \dontrun{ #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" @@ -116,15 +117,16 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' s3dir <- file.path(p1, p2) #' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") #' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) #' } #' @export dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, - regexes = DR_FILE_REGEX, dryrun = FALSE) { + regexes = DR_FILE_REGEX, dryrun = FALSE, + list_filter_fun = s3_list_files_filter_relevant) { s3 <- paws.storage::s3() e <- emojifont::emoji fs::dir_create(outdir) - d <- s3_list_files_filter_relevant( + d <- list_filter_fun( s3dir = s3dir, pattern = NULL, regexes = regexes, max_objects = max_objects, presign = FALSE ) diff --git a/R/regex.R b/R/regex.R index 4ca485b..34f4079 100644 --- a/R/regex.R +++ b/R/regex.R @@ -83,6 +83,8 @@ FILES_DOWNLOAD_BUT_IGNORE <- c( #' @param f Name of function to evaluate. #' @param v Character vector of strings evaluating to functions. By default, #' this points to the functions in the DR_FILE_REGEX dracarys tibble. +#' @param envir the environment in which to evaluate the function e.g. use `self` +#' when using inside R6 classes. #' #' @return Evaluated function. 
#' @examples @@ -94,13 +96,13 @@ FILES_DOWNLOAD_BUT_IGNORE <- c( #' expect_equal(mean_1_to_10, base::mean(1:10)) #' expect_null(dr_func_eval("foo")) #' @export -dr_func_eval <- function(f, v = NULL) { +dr_func_eval <- function(f, v = NULL, envir = parent.frame()) { v <- v %||% DR_FILE_REGEX[["fun"]] if (!f %in% v) { return(NULL) } # evaluate string - eval(parse(text = f)) + eval(parse(text = f), envir = envir) } #' Get dracarys `DR_FILE_REGEX` diff --git a/R/tidy.R b/R/tidy.R index a1b9313..06fac82 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -1,3 +1,30 @@ +#' Tidy Files +#' +#' @param x Tibble with `localpath` to file and the function `type` to parse it. +#' @param envir the environment in which to evaluate the function e.g. use `self` +#' when using inside R6 classes. +#' +#' @return Tibble with parsed data in a `data` list-column. +#' @examples +#' \dontrun{ +#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" +#' p <- file.path(p1, p2) +#' x <- tibble::tibble(type = "readr::read_tsv", localpath = p) +#' tidy_files(x) +#' } +#' +#' @export +tidy_files <- function(x, envir = parent.frame()) { + assertthat::assert_that(is.data.frame(x)) + assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) + x |> + dplyr::rowwise() |> + dplyr::mutate( + data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) + ) +} + #' Tidy UMCCR Results #' #' Tidies UMCCR workflow results into a list of tibbles and writes individual diff --git a/R/umccrise.R b/R/umccrise.R index bfb2902..9c465c9 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -5,12 +5,29 @@ #' #' @examples #' \dontrun{ +#' #' #---- LOCAL ----# +#' SubjectID <- "SBJ01155" +#' SampleID_tumor <- "PRJ211091" #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -#' um1 <- Wf_umccrise$new(path = p) -#' um1$list_files(max_files = 10) -#' um1$list_files_filter_relevant(max_files = 100) +#' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um1$list_files(max_files = 100) +#' +#' #---- GDS ----# +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um2$list_files(max_files = 8) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) +#' d <- um2$download_files( +#' outdir = outdir, ica_token = token, +#' max_files = 1000, dryrun = F +#' ) #' } #' #' @export @@ -18,151 +35,103 @@ Wf_umccrise <- R6::R6Class( "Wf_umccrise", inherit = Wf, public = list( + #' @field SubjectID The SubjectID of the sample (needed for path lookup). + #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + SubjectID = NULL, + SampleID_tumor = NULL, #' @description Create a new Wf_umccrise object. #' @param path Output directory path with results. - #' @param wname Name of workflow. - initialize = function(path = NULL) { + #' @param SubjectID The SubjectID of the sample (needed for path lookup). + #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). 
+ initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { wname <- "umccrise" regexes <- tibble::tribble( ~regex, ~fun, - "-chord\\.tsv\\.gz$", "UmChordTsvFile", - "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", - "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", - "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", - "-dbs\\.tsv\\.gz$", "UmSigsDbsFile", - "-indel\\.tsv\\.gz$", "UmSigsIndelFile", - "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", - "multiqc_conpair.txt", "UmConpairMultiqc", - "-somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile" - ) - super$initialize(path = path, wname = wname, regexes = regexes) - }, - # dir_fin <- file.path(gdsdir, glue("{SubjectID}__{SampleID_tumor}")) - # dir_wrk <- file.path(gdsdir, "work", glue("{SubjectID}__{SampleID_tumor}")) - # dir_wrk_pcgr <- file.path(dir_wrk, "pcgr") # for pcgr json - # f1 <- gds_files_list_filter_relevant(gdsdir = dir_fin, token, page_size = 300, regexes = reg1) - # f2 <- gds_files_list_filter_relevant(gdsdir = dir_wrk_pcgr, token, page_size = 50, regexes = reg2) - # gds_files <- dplyr::bind_rows(f1, f2) - # return(gds_files) - #' @description Tidy up the output files from umccrise - #' - #' @param indir Path to the `umccrise` directory. - #' @param out_files Tibble with file path, basename, and size. - tidy = function(indir, out_files) { - obj_canrep <- UmccriseCanRepTables$new(indir) - canrep_parse <- obj_canrep$read() - pcgr_json <- out_files |> - dplyr::filter(.data$type == "PcgrJsonFile") |> - dplyr::pull("out") |> - PcgrJsonFile$new() |> - read() - conpair_tsv <- out_files |> - dplyr::filter(.data$type == "UmConpairMultiqc") |> - dplyr::pull("out") |> - self$read_conpairmultiqc() - d <- canrep_parse - d[["pcgr_json"]] <- pcgr_json[["metrics"]] - d[["conpair"]] <- conpair_tsv - d - }, + "-chord\\.tsv\\.gz$", "chordtsv", + "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", + "-snv_2015\\.tsv\\.gz$", "sigstsv", + "-snv_2020\\.tsv\\.gz$", "sigstsv", + "-dbs\\.tsv\\.gz$", "sigstsv", + "-indel\\.tsv\\.gz$", "sigstsv", + "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", + "multiqc_conpair.txt", "conpairmultiqc", + "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" + ) |> + dplyr::mutate(fun = paste0("read_", .data$fun)) - #' @description Read multiqc_conpair.txt file. - #' @param x Path to file. - read_conpairmultiqc = function(x) { - um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") - um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) - cnames <- list( - old = c( - "Sample", "concordance_concordance", "concordance_used_markers", - "concordance_total_markers", "concordance_marker_threshold", - "concordance_min_mapping_quality", "concordance_min_base_quality", - "contamination" - ), - new = c( - "sampleid", "contamination", "concordance", "markers_used", - "markers_total", "marker_threshold", - "mapq_min", "baseq_min" - ) - ) - ctypes <- list( - old = c("cddddddd"), - new = c("cddddddd") - ) - if (!file.exists(x)) { - return(empty_tbl(cnames$new, ctypes$new)) - } - d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) - assertthat::assert_that(all(colnames(d1) == cnames$old)) - d1 |> - dplyr::filter(!.data$Sample %in% um_ref_samples) |> - dplyr::relocate("contamination", .after = "Sample") |> - rlang::set_names(cnames$new) - } - ) -) - -#' UmccriseCanRepTables R6 Class -#' -#' @description -#' Reads and writes tidy versions of files within the `cancer_report_tables` directory -#' output from the `umccrise` workflow. 
-#' -#' @examples -#' \dontrun{ -#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565" -#' p <- file.path(p1, p2) -#' obj <- UmccriseCanRepTables$new(p) -#' obj$path -#' obj$contents -#' d <- obj$read() -#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") -#' } -#' -#' @export -UmccriseCanRepTables <- R6::R6Class( - "UmccriseCanRepTables", - public = list( - #' @field path Path to the `cancer_report_tables` directory. - #' @field contents Tibble with file path, basename, and size. - path = NULL, - contents = NULL, - #' @description Create a new UmccriseCanRepTables object. - #' @param path Path to the `cancer_report_tables` directory. - initialize = function(path = NULL) { - stopifnot(is.character(path), length(path) == 1) - self$path <- normalizePath(path) - self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> - dplyr::mutate( - bname = basename(.data$path), - size = as.character(trimws(.data$size)) - ) |> - dplyr::select("path", "bname", "size") + super$initialize(path = path, wname = wname, regexes = regexes) + self$SubjectID <- SubjectID + self$SampleID_tumor <- SampleID_tumor }, - #' @description Print details about the cancer_report_tables directory. + #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { - bnames <- self$contents |> - dplyr::mutate( - low = tolower(.data$bname), - ) |> - dplyr::arrange(.data$low) |> - dplyr::mutate( - n = dplyr::row_number(), - bn = glue("{.data$n}. {.data$bname} ({.data$size})") - ) |> - dplyr::pull("bn") - cat("#--- UmccriseCanRepTables ---#\n") - cat(glue("Path: {self$path}"), "\n") - cat("Contents:\n") - cat(bnames, sep = "\n") + res <- tibble::tribble( + ~var, ~value, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem, + "SubjectID", self$SubjectID, + "SampleID_tumor", self$SampleID_tumor + ) + print(res) invisible(self) }, - - #' @description Read `chord.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `chord.tsv.gz` file. + #' @description List dracarys files under given path + #' @param max_files Max number of files to list (for gds/s3 only). + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + path <- self$path + dir_final <- file.path(path, glue("{self$SubjectID}__{self$SampleID_tumor}")) + dir_work <- file.path(path, "work", glue("{self$SubjectID}__{self$SampleID_tumor}")) + dir_work_pcgr <- file.path(dir_work, "pcgr") # for pcgr json + f1 <- super$list_files_filter_relevant(path = dir_final, max_files = 300, ica_token = ica_token) + f2 <- super$list_files_filter_relevant(path = dir_work_pcgr, max_files = 50, ica_token = ica_token) + f_all <- dplyr::bind_rows(f1, f2) + return(f_all) + }, + #' @description Download files from GDS/S3 to local filesystem. + #' @param outdir Path to output directory. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param max_files Max number of files to list. + #' @param dryrun If TRUE, just list the files that will be downloaded (don't + #' download them). 
+ #' @param recursive Should files be returned recursively _in and under_ the specified + #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). + #' @param list_filter_fun Function to filter relevant files. + download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL) { + super$download_files( + outdir = outdir, ica_token = ica_token, max_files = max_files, + dryrun = dryrun, recursive = recursive, + list_filter_fun = self$list_files_filter_relevant + ) + }, + #' @description Read `pcgr.json.gz` file. + #' @param x Path to file. + read_pcgrjson = function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) + }, + #' @description Read `chord.tsv.gz` cancer report file. + #' @param x Path to file. read_chordtsv = function(x) { ct <- readr::cols_only( p_hrd = "d", @@ -173,10 +142,8 @@ UmccriseCanRepTables <- R6::R6Class( ) read_tsvgz(x, col_types = ct) }, - #' @description Read `hrdetect.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `hrdetect.tsv.gz` file. + #' @description Read `hrdetect.tsv.gz` cancer report file. + #' @param x Path to file. read_hrdetecttsv = function(x) { ct <- readr::cols( .default = "d", @@ -185,24 +152,17 @@ UmccriseCanRepTables <- R6::R6Class( read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) }, - - - #' @description Read `snv_20XX.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `snv_20XX.tsv.gz` file. - read_sigs = function(x) { + #' @description Read `snv_20XX.tsv.gz` cancer report file. + #' @param x Path to file. + read_sigstsv = function(x) { ct <- readr::cols( .default = "d", Signature = "c" ) read_tsvgz(x, col_types = ct) }, - - #' @description Read `qc_summary.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `qc_summary.tsv.gz` file. + #' @description Read `qc_summary.tsv.gz` cancer report file. + #' @param x Path to file. read_qcsummarytsv = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) d |> @@ -235,23 +195,51 @@ UmccriseCanRepTables <- R6::R6Class( "hypermutated", "bpi_enabled" ) }, - #' @description - #' Reads contents of `cancer_report_tables` directory output by umccrise. - #' - #' @return A list of tibbles. - #' @export - read = function() { - x <- self$path - # now return all as list elements - list( - chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), - hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), - sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), - sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), - sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), - sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), - qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() + #' @description Read multiqc_conpair.txt file. 
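+      #' @details Rows for the UMCCR reference samples (`Alice`, `Bob`, `Chen`,
+      #' `Elon`, `Dakota` and their `_T`/`_B` variants) are dropped, and the
+      #' MultiQC column names are mapped to the tidier set defined in `cnames`
+      #' in the body.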
+ #' @param x Path to file. + read_conpairmultiqc = function(x) { + um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") + um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) + cnames <- list( + old = c( + "Sample", "concordance_concordance", "concordance_used_markers", + "concordance_total_markers", "concordance_marker_threshold", + "concordance_min_mapping_quality", "concordance_min_base_quality", + "contamination" + ), + new = c( + "sampleid", "contamination", "concordance", "markers_used", + "markers_total", "marker_threshold", + "mapq_min", "baseq_min" + ) ) + ctypes <- list( + old = c("cddddddd"), + new = c("cddddddd") + ) + if (!file.exists(x)) { + return(empty_tbl(cnames$new, ctypes$new)) + } + d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) + assertthat::assert_that(all(colnames(d1) == cnames$old)) + d1 |> + dplyr::filter(!.data$Sample %in% um_ref_samples) |> + dplyr::relocate("contamination", .after = "Sample") |> + rlang::set_names(cnames$new) } - ) + ) # end public ) + +# read = function() { +# x <- self$path +# # now return all as list elements +# list( +# chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), +# hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), +# sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), +# sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), +# sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), +# sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), +# qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() +# ) +# } diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd deleted file mode 100644 index 5c43358..0000000 --- a/man/UmccriseCanRepTables.Rd +++ /dev/null @@ -1,181 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/umccrise.R -\name{UmccriseCanRepTables} -\alias{UmccriseCanRepTables} -\title{UmccriseCanRepTables R6 Class} -\description{ -Reads and writes tidy versions of files within the \code{cancer_report_tables} directory -output from the \code{umccrise} workflow. -} -\examples{ -\dontrun{ -p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565" -p <- file.path(p1, p2) -obj <- UmccriseCanRepTables$new(p) -obj$path -obj$contents -d <- obj$read() -obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") -} - -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{cancer_report_tables} directory.} - -\item{\code{contents}}{Tibble with file path, basename, and size.} -} -\if{html}{\out{
}} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-UmccriseCanRepTables-new}{\code{UmccriseCanRepTables$new()}} -\item \href{#method-UmccriseCanRepTables-print}{\code{UmccriseCanRepTables$print()}} -\item \href{#method-UmccriseCanRepTables-read_chordtsv}{\code{UmccriseCanRepTables$read_chordtsv()}} -\item \href{#method-UmccriseCanRepTables-read_hrdetecttsv}{\code{UmccriseCanRepTables$read_hrdetecttsv()}} -\item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} -\item \href{#method-UmccriseCanRepTables-read_qcsummarytsv}{\code{UmccriseCanRepTables$read_qcsummarytsv()}} -\item \href{#method-UmccriseCanRepTables-read}{\code{UmccriseCanRepTables$read()}} -\item \href{#method-UmccriseCanRepTables-clone}{\code{UmccriseCanRepTables$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-new}{}}} -\subsection{Method \code{new()}}{ -Create a new UmccriseCanRepTables object. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$new(path = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{cancer_report_tables} directory.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-print}{}}} -\subsection{Method \code{print()}}{ -Print details about the cancer_report_tables directory. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$print(...)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{...}}{(ignored).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_chordtsv}{}}} -\subsection{Method \code{read_chordtsv()}}{ -Read \code{chord.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_chordtsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{chord.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_hrdetecttsv}{}}} -\subsection{Method \code{read_hrdetecttsv()}}{ -Read \code{hrdetect.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_hrdetecttsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{hrdetect.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_sigs}{}}} -\subsection{Method \code{read_sigs()}}{ -Read \code{snv_20XX.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_sigs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{snv_20XX.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_qcsummarytsv}{}}} -\subsection{Method \code{read_qcsummarytsv()}}{ -Read \code{qc_summary.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_qcsummarytsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{qc_summary.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read}{}}} -\subsection{Method \code{read()}}{ -Reads contents of \code{cancer_report_tables} directory output by umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -A list of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/Wf.Rd b/man/Wf.Rd index 243f68b..ce29182 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -39,9 +39,8 @@ regexes <- tibble::tribble( p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) -um1$list_files(max_files = 100) -um1$list_files_filter_relevant(max_files = 100) - +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(max_files = 10) #---- GDS ----# p1_gds <- "gds://production/analysis_data" @@ -53,7 +52,7 @@ um2$list_files(max_files = 10) um2$list_files_filter_relevant(ica_token = token, max_files = 500) d <- um2$download_files( outdir = outdir, ica_token = token, - max_files = 1000, dryrun = F + max_files = 1000, dryrun = T ) #---- S3 ----# @@ -71,17 +70,14 @@ d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryr \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{(\code{character(1)})\cr -Path to directory with raw workflow results (from GDS, S3, or local filesystem).} +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} -\item{\code{wname}}{(\code{character(1)})\cr -Name of workflow (e.g. umccrise, sash).} +\item{\code{wname}}{Name of workflow (e.g. umccrise, sash).} -\item{\code{filesystem}}{(\code{character(1)})\cr -Filesystem of \code{path}.} +\item{\code{filesystem}}{Filesystem of \code{path} (gds/s3/local).} -\item{\code{regexes}}{(\code{tibble()})\cr -Tibble with file \code{regex} and \code{fun}ction to parse it.} +\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{
}} } @@ -109,7 +105,7 @@ Create a new Workflow object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Output directory path with results.} +\item{\code{path}}{Path to directory with raw workflow results.} \item{\code{wname}}{Name of workflow.} @@ -142,6 +138,7 @@ Print details about the Workflow. List all files under given path. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files( + path = self$path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -151,6 +148,8 @@ List all files under given path. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ +\item{\code{path}}{Path with raw results.} + \item{\code{max_files}}{Max number of files to list (for gds/s3 only).} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} @@ -167,6 +166,7 @@ List all files under given path. List dracarys files under given path \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( + path = self$path, max_files = 1000, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ... @@ -176,6 +176,8 @@ List dracarys files under given path \subsection{Arguments}{ \if{html}{\out{
}} \describe{ +\item{\code{path}}{Path with raw results.} + \item{\code{max_files}}{Max number of files to list (for gds/s3 only).} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} @@ -193,28 +195,34 @@ the \code{s3_list_files_filter_relevant} function.} Download files from GDS/S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf$download_files( + path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, - recursive = NULL + recursive = NULL, + list_filter_fun = NULL )}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ +\item{\code{path}}{Path with raw results.} + \item{\code{outdir}}{Path to output directory.} \item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{\code{max_files}}{Maximum number of files to list.} +\item{\code{max_files}}{Max number of files to list.} \item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't download them).} \item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{\code{list_filter_fun}}{Function to filter relevant files.} } \if{html}{\out{
}} } @@ -231,7 +239,7 @@ Tidy given files. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{x}}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.} +\item{\code{x}}{Tibble with \code{localpath} to file and the function \code{type} to parse it.} } \if{html}{\out{
}} } diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 7be8d45..d8eea05 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -8,23 +8,57 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow } \examples{ \dontrun{ + #---- LOCAL ----# +SubjectID <- "SBJ01155" +SampleID_tumor <- "PRJ211091" p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") -um1 <- Wf_umccrise$new(path = p) -um1$list_files(max_files = 10) -um1$list_files_filter_relevant(max_files = 100) +um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um1$list_files(max_files = 100) + +#---- GDS ----# +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um2$list_files(max_files = 8) +um2$list_files_filter_relevant(ica_token = token, max_files = 500) +d <- um2$download_files( + outdir = outdir, ica_token = token, + max_files = 1000, dryrun = F +) + } } \section{Super class}{ \code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_umccrise} } +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +} +\if{html}{\out{
}} +} \section{Methods}{ \subsection{Public methods}{ \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} -\item \href{#method-Wf_umccrise-tidy}{\code{Wf_umccrise$tidy()}} +\item \href{#method-Wf_umccrise-print}{\code{Wf_umccrise$print()}} +\item \href{#method-Wf_umccrise-list_files_filter_relevant}{\code{Wf_umccrise$list_files_filter_relevant()}} +\item \href{#method-Wf_umccrise-download_files}{\code{Wf_umccrise$download_files()}} +\item \href{#method-Wf_umccrise-read_pcgrjson}{\code{Wf_umccrise$read_pcgrjson()}} +\item \href{#method-Wf_umccrise-read_chordtsv}{\code{Wf_umccrise$read_chordtsv()}} +\item \href{#method-Wf_umccrise-read_hrdetecttsv}{\code{Wf_umccrise$read_hrdetecttsv()}} +\item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} +\item \href{#method-Wf_umccrise-read_qcsummarytsv}{\code{Wf_umccrise$read_qcsummarytsv()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } @@ -32,10 +66,7 @@ um1$list_files_filter_relevant(max_files = 100) \if{html}{\out{
Inherited methods
@@ -46,7 +77,7 @@ um1$list_files_filter_relevant(max_files = 100) \subsection{Method \code{new()}}{ Create a new Wf_umccrise object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -54,26 +85,172 @@ Create a new Wf_umccrise object. \describe{ \item{\code{path}}{Output directory path with results.} -\item{\code{wname}}{Name of workflow.} +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} } \if{html}{\out{
}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-tidy}{}}} -\subsection{Method \code{tidy()}}{ -Tidy up the output files from umccrise +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-print}{}}} +\subsection{Method \code{print()}}{ +Print details about the Workflow. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$tidy(indir, out_files)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$print(...)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{indir}}{Path to the \code{umccrise} directory.} +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$list_files_filter_relevant( + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -\item{\code{out_files}}{Tibble with file path, basename, and size.} +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-download_files}{}}} +\subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$download_files( + outdir, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outdir}}{Path to output directory.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Max number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{\code{list_filter_fun}}{Function to filter relevant files.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgrjson}{}}} +\subsection{Method \code{read_pcgrjson()}}{ +Read \code{pcgr.json.gz} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgrjson(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_chordtsv}{}}} +\subsection{Method \code{read_chordtsv()}}{ +Read \code{chord.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_chordtsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdetecttsv}{}}} +\subsection{Method \code{read_hrdetecttsv()}}{ +Read \code{hrdetect.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdetecttsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} +\subsection{Method \code{read_sigstsv()}}{ +Read \code{snv_20XX.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsummarytsv}{}}} +\subsection{Method \code{read_qcsummarytsv()}}{ +Read \code{qc_summary.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsummarytsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} } \if{html}{\out{
}} } diff --git a/man/dr_func_eval.Rd b/man/dr_func_eval.Rd index 2dfedf1..0697c7f 100644 --- a/man/dr_func_eval.Rd +++ b/man/dr_func_eval.Rd @@ -4,13 +4,16 @@ \alias{dr_func_eval} \title{Evaluate dracarys Function} \usage{ -dr_func_eval(f, v = NULL) +dr_func_eval(f, v = NULL, envir = parent.frame()) } \arguments{ \item{f}{Name of function to evaluate.} \item{v}{Character vector of strings evaluating to functions. By default, this points to the functions in the DR_FILE_REGEX dracarys tibble.} + +\item{envir}{the environment in which to evaluate the function e.g. use \code{self} +when using inside R6 classes.} } \value{ Evaluated function. diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index f748163..677a654 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -12,7 +12,8 @@ dr_gds_download( page_size = 100, dryrun = FALSE, regexes = DR_FILE_REGEX, - recursive = NULL + recursive = NULL, + list_filter_fun = gds_list_files_filter_relevant ) } \arguments{ @@ -33,6 +34,8 @@ download them).} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{list_filter_fun}{Function to filter relevant GDS files.} } \description{ Download only GDS files that can be processed by dracarys. @@ -42,7 +45,7 @@ Download only GDS files that can be processed by dracarys. gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" outdir <- sub("gds:/", "~/icav1/g", gdsdir) regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") -dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = F) +dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) } } diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 86ff3f8..1a6475c 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -10,7 +10,8 @@ dr_s3_download( max_objects = 100, pattern = NULL, regexes = DR_FILE_REGEX, - dryrun = FALSE + dryrun = FALSE, + list_filter_fun = s3_list_files_filter_relevant ) } \arguments{ @@ -26,6 +27,8 @@ dr_s3_download( \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} + +\item{list_filter_fun}{Function to filter relevant S3 files.} } \description{ Download only S3 files that can be processed by dracarys. @@ -37,6 +40,6 @@ p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" s3dir <- file.path(p1, p2) regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = F) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) } } diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index de96245..d61b441 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -4,10 +4,13 @@ \alias{tidy_files} \title{Tidy Files} \usage{ -tidy_files(x) +tidy_files(x, envir = parent.frame()) } \arguments{ -\item{x}{Tibble with \code{fun}ction to parse the file and \code{localpath} to the file.} +\item{x}{Tibble with \code{localpath} to file and the function \code{type} to parse it.} + +\item{envir}{the environment in which to evaluate the function e.g. use \code{self} +when using inside R6 classes.} } \value{ Tibble with parsed data in a \code{data} list-column. 
@@ -20,7 +23,8 @@ Tidy Files p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" p <- file.path(p1, p2) -x <- tibble::tibble(fun = "readr::read_tsv", localpath = p) +x <- tibble::tibble(type = "readr::read_tsv", localpath = p) +tidy_files(x) ) } diff --git a/tests/testthat/test-roxytest-testexamples-regex.R b/tests/testthat/test-roxytest-testexamples-regex.R index 91a7f26..05787ff 100644 --- a/tests/testthat/test-roxytest-testexamples-regex.R +++ b/tests/testthat/test-roxytest-testexamples-regex.R @@ -2,7 +2,7 @@ # File R/regex.R: @testexamples -test_that("Function dr_func_eval() @ L97", { +test_that("Function dr_func_eval() @ L99", { mean_1_to_10 <- dr_func_eval("mean", v = c("mean", "sd"))(1:10) x <- system.file("extdata/tso/sample705.fragment_length_hist.json.gz", package = "dracarys") From a23dbad0ee6f0ebb3edb61d42cdabc0b926294aa Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 20:47:38 +1000 Subject: [PATCH 18/24] mirror gds/s3 subdir when downloading --- R/fs_icav1.R | 14 +++++++++++--- R/fs_s3.R | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 487669d..e38d851 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -160,7 +160,11 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" #' outdir <- sub("gds:/", "~/icav1/g", gdsdir) -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "multiqc_data\\.json$", "MultiqcJsonFile", +#' "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" +#' ) #' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) #' } #' @@ -179,11 +183,15 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN ) d <- d |> dplyr::mutate( - localpath = file.path(outdir, .data$bname), + gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), + gdspath_minus_gdsdir_outdir = fs::dir_create( + file.path(outdir, dirname(gdspath_minus_gdsdir)) + ), + localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), gdspath = .data$path ) |> dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") - # download recognisable dracarys files to outdir/{bname} + # download recognisable dracarys files to outdir//{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") res <- d |> diff --git a/R/fs_s3.R b/R/fs_s3.R index a4ca9ef..2dac50c 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -115,9 +115,13 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" #' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" #' s3dir <- file.path(p1, p2) -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "multiqc_data\\.json$", "MultiqcJsonFile", +#' "pcgr.*\\.json\\.gz$", "pcgrjson" +#' ) #' outdir <- sub("s3:/", "~/s3", s3dir) -#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) +#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 500, regexes = regexes, dryrun = F) #' } #' @export dr_s3_download <- function(s3dir, outdir, 
max_objects = 100, pattern = NULL, @@ -132,11 +136,15 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, ) d <- d |> dplyr::mutate( - localpath = file.path(outdir, .data$bname), + s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), + s3path_minus_s3dir_outdir = fs::dir_create( + file.path(outdir, dirname(s3path_minus_s3dir)) + ), + localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname), s3path = .data$path ) |> dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path") - # download recognisable dracarys files to outdir/{bname} + # download recognisable dracarys files to outdir//{bname} if (!dryrun) { cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") d |> From 2e6001434964aa172e8a7b2202d9b2736c027d67 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 22:54:20 +1000 Subject: [PATCH 19/24] wrap sig parsers --- R/Wf.R | 1 + R/umccrise.R | 42 +++++++++++++++++----- man/Wf.Rd | 1 + man/Wf_umccrise.Rd | 79 ++++++++++++++++++++++++++++++++++++++++-- man/dr_gds_download.Rd | 6 +++- man/dr_s3_download.Rd | 8 +++-- man/tidy_files.Rd | 1 - 7 files changed, 122 insertions(+), 16 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index bb18f62..f101975 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -45,6 +45,7 @@ #' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = T #' ) +#' d_tidy <- um2$tidy_files(d) #' #' #---- S3 ----# #' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" diff --git a/R/umccrise.R b/R/umccrise.R index 9c465c9..14f4479 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -7,12 +7,15 @@ #' \dontrun{ #' #' #---- LOCAL ----# -#' SubjectID <- "SBJ01155" -#' SampleID_tumor <- "PRJ211091" +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" #' p1_local <- "~/icav1/g/production/analysis_data" -#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +#' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) -#' um1$list_files(max_files = 100) +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant() +#' d <- um1$download_files(max_files = 1000, dryrun = T) +#' d_tidy <- um1$tidy_files(d) #' #' #---- GDS ----# #' SubjectID <- "SBJ03043" @@ -28,6 +31,7 @@ #' outdir = outdir, ica_token = token, #' max_files = 1000, dryrun = F #' ) +#' d_tidy <- um2$tidy_files(d) #' } #' #' @export @@ -49,10 +53,10 @@ Wf_umccrise <- R6::R6Class( ~regex, ~fun, "-chord\\.tsv\\.gz$", "chordtsv", "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", - "-snv_2015\\.tsv\\.gz$", "sigstsv", - "-snv_2020\\.tsv\\.gz$", "sigstsv", - "-dbs\\.tsv\\.gz$", "sigstsv", - "-indel\\.tsv\\.gz$", "sigstsv", + "-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", + "-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", + "-dbs\\.tsv\\.gz$", "sigsdbstsv", + "-indel\\.tsv\\.gz$", "sigsindeltsv", "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", "multiqc_conpair.txt", "conpairmultiqc", "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" @@ -152,7 +156,7 @@ Wf_umccrise <- R6::R6Class( read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) }, - #' @description Read `snv_20XX.tsv.gz` cancer report file. + #' @description Read signature cancer report file. #' @param x Path to file. 
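+    #' @details Common reader for the four signature files; assumes a gzipped
+    #' TSV with a character `Signature` column, with all remaining columns
+    #' parsed as doubles.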
read_sigstsv = function(x) { ct <- readr::cols( @@ -161,6 +165,26 @@ Wf_umccrise <- R6::R6Class( ) read_tsvgz(x, col_types = ct) }, + #' @description Read `snv_2015.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2015tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `snv_2020.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2020tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `dbs.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsdbstsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `indel.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsindeltsv = function(x) { + self$read_sigstsv(x) + }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. read_qcsummarytsv = function(x) { diff --git a/man/Wf.Rd b/man/Wf.Rd index ce29182..f3e6767 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -54,6 +54,7 @@ d <- um2$download_files( outdir = outdir, ica_token = token, max_files = 1000, dryrun = T ) +d_tidy <- um2$tidy_files(d) #---- S3 ----# p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index d8eea05..6fc7693 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -15,7 +15,8 @@ SampleID_tumor <- "PRJ211091" p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) -um1$list_files(max_files = 100) +um1$list_files(max_files = 10) +um1$list_files_filter_relevant() #---- GDS ----# SubjectID <- "SBJ03043" @@ -31,7 +32,7 @@ d <- um2$download_files( outdir = outdir, ica_token = token, max_files = 1000, dryrun = F ) - +d_tidy <- um2$tidy_files(d) } } @@ -58,6 +59,10 @@ d <- um2$download_files( \item \href{#method-Wf_umccrise-read_chordtsv}{\code{Wf_umccrise$read_chordtsv()}} \item \href{#method-Wf_umccrise-read_hrdetecttsv}{\code{Wf_umccrise$read_hrdetecttsv()}} \item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} +\item \href{#method-Wf_umccrise-read_sigssnv2015tsv}{\code{Wf_umccrise$read_sigssnv2015tsv()}} +\item \href{#method-Wf_umccrise-read_sigssnv2020tsv}{\code{Wf_umccrise$read_sigssnv2020tsv()}} +\item \href{#method-Wf_umccrise-read_sigsdbstsv}{\code{Wf_umccrise$read_sigsdbstsv()}} +\item \href{#method-Wf_umccrise-read_sigsindeltsv}{\code{Wf_umccrise$read_sigsindeltsv()}} \item \href{#method-Wf_umccrise-read_qcsummarytsv}{\code{Wf_umccrise$read_qcsummarytsv()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} @@ -225,11 +230,79 @@ Read \code{hrdetect.tsv.gz} cancer report file. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} \subsection{Method \code{read_sigstsv()}}{ -Read \code{snv_20XX.tsv.gz} cancer report file. +Read signature cancer report file. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2015tsv}{}}} +\subsection{Method \code{read_sigssnv2015tsv()}}{ +Read \code{snv_2015.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2015tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2020tsv}{}}} +\subsection{Method \code{read_sigssnv2020tsv()}}{ +Read \code{snv_2020.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2020tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsdbstsv}{}}} +\subsection{Method \code{read_sigsdbstsv()}}{ +Read \code{dbs.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsdbstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsindeltsv}{}}} +\subsection{Method \code{read_sigsindeltsv()}}{ +Read \code{indel.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsindeltsv(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 677a654..61aa7f8 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -44,7 +44,11 @@ Download only GDS files that can be processed by dracarys. \dontrun{ gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" outdir <- sub("gds:/", "~/icav1/g", gdsdir) -regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +regexes <- tibble::tribble( + ~regex, ~fun, + "multiqc_data\\\\.json$", "MultiqcJsonFile", + "-somatic\\\\.pcgr\\\\.json\\\\.gz$", "pcgrjson" +) dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) } diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 1a6475c..674ed93 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -38,8 +38,12 @@ Download only S3 files that can be processed by dracarys. p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" s3dir <- file.path(p1, p2) -regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") +regexes <- tibble::tribble( + ~regex, ~fun, + "multiqc_data\\\\.json$", "MultiqcJsonFile", + "pcgr.*\\\\.json\\\\.gz$", "pcgrjson" +) outdir <- sub("s3:/", "~/s3", s3dir) -dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 300, regexes = regexes, dryrun = T) +dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 500, regexes = regexes, dryrun = F) } } diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd index d61b441..8d1849b 100644 --- a/man/tidy_files.Rd +++ b/man/tidy_files.Rd @@ -25,7 +25,6 @@ p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" p <- file.path(p1, p2) x <- tibble::tibble(type = "readr::read_tsv", localpath = p) tidy_files(x) -) } } From 3125056ba50b72aa7c37d6ad9d70bc0bb6348843 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 9 Sep 2024 23:34:16 +1000 Subject: [PATCH 20/24] add Wf class write method --- R/Wf.R | 23 +++++++++++++++++++++++ R/umccrise.R | 8 +++++++- man/Wf.Rd | 26 ++++++++++++++++++++++++++ man/Wf_umccrise.Rd | 15 ++++++++++++--- 4 files changed, 68 insertions(+), 4 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index f101975..aa5bfae 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -202,6 +202,29 @@ Wf <- R6::R6Class( tidy_files = function(x) { # awesomeness tidy_files(x, envir = self) + }, + #' @description Write tidy data. + #' @param x Tibble with tidy `data` and file `type`. + #' @param outdir Directory path to output tidy files. + #' @param prefix Prefix of output files. + #' @param format Format of output files. + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). 
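+      #' @details Each tidy tibble is written via `write_dracarys()` under the
+      #' prefix `{prefix}_{section}`, where `section` is the parser `type` with
+      #' its `read_` prefix dropped; the named list of tibbles is returned
+      #' invisibly. A hypothetical call:
+      #' `um1$write(d_tidy, outdir = tempdir(), prefix = "sampleA")`.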
+ write = function(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL) { + assertthat::assert_that(!is.null(prefix)) + if (!is.null(outdir)) { + prefix <- file.path(outdir, prefix) + } + d_write <- x |> + dplyr::rowwise() |> + dplyr::mutate( + section = sub("read_", "", .data$type), + p = glue("{prefix}_{.data$section}"), + out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)) + ) |> + dplyr::ungroup() |> + dplyr::select("section", "data") |> + tibble::deframe() + invisible(d_write) } ) # end public ) diff --git a/R/umccrise.R b/R/umccrise.R index 14f4479..2b962da 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -14,8 +14,14 @@ #' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) #' um1$list_files(max_files = 10) #' um1$list_files_filter_relevant() -#' d <- um1$download_files(max_files = 1000, dryrun = T) +#' d <- um1$download_files(max_files = 1000, dryrun = F) #' d_tidy <- um1$tidy_files(d) +#' d_write <- um1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) #' #' #---- GDS ----# #' SubjectID <- "SBJ03043" diff --git a/man/Wf.Rd b/man/Wf.Rd index f3e6767..f785399 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -91,6 +91,7 @@ local filesystem).} \item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} \item \href{#method-Wf-download_files}{\code{Wf$download_files()}} \item \href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}} +\item \href{#method-Wf-write}{\code{Wf$write()}} \item \href{#method-Wf-clone}{\code{Wf$clone()}} } } @@ -246,6 +247,31 @@ Tidy given files. } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-write}{}}} +\subsection{Method \code{write()}}{ +Write tidy data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$write(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Tibble with tidy \code{data} and file \code{type}.} + +\item{\code{outdir}}{Directory path to output tidy files.} + +\item{\code{prefix}}{Prefix of output files.} + +\item{\code{format}}{Format of output files.} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 6fc7693..1ff1a33 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -10,13 +10,21 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow \dontrun{ #---- LOCAL ----# -SubjectID <- "SBJ01155" -SampleID_tumor <- "PRJ211091" +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" p1_local <- "~/icav1/g/production/analysis_data" -p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) um1$list_files(max_files = 10) um1$list_files_filter_relevant() +d <- um1$download_files(max_files = 1000, dryrun = F) +d_tidy <- um1$tidy_files(d) +d_write <- um1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) #---- GDS ----# SubjectID <- "SBJ03043" @@ -73,6 +81,7 @@ d_tidy <- um2$tidy_files(d) }} From efb827ecb3c0aea0f1e0f6118bc55f81e2db2e81 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 10 Sep 2024 11:03:42 +1000 Subject: [PATCH 21/24] add Wf_umccrise_download_tidy_write --- NAMESPACE | 1 + R/fs_icav1.R | 2 +- R/fs_s3.R | 2 +- R/umccrise.R | 79 +++++++++++++++++++++----- man/Wf_umccrise.Rd | 11 +++- man/Wf_umccrise_download_tidy_write.Rd | 54 ++++++++++++++++++ 6 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 man/Wf_umccrise_download_tidy_write.Rd diff --git a/NAMESPACE b/NAMESPACE index c2f6f66..b10ca1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,7 @@ export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) export(Wf_umccrise) +export(Wf_umccrise_download_tidy_write) export(WgsContigMeanCovFile) export(WgsCoverageMetricsFile) export(WgsFineHistFile) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index e38d851..16df2c9 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -185,7 +185,7 @@ dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN dplyr::mutate( gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), gdspath_minus_gdsdir_outdir = fs::dir_create( - file.path(outdir, dirname(gdspath_minus_gdsdir)) + file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) ), localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), gdspath = .data$path diff --git a/R/fs_s3.R b/R/fs_s3.R index 2dac50c..34c437e 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -138,7 +138,7 @@ dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, dplyr::mutate( s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path), s3path_minus_s3dir_outdir = fs::dir_create( - file.path(outdir, dirname(s3path_minus_s3dir)) + file.path(outdir, dirname(.data$s3path_minus_s3dir)) ), localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname), s3path = .data$path diff --git a/R/umccrise.R b/R/umccrise.R index 2b962da..8f1deae 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -9,6 +9,7 @@ #' #---- LOCAL ----# #' SubjectID <- "SBJ03043" #' SampleID_tumor <- "PRJ230004" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' p1_local <- "~/icav1/g/production/analysis_data" #' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) @@ -26,6 +27,7 @@ #' #---- GDS ----# #' 
SubjectID <- "SBJ03043" #' SampleID_tumor <- "PRJ230004" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") #' p1_gds <- "gds://production/analysis_data" #' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) @@ -38,6 +40,12 @@ #' max_files = 1000, dryrun = F #' ) #' d_tidy <- um2$tidy_files(d) +#' d_write <- um2$write( +#' d_tidy, +#' outdir = file.path(outdir, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) #' } #' #' @export @@ -50,7 +58,8 @@ Wf_umccrise <- R6::R6Class( SubjectID = NULL, SampleID_tumor = NULL, #' @description Create a new Wf_umccrise object. - #' @param path Output directory path with results. + #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). #' @param SubjectID The SubjectID of the sample (needed for path lookup). #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { @@ -64,7 +73,7 @@ Wf_umccrise <- R6::R6Class( "-dbs\\.tsv\\.gz$", "sigsdbstsv", "-indel\\.tsv\\.gz$", "sigsindeltsv", "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", - "multiqc_conpair.txt", "conpairmultiqc", + "multiqc_conpair\\.txt$", "conpairmultiqc", "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -260,16 +269,56 @@ Wf_umccrise <- R6::R6Class( ) # end public ) -# read = function() { -# x <- self$path -# # now return all as list elements -# list( -# chord = grep_file(x, "-chord\\.tsv\\.gz$") |> self$read_chordtsv(), -# hrdetect = grep_file(x, "-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), -# sigs2015 = grep_file(x, "-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), -# sigs2020 = grep_file(x, "-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), -# sigsdbs = grep_file(x, "-dbs\\.tsv\\.gz$") |> self$read_sigs(), -# sigsindel = grep_file(x, "-indel\\.tsv\\.gz$") |> self$read_sigs(), -# qcsum = grep_file(x, "-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() -# ) -# } +#' umccrise Download Tidy and Write +#' +#' Downloads files from the `umccrise` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' local filesystem). +#' @param SubjectID The SubjectID of the sample (needed for path lookup). +#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param outdir Path to output directory. +#' @param max_files Max number of files to list. +#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @return List where each element is a tidy tibble of a umccrise file. 
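+#' @details A convenience wrapper that instantiates `Wf_umccrise`, runs
+#' `download_files()`, then `tidy_files()` and `write()`; with `dryrun = TRUE`
+#' only the file listing is returned.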
+#' +#' @examples +#' \dontrun{ +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' d <- Wf_umccrise_download_tidy_write( +#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' outdir = outdir, +#' dryrun = F +#' ) +#' } +#' @export +Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor, + outdir, max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE) { + um <- Wf_umccrise$new( + path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + ) + d_dl <- um$download_files( + outdir = outdir, ica_token = ica_token, + max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- um$tidy_files(d_dl) + d_write <- um$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" + ) + return(d_write) + } + return(d_dl) +} diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 1ff1a33..5fd5ade 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -12,6 +12,7 @@ Reads and writes tidy versions of files from the \code{umccrise} workflow #---- LOCAL ----# SubjectID <- "SBJ03043" SampleID_tumor <- "PRJ230004" +prefix <- glue("{SubjectID}__{SampleID_tumor}") p1_local <- "~/icav1/g/production/analysis_data" p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) @@ -29,6 +30,7 @@ d_write <- um1$write( #---- GDS ----# SubjectID <- "SBJ03043" SampleID_tumor <- "PRJ230004" +prefix <- glue("{SubjectID}__{SampleID_tumor}") p1_gds <- "gds://production/analysis_data" p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) @@ -41,6 +43,12 @@ d <- um2$download_files( max_files = 1000, dryrun = F ) d_tidy <- um2$tidy_files(d) +d_write <- um2$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) } } @@ -97,7 +105,8 @@ Create a new Wf_umccrise object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Output directory path with results.} +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} \item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd new file mode 100644 index 0000000..5364d61 --- /dev/null +++ b/man/Wf_umccrise_download_tidy_write.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/umccrise.R +\name{Wf_umccrise_download_tidy_write} +\alias{Wf_umccrise_download_tidy_write} +\title{umccrise Download Tidy and Write} +\usage{ +Wf_umccrise_download_tidy_write( + path, + SubjectID, + SampleID_tumor, + outdir, + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE +) +} +\arguments{ +\item{path}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} + +\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} + +\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} + +\item{outdir}{Path to output directory.} + +\item{max_files}{Max number of files to list.} + +\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{dryrun}{If TRUE, just list the files that will be downloaded (don't +download them).} +} +\value{ +List where each element is a tidy tibble of a umccrise file. +} +\description{ +Downloads files from the \code{umccrise} workflow and writes them in a tidy format. +} +\examples{ +\dontrun{ +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +d <- Wf_umccrise_download_tidy_write( + path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, + outdir = outdir, + dryrun = F +) +} +} From ea2bb516c14dbca580face3524c8282a3b197e04 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 10 Sep 2024 21:35:44 +1000 Subject: [PATCH 22/24] umccrise: add dl_and_tidy.R script --- R/umccrise.R | 5 +- inst/rmd/umccr_workflows/umccrise/.gitignore | 1 + .../umccr_workflows/umccrise/dl_and_tidy.R | 79 +++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 inst/rmd/umccr_workflows/umccrise/.gitignore create mode 100755 inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R diff --git a/R/umccrise.R b/R/umccrise.R index 8f1deae..ffbe5c5 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -282,6 +282,7 @@ Wf_umccrise <- R6::R6Class( #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). +#' @param format Format of output files. #' @return List where each element is a tidy tibble of a umccrise file. 
#'
#' @examples
#' \dontrun{
#' SubjectID <- "SBJ03043"
#' SampleID_tumor <- "PRJ230004"
#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise")
#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063")
#' outdir <- file.path(sub("gds:/", "~/icav1/g", p))
#' token <- Sys.getenv("ICA_ACCESS_TOKEN")
#' d <- Wf_umccrise_download_tidy_write(
#'   path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor,
#'   outdir = outdir,
#'   dryrun = F
#' )
#' }
#' @export
 Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor,
-                                            outdir, max_files = 1000,
+                                            outdir, format = "rds", max_files = 1000,
                                             ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
                                             dryrun = FALSE) {
   um <- Wf_umccrise$new(
@@ -316,7 +317,7 @@ Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor,
     d_tidy,
     outdir = file.path(outdir, "dracarys_tidy"),
     prefix = glue("{SubjectID}__{SampleID_tumor}"),
-    format = "tsv"
+    format = format
   )
   return(d_write)
 }
diff --git a/inst/rmd/umccr_workflows/umccrise/.gitignore b/inst/rmd/umccr_workflows/umccrise/.gitignore
new file mode 100644
index 0000000..674563b
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/.gitignore
@@ -0,0 +1 @@
+nogit
diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
new file mode 100755
index 0000000..82f9974
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
@@ -0,0 +1,79 @@
+#!/usr/bin/env Rscript
+
+{
+  require(dplyr)
+  require(assertthat, include.only = "assert_that")
+  require(dracarys, include.only = "Wf_umccrise_download_tidy_write")
+  require(glue, include.only = "glue")
+  require(here, include.only = "here")
+  require(rportal, include.only = c("portaldb_query_workflow"))
+  require(tidyr, include.only = "separate_wider_delim")
+}
+
+query_workflow_umccrise <- function(start_date, end_date) {
+  q1 <- glue(
+    "WHERE \"type_name\" = 'umccrise' ",
+    "AND \"start\" >= date(\'{start_date}\') ",
+    "AND \"end\" <= date(\'{end_date}\') ",
+    "ORDER BY \"start\" DESC;"
+  )
+  rportal::portaldb_query_workflow(q1)
+}
+
+query_limsrow_libids <- function(libids) {
+  assertthat::assert_that(!is.null(libids), all(grepl("^L", libids)))
+  libids <- unique(libids) |>
+    paste(collapse = "|")
+  q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');")
+  rportal::portaldb_query_limsrow(q1)
+}
+
+# first read in the workflows table, extract metadata, then join with lims
+start_date <- "2024-08-29"
+end_date <- "2024-09-01"
+meta_raw <- query_workflow_umccrise(start_date, end_date)
+meta <- meta_raw |>
+  rportal::meta_umccrise()
+lims_raw <- query_limsrow_libids(meta$LibraryID_tumor)
+lims <- lims_raw |>
+  tidyr::separate_wider_delim(
+    library_id,
+    delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start"
+  ) |>
+  select(
+    subject_id, library_id, sample_id, sample_name,
+    external_subject_id, external_sample_id,
+    project_name, project_owner,
+    source, quality
+  ) |>
+  distinct()
+table(lims$library_id %in% meta$LibraryID_tumor) # double-check
+
+meta_lims <- meta |>
+  left_join(lims, by = c("LibraryID_tumor" = "library_id")) |>
+  mutate(rownum = row_number()) |>
+  select(
+    rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal,
+    SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id,
+    project_owner, project_name, source, quality
+  )
+meta_lims |>
+  saveRDS(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/meta/{start_date}_{end_date}.rds")))
+
+d <- meta_lims |>
+  rowwise() |>
+  mutate(
+    indir = .data$gds_outdir_umccrise,
+    outdir = file.path(sub("gds://", "", .data$indir)),
+    outdir = file.path(normalizePath("~/icav1/g"), .data$outdir),
+    res = list(
+      dracarys::Wf_umccrise_download_tidy_write(
+        path = .data$indir, SubjectID = .data$SubjectID, SampleID_tumor = .data$SampleID_tumor,
+        outdir = .data$outdir,
From 8baa52c4183419f6c2f680685f2e28224f2bcd33 Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Tue, 10 Sep 2024 23:43:29 +1000
Subject: [PATCH 23/24] add umccrise summary report

---
 inst/rmd/umccr_workflows/umccrise/.gitignore |   2 +
 inst/rmd/umccr_workflows/umccrise/render.sh  |   9 +
 .../umccrise/summary_umccrise.qmd            | 224 ++++++++++++++++++
 man/Wf_umccrise_download_tidy_write.Rd       |   3 +
 4 files changed, 238 insertions(+)
 create mode 100644 inst/rmd/umccr_workflows/umccrise/render.sh
 create mode 100644 inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd

diff --git a/inst/rmd/umccr_workflows/umccrise/.gitignore b/inst/rmd/umccr_workflows/umccrise/.gitignore
index 674563b..0182e8f 100644
--- a/inst/rmd/umccr_workflows/umccrise/.gitignore
+++ b/inst/rmd/umccr_workflows/umccrise/.gitignore
@@ -1 +1,3 @@
 nogit
+
+/.quarto/
diff --git a/inst/rmd/umccr_workflows/umccrise/render.sh b/inst/rmd/umccr_workflows/umccrise/render.sh
new file mode 100644
index 0000000..bfff2c7
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/render.sh
@@ -0,0 +1,9 @@
+date_start="2024-08-29"
+date_end="2024-09-01"
+out="umccrise_${date_start}_${date_end}.html"
+
+quarto render summary_umccrise.qmd \
+  -P date_start:${date_start} \
+  -P date_end:${date_end} \
+  -o ${out} \
+  --output-dir nogit/html
diff --git a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
new file mode 100644
index 0000000..50b7329
--- /dev/null
+++ b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
@@ -0,0 +1,224 @@
+---
+title: "{{< meta params.title >}}"
+subtitle: "Period: `r paste(params$date_start, 'to', params$date_end)`"
+author: "UMCCR - Genomics Platform Group"
+date: now
+date-format: "YYYY-MM-DD HH:mm Z"
+execute:
+  echo: false
+format:
+  html:
+    toc: true
+    toc-expand: 1
+    toc-title: Contents
+    toc-location: body
+    highlight-style: github
+    number-sections: false
+    link-external-icon: true
+    link-external-newwindow: true
+    embed-resources: true
+    code-copy: true
+    code-link: true
+    code-fold: true
+    code-block-border-left: true
+    smooth-scroll: true
+    grid:
+      body-width: 1300px
+params:
+  title: "UMCCR umccrise Workflow Summary"
+  date_start: "XXXX-XX-XX"
+  date_end: "XXXX-XX-XX"
+---
+
+```{r}
+#| label: pkg_load
+#| message: false
+{
+  require(dplyr) # import all dplyr funcs
+  require(readr, include.only = c("read_rds"))
+  require(purrr, include.only = c("map"))
+  require(tidyr, include.only = c("unnest_wider"))
+  require(dracarys, include.only = c("session_info_kable"))
+  require(glue, include.only = "glue")
+  require(here, include.only = "here")
+  require(knitr, include.only = "kable")
+  require(reactable, include.only = "reactable")
+  require(ggplot2, include.only = c("ggplot", "aes"))
+  require(lubridate, include.only = c("as_datetime"))
+  require(plotly, include.only = c("ggplotly"))
+}
+set.seed(42)
+```
+
+```{r}
+#| label: load_data
+date_start <- params$date_start
+date_end <- params$date_end
+d_raw <- readr::read_rds(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/results_{date_start}_{date_end}.rds")))
+```
+
+## Results
+
+```{r}
+#| label: process
+# one row per file type - not all samples have sigsdbstsv
+d <- d_raw |>
+  tidyr::unnest_longer(res, indices_to = "filetype")
+# main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor")
+main_cols <- c("portal_run_id")
+qcsum <- d |>
+  filter(filetype == "qcsummarytsv") |>
+  unnest_wider(res) |>
+  select(all_of(main_cols), qc_status_hmf:bpi_enabled)
+hrd_chord <- d |>
+  filter(filetype == "chordtsv") |>
+  unnest_wider(res) |>
+  select(all_of(main_cols),
+    chord_p_hrd = "p_hrd",
+    chord_hr_status = "hr_status",
+    chord_hrd_type = "hrd_type",
+    chord_p_BRCA1 = "p_BRCA1",
+    chord_p_BRCA2 = "p_BRCA2"
+  )
+hrd_hrdetect <- d |>
+  filter(filetype == "hrdetecttsv") |>
+  unnest_wider(res) |>
+  select(all_of(main_cols), hrdetect_prob = "Probability")
+sigs_snv2015 <- d |>
+  filter(filetype == "sigssnv2015tsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_snv2020 <- d |>
+  filter(filetype == "sigssnv2020tsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_dbs <- d |>
+  filter(filetype == "sigsdbstsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_indel <- d |>
+  filter(filetype == "sigsindeltsv") |>
+  select(all_of(main_cols), res) |>
+  tidyr::unnest_wider(res) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+
+# signatures
+dsig <- bind_rows(
+  list(
+    snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel
+  ),
+  .id = "Sig_group"
+)
+
+# keep top two ranked sigs
+dsig_filt <- dsig |>
+  group_by(Sig_group, portal_run_id) |>
+  mutate(tot_sig_vars = sum(Contribution)) |>
+  arrange(Rank) |>
+  slice_head(n = 2) |>
+  # some sigs have same Rank so use explicit sig_rank
+  mutate(sig_rank = row_number()) |>
+  ungroup() |>
+  mutate(
+    sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})")
+  ) |>
+  select(Sig_group, portal_run_id, sig_rank, sig_summary) |>
+  tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |>
+  mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |>
+  select(portal_run_id, sig_top2)
+
+dall <- d_raw |>
+  select(
+    date_analysed = "start", portal_run_id,
+    SubjectID, LibraryID_tumor, SampleID_tumor,
+    external_subject_id, external_sample_id,
+    project_owner, project_name, source, quality
+  ) |>
+  left_join(qcsum, by = "portal_run_id") |>
+  left_join(hrd_chord, by = "portal_run_id") |>
+  left_join(hrd_hrdetect, by = "portal_run_id")
+```
+
+```{r}
+#| label: hrd_plot
+#| fig-width: 15
+#| fig-height: 15
+p1 <- dall |>
+  mutate(sbj = glue("{SubjectID}_{LibraryID_tumor}")) |>
+  select(date_analysed, sbj, chord = chord_p_hrd, hrdetect = hrdetect_prob) |>
+  tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") |>
+  ggplot2::ggplot(aes(x = date_analysed, y = probability, label = sbj)) +
+  ggplot2::geom_point(aes(colour = method)) +
+  ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) +
+  ggplot2::theme_bw() +
+  ggplot2::ggtitle("CHORD vs. HRDetect per SubjectID")
+
+plotly::ggplotly(p1)
+```
HRDetect per SubjectID") + +plotly::ggplotly(p1) +``` + +### Signature Results + +```{r} +#| label: sig_results +#| fig-width: 15 +#| fig-height: 65 + +sig_order2015 <- paste0("Sig", 1:30) +sig_order2020 <- paste0( + "SBS", + c( + 1:6, + paste0(7, c("a", "b", "c", "d")), + 8:9, + paste0(10, c("a", "b", "c", "d")), + 11:16, + paste0(17, c("a", "b")), + 18:60, + 84:94 + ) +) + +p2_prep <- dsig |> + filter( + Sig_group == "snv2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> + mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> + select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> + mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) +p2 <- p2_prep |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::theme_bw(base_size = 7) +# ggplot2::facet_wrap(~Sig_group, ncol = 1) + +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) +``` + +## Metadata Summary + +::: {.panel-tabset .nav-pills} + +### Project Name/Owner + +```{r} +#| label: project_owner_name +dall |> + count(project_name, project_owner) |> + knitr::kable() +``` + +### Source / Quality + +```{r} +#| label: source_quality +count(dall, source, quality) |> knitr::kable() +``` + +::: + +
diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd
index 5364d61..05ca0bf 100644
--- a/man/Wf_umccrise_download_tidy_write.Rd
+++ b/man/Wf_umccrise_download_tidy_write.Rd
@@ -9,6 +9,7 @@ Wf_umccrise_download_tidy_write(
   SubjectID,
   SampleID_tumor,
   outdir,
+  format = "rds",
   max_files = 1000,
   ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
   dryrun = FALSE
@@ -24,6 +25,8 @@ local filesystem).}
 
 \item{outdir}{Path to output directory.}
 
+\item{format}{Format of output files.}
+
 \item{max_files}{Max number of files to list.}
 
 \item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}

From e91187dfa15806e96e35d78d6094efd43a318a8d Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Wed, 11 Sep 2024 08:44:43 +1000
Subject: [PATCH 24/24] umccrise: add reactable table viewer

---
 .../umccr_workflows/umccrise/dl_and_tidy.R    |   4 +-
 .../umccrise/summary_umccrise.qmd             | 189 ++++++++++++++++--
 2 files changed, 169 insertions(+), 24 deletions(-)

diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
index 82f9974..3b5ede3 100755
--- a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
+++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R
@@ -44,7 +44,7 @@ lims <- lims_raw |>
     subject_id, library_id, sample_id, sample_name,
     external_subject_id, external_sample_id,
     project_name, project_owner,
-    source, quality
+    source, quality, workflow
   ) |>
   distinct()
 table(lims$library_id %in% meta$LibraryID_tumor) # double-check
@@ -55,7 +55,7 @@ meta_lims <- meta |>
   select(
     rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal,
     SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id,
-    project_owner, project_name, source, quality
+    project_owner, project_name, source, quality, workflow
   )
 meta_lims |>
   saveRDS(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/meta/{start_date}_{end_date}.rds")))
diff --git a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
index 50b7329..0f9390e 100644
--- a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
+++ b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd
@@ -57,6 +57,48 @@ date_end <- params$date_end
 d_raw <- readr::read_rds(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/results_{date_start}_{date_end}.rds")))
 ```
 
+```{r}
+#| label: funcs
+dt_view <- function(x, id, ...) {
+  htmltools::browsable(
+    htmltools::tagList(
+      htmltools::tags$button(
+        htmltools::tagList(fontawesome::fa("download"), "CSV"),
+        onclick = glue("Reactable.downloadDataCSV('{id}', '{id}.csv')")
+      ),
+      x |>
+        reactable::reactable(
+          bordered = TRUE,
+          filterable = TRUE,
+          fullWidth = TRUE,
+          height = 800,
+          highlight = TRUE,
+          pagination = FALSE,
+          resizable = TRUE,
+          searchable = TRUE,
+          sortable = TRUE,
+          striped = TRUE,
+          wrap = FALSE,
+          elementId = id,
+          ...
+ ) + ) + ) +} +``` + +## Metadata + +```{r} +#| label: metadata +meta <- d_raw |> + select( + rownum, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +dt_view(meta, id = "metadata") +``` + ## Results ```{r} @@ -66,10 +108,61 @@ d <- d_raw |> tidyr::unnest_longer(res, indices_to = "filetype") # main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor") main_cols <- c("portal_run_id") +``` + +```{r} +#| label: qcsum qcsum <- d |> filter(filetype == "qcsummarytsv") |> + select(all_of(main_cols), res) |> + unnest_wider(res) +``` + +```{r} +#| label: pcgr +pcgr <- d |> + filter(filetype == "pcgrjson") |> + select(all_of(main_cols), res) |> unnest_wider(res) |> - select(all_of(main_cols), qc_status_hmf:bpi_enabled) + rename( + msi_fraction_indels_pcgr = "fracIndels", + msi_pcgr = "predicted_class", + tmb_pcgr = "tmb_estimate", + n_tmb_pcgr = "n_tmb" + ) |> + mutate(msi_pcgr = sub(" \\(.*\\)", "", msi_pcgr)) +``` + +```{r} +#| label: conpair +sampleids <- d_raw |> + select(SampleID_tumor, SampleID_normal) |> + tidyr::pivot_longer(everything(), values_to = "sampleid") |> + mutate(phenotype = sub("SampleID_", "", .data$name)) |> + select(sampleid, phenotype) |> + distinct() +conpair_raw <- d |> + filter(filetype == "conpairmultiqc") |> + select(all_of(main_cols), res) |> + tidyr::unnest(res) |> + left_join(sampleids, by = "sampleid") |> + select(-sampleid) +conpair_tumor <- conpair_raw |> + filter(phenotype == "tumor") +conpair_normal <- conpair_raw |> + filter(phenotype == "normal") |> + select(portal_run_id, contamination) +conpair <- conpair_tumor |> + left_join(conpair_normal, by = "portal_run_id", suffix = c("_tumor", "_normal")) |> + select(portal_run_id, + contamination_tumor_conpair = "contamination_tumor", + contamination_normal_conpair = "contamination_normal", + concordance_conpair = "concordance" + ) +``` + +```{r} +#| label: hrd hrd_chord <- d |> filter(filetype == "chordtsv") |> unnest_wider(res) |> @@ -84,6 +177,10 @@ hrd_hrdetect <- d |> filter(filetype == "hrdetecttsv") |> unnest_wider(res) |> select(all_of(main_cols), hrdetect_prob = "Probability") +``` + +```{r} +#| label: sigs sigs_snv2015 <- d |> filter(filetype == "sigssnv2015tsv") |> select(all_of(main_cols), res) |> @@ -104,8 +201,6 @@ sigs_indel <- d |> select(all_of(main_cols), res) |> tidyr::unnest_wider(res) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) - -# signatures dsig <- bind_rows( list( snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel @@ -128,20 +223,37 @@ dsig_filt <- dsig |> select(Sig_group, portal_run_id, sig_rank, sig_summary) |> tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> - select(portal_run_id, sig_top2) + select(Sig_group, portal_run_id, sig_top2) |> + tidyr::pivot_wider(names_from = Sig_group, values_from = sig_top2) |> + select(portal_run_id, snv2015, snv2020, dbs, indel) +``` +```{r} +#| label: qc_all dall <- d_raw |> select( + rownum, date_analysed = "start", portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, - project_owner, project_name, source, quality + project_owner, project_name, source, quality, workflow ) |> left_join(qcsum, by = "portal_run_id") |> left_join(hrd_chord, by = "portal_run_id") |> - left_join(hrd_hrdetect, by = "portal_run_id") + 
left_join(hrd_hrdetect, by = "portal_run_id") |> + left_join(pcgr, by = "portal_run_id") |> + left_join(conpair, by = "portal_run_id") +``` + +### Summary Metrics + +```{r} +#| label: summary_metrics +dt_view(dall, "summary_metrics") ``` +### HRD Plot + ```{r} #| label: hrd_plot #| fig-width: 15 @@ -159,7 +271,34 @@ p1 <- dall |> plotly::ggplotly(p1) ``` -### Signature Results +### Signatures + +#### All (SNV, Indel, DBS) + +```{r} +#| label: sig_results_all +dsig |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + dt_view("sig_results_all") +``` + +#### Top 2 + +```{r} +#| label: sig_results_top2 +dsig_filt |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + arrange(rownum) |> + dt_view("sig_results_top2") +``` + +#### Top 3 SNV2015 ```{r} #| label: sig_results @@ -167,19 +306,19 @@ plotly::ggplotly(p1) #| fig-height: 65 sig_order2015 <- paste0("Sig", 1:30) -sig_order2020 <- paste0( - "SBS", - c( - 1:6, - paste0(7, c("a", "b", "c", "d")), - 8:9, - paste0(10, c("a", "b", "c", "d")), - 11:16, - paste0(17, c("a", "b")), - 18:60, - 84:94 - ) -) +# sig_order2020 <- paste0( +# "SBS", +# c( +# 1:6, +# paste0(7, c("a", "b", "c", "d")), +# 8:9, +# paste0(10, c("a", "b", "c", "d")), +# 11:16, +# paste0(17, c("a", "b")), +# 18:60, +# 84:94 +# ) +# ) p2_prep <- dsig |> filter( @@ -189,12 +328,11 @@ p2_prep <- dsig |> left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> - mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) + mutate(Signature = factor(Signature, levels = sig_order2015)) p2 <- p2_prep |> ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + ggplot2::geom_bar(position = "fill", stat = "identity") + ggplot2::theme_bw(base_size = 7) -# ggplot2::facet_wrap(~Sig_group, ncol = 1) plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) ``` @@ -219,6 +357,13 @@ dall |> count(dall, source, quality) |> knitr::kable() ``` +### Workflow + +```{r} +#| label: workflow_summary +count(dall, workflow) |> knitr::kable() +``` + :::
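
For interactive use outside the rendered report, the `dt_view()` helper from the funcs chunk above can be exercised on any tibble. A quick usage sketch with toy data (assumes `dt_view()` has been defined in the session and that the htmltools, fontawesome, glue, and reactable packages are installed):

```r
library(tibble)

# toy input; any data frame works
toy <- tibble(
  portal_run_id = c("pr1", "pr2"),
  SubjectID = c("SBJ00001", "SBJ00002"),
  qc_status_hmf = c("PASS", "FAIL")
)

# renders a searchable/filterable reactable with a CSV download button;
# the button targets the table via its elementId through
# Reactable.downloadDataCSV (shipped with the reactable package)
dt_view(toy, id = "toy_table")
```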