diff --git a/NAMESPACE b/NAMESPACE index 4eb4d8d..b10ca1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,10 +28,11 @@ export(TsoSampleAnalysisResultsFile) export(TsoTargetRegionCoverageFile) export(TsoTmbFile) export(TsoTmbTraceTsvFile) -export(UmccriseCanRepTables) export(VCMetricsFile) export(Wf) export(Wf_tso_ctdna_tumor_only) +export(Wf_umccrise) +export(Wf_umccrise_download_tidy_write) export(WgsContigMeanCovFile) export(WgsCoverageMetricsFile) export(WgsFineHistFile) @@ -45,14 +46,17 @@ export(dr_output_format_valid) export(dr_s3_download) export(empty_tbl) export(file_regex_getter) -export(gds_file_download) export(gds_file_download_api) +export(gds_file_download_cli) export(gds_file_presignedurl) -export(gds_files_list) export(gds_files_list_fastq) -export(gds_files_list_filter_relevant) +export(gds_list_files_dir) +export(gds_list_files_filter_relevant) export(gds_volumes_list) +export(grep_file) export(ica_token_validate) +export(local_list_files_dir) +export(local_list_files_filter_relevant) export(match_regex) export(multiqc_column_map_append) export(multiqc_date_fmt) @@ -66,11 +70,15 @@ export(multiqc_parse_xyline_plot_contig_cvg) export(multiqc_tidy_json) export(rdf2tab) export(read) -export(s3_files_list_filter_relevant) +export(s3_file_presignedurl) +export(s3_list_files_dir) +export(s3_list_files_filter_relevant) export(s3_search) export(session_info_kable) +export(tidy_files) export(time_metrics_process) export(umccr_tidy) +export(write_dracarys_list_of_tbls) importFrom(R6,R6Class) importFrom(ggplot2,ggplot) importFrom(ggrepel,geom_text_repel) diff --git a/R/Wf.R b/R/Wf.R index 317100f..aa5bfae 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -1,47 +1,82 @@ -#' Workflow R6 Class +#' @title Workflow #' #' @description Workflow is a base R6 class representing a bioinformatic #' workflow run from a UMCCR workflow manager. 
 #'
+#' A workflow has:
+#'
+#' - a directory path with all the raw output files (either on GDS, S3 or
+#'   local filesystem)
+#' - a subset of files that are of interest for ingestion
+#'   - tibble with full path and basename columns
+#' - a set of parsers that can parse and tidy those files
+#'   - each parser takes a path and returns a tidy tibble
+#' - a list of tidy tibbles (or a tibble with nested tibbles)
+#'
 #' @examples
-#' p1 <- system.file("extdata/portaldb_workflow_top4.rds", package = "rportal") |>
-#'   readRDS() |>
-#'   dplyr::filter(type_name == "umccrise") |>
-#'   dplyr::slice(1)
-#' w <- Wf$new(
-#'   prid = p1$portal_run_id, type = p1$type_name, start = p1$start, end = p1$end,
-#'   status = p1$end_status, input = p1$input, output = p1$output
+#' \dontrun{
+#' regexes <- tibble::tribble(
+#'   ~regex, ~fun,
+#'   "-chord\\.tsv\\.gz$", "UmChordTsvFile",
+#'   "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
+#'   "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
+#'   "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
+#'   "-dbs\\.tsv\\.gz$", "UmSigsDbsFile",
+#'   "-indel\\.tsv\\.gz$", "UmSigsIndelFile",
+#'   "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
+#' )
+#'
+#' #---- LOCAL ----#
+#' p1_local <- "~/icav1/g/production/analysis_data"
+#' p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565")
+#' um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes)
+#' um1$list_files(max_files = 10)
+#' um1$list_files_filter_relevant(max_files = 10)
+#'
+#' #---- GDS ----#
+#' p1_gds <- "gds://production/analysis_data"
+#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063")
+#' outdir <- file.path(sub("gds:/", "~/icav1/g", p))
+#' token <- Sys.getenv("ICA_ACCESS_TOKEN")
+#' um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes)
+#' um2$list_files(max_files = 10)
+#' um2$list_files_filter_relevant(ica_token = token, max_files = 500)
+#' d <- um2$download_files(
+#'   outdir = outdir, ica_token = token, max_files = 1000, dryrun = FALSE,
+#'   list_filter_fun = gds_list_files_filter_relevant
 #' )
-#' w
+#' d_tidy <- um2$tidy_files(d)
+#'
+#' #---- S3 ----#
+#' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3"
+#' p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables"
+#' p <- file.path(p1_s3, p2_s3)
+#' outdir <- sub("s3:/", "~/s3", p)
+#' um3 <- Wf$new(path = p, wname = "sash", regexes = regexes)
+#' um3$list_files(max_files = 10)
+#' um3$list_files_filter_relevant(max_files = 50)
+#' d <- um3$download_files(
+#'   outdir = outdir, max_files = 50, dryrun = FALSE,
+#'   list_filter_fun = s3_list_files_filter_relevant
+#' )
+#' }
+#'
 #' @export
 Wf <- R6::R6Class(
   "Wf",
   public = list(
-    #' @field prid Portal run ID.
-    #' @field type Workflow type.
-    #' @field start Workflow start datetime.
-    #' @field end Workflow end datetime.
-    #' @field status Workflow end status.
-    #' @field input Workflow input JSON string.
-    #' @field output Workflow output JSON string.
-    prid = NULL,
-    type = NULL,
-    start = NULL,
-    end = NULL,
-    status = NULL,
-    input = NULL,
-    output = NULL,
+    #' @field path Path to directory with raw workflow results (from GDS, S3, or
+    #' local filesystem).
+    #' @field wname Name of workflow (e.g. umccrise, sash).
+    #' @field filesystem Filesystem of `path` (gds/s3/local).
+    #' @field regexes Tibble with file `regex` and `fun`ction to parse it.
+    path = NULL,
+    wname = NULL,
+    filesystem = NULL,
+    regexes = NULL,
     #' @description Create a new Workflow object.
-    #' @param prid Portal run ID.
-    #' @param type Workflow type.
-    #' @param start Workflow start datetime.
- #' @param end Workflow end datetime. - #' @param status Workflow end status. - #' @param input Workflow input JSON string. - #' @param output Workflow output JSON string. - initialize = function(prid = NULL, type = NULL, start = NULL, end = NULL, - status = NULL, input = NULL, output = NULL) { - types <- c( + #' @param path Path to directory with raw workflow results. + #' @param wname Name of workflow. + #' @param regexes Tibble with file `regex` and `fun`ction to parse it. + initialize = function(path = NULL, wname = NULL, regexes = NULL) { + wnames <- c( "bcl_convert", "tso_ctdna_tumor_only", "wgs_alignment_qc", @@ -56,30 +91,140 @@ Wf <- R6::R6Class( "oncoanalyser_wgts_existing_both", "sash" ) - assertthat::assert_that( - type %in% types + assertthat::assert_that(wname %in% wnames) + self$path <- path + self$wname <- wname + self$filesystem <- dplyr::case_when( + grepl("^gds://", path) ~ "gds", + grepl("^s3://", path) ~ "s3", + .default = "local" ) - self$prid <- prid - self$type <- type - self$start <- start - self$end <- end - self$status <- status - self$input <- input - self$output <- output + self$regexes <- regexes }, #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { res <- tibble::tribble( ~var, ~value, - "prid", self$prid, - "type", self$type, - "start", as.character(self$start), - "end", as.character(self$end), - "status", self$status, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem ) print(res) invisible(self) + }, + #' @description List all files under given path. + #' @param path Path with raw results. + #' @param max_files Max number of files to list (for gds/s3 only). + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to `gds_list_files_dir` function. + list_files = function(path = self$path, max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + if (self$filesystem == "gds") { + d <- gds_list_files_dir( + gdsdir = path, token = ica_token, page_size = max_files, ... + ) + } else if (self$filesystem == "s3") { + d <- s3_list_files_dir(s3dir = path, max_objects = max_files) + } else { + d <- local_list_files_dir(localdir = path, max_files = max_files) + } + return(d) + }, + #' @description List dracarys files under given path + #' @param path Path with raw results. + #' @param max_files Max number of files to list (for gds/s3 only). + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(path = self$path, max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + regexes <- self$regexes + assertthat::assert_that(!is.null(regexes)) + if (self$filesystem == "gds") { + d <- gds_list_files_filter_relevant( + gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... + ) + } else if (self$filesystem == "s3") { + d <- s3_list_files_filter_relevant( + s3dir = path, regexes = regexes, max_objects = max_files, ... + ) + } else { + d <- local_list_files_filter_relevant( + localdir = path, regexes = regexes, max_files = max_files + ) + } + d + }, + #' @description Download files from GDS/S3 to local filesystem. + #' @param path Path with raw results. + #' @param outdir Path to output directory. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param max_files Max number of files to list. 
+ #' @param dryrun If TRUE, just list the files that will be downloaded (don't + #' download them). + #' @param recursive Should files be returned recursively _in and under_ the specified + #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). + #' @param list_filter_fun Function to filter relevant files. + download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL, + list_filter_fun = NULL) { + # TODO: add envvar checker + regexes <- self$regexes + assertthat::assert_that(!is.null(regexes), !is.null(list_filter_fun)) + if (self$filesystem == "gds") { + d <- dr_gds_download( + gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, + page_size = max_files, dryrun = dryrun, recursive = recursive, + list_filter_fun = list_filter_fun + ) + if (!dryrun) { + self$filesystem <- "local" + self$path <- outdir + } + } else if (self$filesystem == "s3") { + d <- dr_s3_download( + s3dir = path, outdir = outdir, regexes = regexes, + max_objects = max_files, dryrun = dryrun, + list_filter_fun = list_filter_fun + ) + if (!dryrun) { + self$filesystem <- "local" + self$path <- outdir + } + } else { + d <- self$list_files_filter_relevant(regexes = regexes, max_files = max_files) + } + return(d) + }, + #' @description Tidy given files. + #' @param x Tibble with `localpath` to file and the function `type` to parse it. + tidy_files = function(x) { + # awesomeness + tidy_files(x, envir = self) + }, + #' @description Write tidy data. + #' @param x Tibble with tidy `data` and file `type`. + #' @param outdir Directory path to output tidy files. + #' @param prefix Prefix of output files. + #' @param format Format of output files. + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + write = function(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL) { + assertthat::assert_that(!is.null(prefix)) + if (!is.null(outdir)) { + prefix <- file.path(outdir, prefix) + } + d_write <- x |> + dplyr::rowwise() |> + dplyr::mutate( + section = sub("read_", "", .data$type), + p = glue("{prefix}_{.data$section}"), + out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)) + ) |> + dplyr::ungroup() |> + dplyr::select("section", "data") |> + tibble::deframe() + invisible(d_write) } ) # end public ) diff --git a/R/fs_icav1.R b/R/fs_icav1.R new file mode 100644 index 0000000..16df2c9 --- /dev/null +++ b/R/fs_icav1.R @@ -0,0 +1,268 @@ +#' List Files in ICAv1 GDS Directory +#' +#' Lists files in a GDS directory. +#' +#' @param gdsdir Full path to GDS directory. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param page_size Page size (def: 10 via ICA API). +#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API). +#' @param page_token Page token (def: NULL). Used internally for recursion. +#' @param no_recurse Do not recurse through the file list i.e. just give the first items +#' without recursing further down the list using . +#' @param recursive Should files be returned recursively _in and under_ the specified +#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). +#' +#' @return A tibble with file ID, basename, size, last modified timestamp, +#' full GDS path, and presigned URL if requested. 
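+#' @note Listing is paginated by the ICA API; when `no_recurse = FALSE` the function
+#' follows the returned `nextPageToken` to fetch all remaining pages.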
+#' @examples +#' \dontrun{ +#' gdsdir <- file.path( +#' "gds://production/analysis_data/SBJ00699/umccrise", +#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" +#' ) +#' token <- ica_token_validate() +#' page_size <- 11 +#' include_url <- F +#' page_token <- NULL +#' no_recurse <- TRUE +#' recursive <- NULL +#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) +#' } +#' @export +gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, + include_url = FALSE, no_recurse = TRUE, page_token = NULL, + recursive = NULL) { + assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) + assertthat::assert_that(is.null(recursive) || is.logical(recursive)) + token <- ica_token_validate(token) + assertthat::assert_that(grepl("^gds://", gdsdir)) + gdsdir_original <- gdsdir + if (!grepl("/$", gdsdir)) { + gdsdir <- glue("{gdsdir}/") + } + base_url <- "https://aps2.platform.illumina.com/v1" + volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) + path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) + page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) + query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") + if (include_url) { + query_url <- glue("{query_url}&include=PresignedUrl") + } + if (!is.null(page_token)) { + query_url <- glue("{query_url}&pageToken={page_token}") + } + if (!is.null(recursive)) { + # without specifying recursive, it's true by default + recursive <- ifelse(recursive, "true", "false") + query_url <- glue("{query_url}&recursive={recursive}") + } + query_res <- httr::GET( + query_url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) + if (j[["itemCount"]] == 0) { + if (gds_likely_file(gdsdir_original)) { + cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") + } + # if there is a nextPageToken then abort, else continue + if (!is.null(j[["nextPageToken"]])) { + msg <- paste0( + "{date_log()} ERROR: ", + "No GDS files listed in the input directory. Please confirm you can ", + "access the following GDS input directory with your token: ", + "{.file {gdsdir_original}}" + ) + cli::cli_abort(msg) + } + } # endif + d <- j[["items"]] |> + purrr::map(\(x) c( + file_id = x[["id"]], + path = x[["path"]], + size = x[["sizeInBytes"]], + lastmodified = x[["timeModified"]], + presigned_url = x[["presignedUrl"]] + )) |> + dplyr::bind_rows() + if (nrow(d) == 0) { + # We've iterated through all available items, and the next page has 0 items. + # So dplyr::bind_rows(d, NULL) will return d. + return(NULL) + } + res <- d |> + dplyr::mutate( + size = fs::as_fs_bytes(.data$size), + bname = basename(.data$path), + path = glue("gds://{volname}{.data$path}") + ) |> + dplyr::select(dplyr::any_of(c("bname", "size", "lastmodified", "file_id", "path", "presigned_url"))) + + if (!is.null(j[["nextPageToken"]]) && !no_recurse) { + res2 <- gds_list_files_dir( + gdsdir = gdsdir, token = token, page_size = NULL, + include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], + recursive = NULL + ) + res <- dplyr::bind_rows(res, res2) + } + res +} + +#' List Relevant Files In ICAv1 GDS Directory +#' +#' Lists relevant files in a GDS directory. 
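+#' Files are first listed with `gds_list_files_dir()`, then each basename is matched
+#' against the `regex` column of `regexes`; matches are tagged in a `type` column with
+#' the corresponding `fun` parser name, and non-matching files are dropped.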
+#'
+#' @inheritParams gds_list_files_dir
+#' @param pattern Pattern to further filter the returned file type tibble.
+#' @param regexes Tibble with `regex` and `fun`ction name (see example).
+#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path,
+#' and presigned URL if requested.
+#' @examples
+#' \dontrun{
+#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile")
+#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
+#' gds_list_files_filter_relevant(gdsdir = gdsdir, regexes = regexes)
+#' }
+#' @export
+gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX,
+                                           token = Sys.getenv("ICA_ACCESS_TOKEN"),
+                                           page_size = 100, include_url = FALSE,
+                                           no_recurse = TRUE, page_token = NULL,
+                                           recursive = NULL) {
+  pattern <- pattern %||% ".*" # keep all recognisable files by default
+  assertthat::assert_that(all(colnames(regexes) == c("regex", "fun")))
+  cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url")
+  d <- dracarys::gds_list_files_dir(
+    gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url,
+    no_recurse = no_recurse, page_token = page_token, recursive = recursive
+  ) |>
+    dplyr::rowwise() |>
+    dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |>
+    dplyr::ungroup() |>
+    dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
+    dplyr::select(dplyr::any_of(cols_sel))
+  d
+}
+
+#' dracarys GDS Download
+#'
+#' Download only GDS files that can be processed by dracarys.
+#'
+#' @inheritParams gds_list_files_dir
+#' @inheritParams gds_list_files_filter_relevant
+#' @param outdir Local output directory.
+#' @param dryrun If TRUE, just list the files that will be downloaded (don't
+#' download them).
+#' @param list_filter_fun Function to filter relevant GDS files.
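+#' @return Tibble of relevant files with their type, size, GDS path and local path
+#' (when `dryrun = TRUE`, the files are only listed, not downloaded).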
+#' @examples +#' \dontrun{ +#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" +#' outdir <- sub("gds:/", "~/icav1/g", gdsdir) +#' regexes <- tibble::tribble( +#' ~regex, ~fun, +#' "multiqc_data\\.json$", "MultiqcJsonFile", +#' "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" +#' ) +#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) +#' } +#' +#' @export +dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), + pattern = NULL, page_size = 100, dryrun = FALSE, + regexes = DR_FILE_REGEX, recursive = NULL, + list_filter_fun = gds_list_files_filter_relevant) { + e <- emojifont::emoji + fs::dir_create(outdir) + d <- list_filter_fun( + gdsdir = gdsdir, pattern = pattern, regexes = regexes, + token = token, page_size = page_size, include_url = FALSE, + no_recurse = FALSE, page_token = NULL, + recursive = recursive + ) + d <- d |> + dplyr::mutate( + gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), + gdspath_minus_gdsdir_outdir = fs::dir_create( + file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) + ), + localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), + gdspath = .data$path + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") + # download recognisable dracarys files to outdir//{bname} + if (!dryrun) { + cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") + res <- d |> + dplyr::rowwise() |> + dplyr::mutate( + dl = gds_file_download_api( + gds_fileid = .data$file_id, out_file = .data$localpath, token = token + ), + localpath = normalizePath(.data$localpath) + ) |> + dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") + return(res) + } else { + cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") + d |> + dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> + as.data.frame() |> + print() + } +} + +#' GDS File Presigned URL +#' +#' Returns presigned URL of given GDS file. +#' +#' @param gds_fileid GDS file ID. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @return Presigned URL if valid. +#' @export +gds_file_presignedurl <- function(gds_fileid, token) { + token <- ica_token_validate(token) + base_url <- "https://aps2.platform.illumina.com/v1" + url <- glue("{base_url}/files/{gds_fileid}") + res <- httr::GET( + url, + httr::add_headers(Authorization = glue("Bearer {token}")), + httr::accept_json() + ) + presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] + assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) + presigned_url +} + +#' GDS File Download via API +#' +#' @param gds_fileid GDS file ID. +#' @param out_file Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). 
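+#' @return Normalized path to the downloaded output file.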
+#' +#' @examples +#' \dontrun{ +#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" +#' out <- tempfile() +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' } +#' @export +gds_file_download_api <- function(gds_fileid, out_file, token) { + presigned_url <- gds_file_presignedurl(gds_fileid, token) + # keep quiet instead of logging presigned urls + status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) + assertthat::assert_that(status_code == 0) + normalizePath(out_file) +} + +#' GDS File Download via CLI +#' +#' @param gds Full path to GDS file. +#' @param out Path to output file. +#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @export +gds_file_download_cli <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { + token <- ica_token_validate(token) + system(glue("ica files download {gds} {out} --access-token {token}")) +} diff --git a/R/fs_local.R b/R/fs_local.R new file mode 100644 index 0000000..07397aa --- /dev/null +++ b/R/fs_local.R @@ -0,0 +1,54 @@ +#' List Files in Local Directory +#' +#' Lists files in a local directory. +#' +#' @param localdir Path to local directory. +#' @param max_files Max files returned. +#' +#' @return A tibble with file basename, size, last modification timestamp +#' and full path. +#' @examples +#' localdir <- system.file("R", package = "dracarys") +#' x <- local_list_files_dir(localdir) +#' @testexamples +#' expect_equal(names(x), c("bname", "size", "lastmodified", "path")) +#' @export +local_list_files_dir <- function(localdir, max_files = NULL) { + d <- fs::dir_info(path = localdir, recurse = TRUE, type = "file") |> + dplyr::mutate( + bname = basename(.data$path), + lastmodified = .data$modification_time + ) |> + dplyr::select("bname", "size", "lastmodified", "path") + if (!is.null(max_files)) { + d <- d |> + dplyr::slice_head(n = max_files) + } + d +} + +#' List Relevant Files In Local Directory +#' +#' Lists relevant files in a local directory. +#' +#' @inheritParams local_list_files_dir +#' @param regexes Tibble with `regex` and `fun`ction name (see example). +#' +#' @return A tibble with file type, basename, size, last modified timestamp, and +#' path. +#' +#' @examples +#' localdir <- system.file("extdata/tso", package = "dracarys") +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile") +#' x <- local_list_files_filter_relevant(localdir, regexes) +#' @testexamples +#' expect_equal(nrow(x), 1) +#' @export +local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX, max_files = NULL) { + local_list_files_dir(localdir = localdir, max_files = max_files) |> + dplyr::mutate( + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + ) |> + dplyr::filter(!is.na(.data$type)) |> + dplyr::select("type", "bname", "size", "lastmodified", localpath = "path") +} diff --git a/R/fs_s3.R b/R/fs_s3.R new file mode 100644 index 0000000..34c437e --- /dev/null +++ b/R/fs_s3.R @@ -0,0 +1,237 @@ +#' List Objects in AWS S3 Directory +#' +#' Returns some or all (up to 1,000) of the objects in an S3 directory. +#' +#' @param s3dir S3 directory. +#' @param max_objects Maximum objects returned. +#' +#' +#' @return A tibble with object basename, size, last modified timestamp, and +#' full S3 path. 
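+#' @note Uses `paws.storage::s3()$list_objects_v2()` under the hood, with `max_objects`
+#' passed as `MaxKeys`, so a single call returns at most 1000 objects.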
+#' @examples +#' \dontrun{ +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables") +#' s3_list_files_dir(s3dir, max_objects = 15) +#' } +#' @export +s3_list_files_dir <- function(s3dir, max_objects = 1000) { + assertthat::assert_that(grepl("^s3://", s3dir)) + bucket <- sub("s3://(.*?)/.*", "\\1", s3dir) + prefix <- sub("s3://(.*?)/(.*)", "\\2", s3dir) + s3 <- paws.storage::s3() + l <- s3$list_objects_v2(Bucket = bucket, Prefix = prefix, MaxKeys = max_objects) + assertthat::assert_that(all(c("Contents", "KeyCount") %in% names(l))) + cols_sel <- c("bname", "size", "lastmodified", "path") + # handle no results + if (l[["KeyCount"]] == 0) { + return(empty_tbl(cnames = cols_sel, ctypes = "cccc")) + } + d <- l[["Contents"]] |> + purrr::map(\(x) tibble::tibble( + Key = x[["Key"]], + Size = x[["Size"]], + lastmodified = x[["LastModified"]] + )) |> + dplyr::bind_rows() |> + dplyr::mutate( + path = glue("s3://{bucket}/{.data$Key}"), + bname = basename(.data$path), + size = fs::as_fs_bytes(.data$Size) + ) |> + dplyr::select(dplyr::all_of(cols_sel)) + return(d) +} + +#' List Relevant Files In AWS S3 Directory +#' +#' Lists relevant files in an AWS S3 directory. +#' +#' @inheritParams s3_list_files_dir +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param regexes Tibble with `regex` and `fun`ction name. +#' @param presign Include presigned URLs (def: FALSE). +#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated). +#' +#' @return A tibble with file type, basename, size, last modified timestamp, +#' full path, and presigned URL if requested. +#' @examples +#' \dontrun{ +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" +#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220" +#' s3dir <- file.path(p1, p2) +#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") +#' s3_list_files_filter_relevant(s3dir = s3dir, regexes = regexes, max_objects = 300) +#' } +#' @export +s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, + regexes = DR_FILE_REGEX, max_objects = 100, + presign = FALSE, expiry_sec = 3600) { + assertthat::assert_that(rlang::is_logical(presign), max_objects <= 1000) + d_all <- s3_list_files_dir(s3dir = s3dir, max_objects = max_objects) + if (nrow(d_all) == 0) { + return(d_all) + } + pattern <- pattern %||% ".*" # keep all recognisable files by default + cols_sel <- c("type", "bname", "size", "lastmodified", "path") + d <- d_all |> + dplyr::rowwise() |> + dplyr::mutate( + type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) + ) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(dplyr::all_of(cols_sel)) + + if (presign) { + if (nrow(d) == 0) { + return(d) + } + s3_client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) + d <- d |> + dplyr::rowwise() |> + dplyr::mutate(presigned_url = s3_file_presignedurl( + client = s3_client, s3path = .data$path, expiry_seconds = expiry_sec + )) |> + dplyr::ungroup() |> + dplyr::select(dplyr::all_of(c(cols_sel, "presigned_url"))) + } + d +} + +#' dracarys S3 Download +#' +#' Download only S3 files that can be processed by dracarys. +#' +#' @inheritParams s3_list_files_dir +#' @inheritParams s3_list_files_filter_relevant +#' @param outdir Path to output directory. 
+#' @param dryrun If TRUE, just list the files that will be downloaded (don't
+#' download them).
+#' @param list_filter_fun Function to filter relevant S3 files.
+#' @examples
+#' \dontrun{
+#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
+#' p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220"
+#' s3dir <- file.path(p1, p2)
+#' regexes <- tibble::tribble(
+#'   ~regex, ~fun,
+#'   "multiqc_data\\.json$", "MultiqcJsonFile",
+#'   "pcgr.*\\.json\\.gz$", "pcgrjson"
+#' )
+#' outdir <- sub("s3:/", "~/s3", s3dir)
+#' dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 500, regexes = regexes, dryrun = FALSE)
+#' }
+#' @export
+dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL,
+                           regexes = DR_FILE_REGEX, dryrun = FALSE,
+                           list_filter_fun = s3_list_files_filter_relevant) {
+  s3 <- paws.storage::s3()
+  e <- emojifont::emoji
+  fs::dir_create(outdir)
+  d <- list_filter_fun(
+    s3dir = s3dir, pattern = pattern, regexes = regexes,
+    max_objects = max_objects, presign = FALSE
+  )
+  d <- d |>
+    dplyr::mutate(
+      s3path_minus_s3dir = sub(glue("{s3dir}/"), "", .data$path),
+      s3path_minus_s3dir_outdir = fs::dir_create(
+        file.path(outdir, dirname(.data$s3path_minus_s3dir))
+      ),
+      localpath = file.path(.data$s3path_minus_s3dir_outdir, .data$bname),
+      s3path = .data$path
+    ) |>
+    dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path")
+  # download recognisable dracarys files to outdir/<subdirs>/{bname}
+  if (!dryrun) {
+    cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}")
+    d |>
+      dplyr::rowwise() |>
+      dplyr::mutate(
+        s3bucket = sub("s3://(.*?)/.*", "\\1", .data$s3path),
+        s3key = sub("s3://(.*?)/(.*)", "\\2", .data$s3path),
+        dl = list(
+          s3$download_file(
+            Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$localpath
+          )
+        ),
+        localpath = normalizePath(.data$localpath)
+      ) |>
+      dplyr::select("type", "bname", "size", "lastmodified", "localpath", "s3path")
+  } else {
+    cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}")
+    d |>
+      dplyr::select("type", "bname", "size", "lastmodified", "s3path", localpath2be = "localpath") |>
+      as.data.frame() |>
+      print()
+  }
+}
+
+#' S3 Generate Presigned URL
+#'
+#' @param client S3 client. Make sure you use `signature_version = "s3v4"` (see example).
+#' @param s3path Full path to S3 object.
+#' @param expiry_seconds Number of seconds the presigned URL is valid for (3600 = 1 hour).
+#'
+#' @return An S3 presigned URL.
+#' @examples
+#' \dontrun{
+#' client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4"))
+#' s3path <- "s3://bucket1/path/to/file.tsv"
+#' s3_file_presignedurl(client, s3path)
+#' }
+#'
+#' @export
+s3_file_presignedurl <- function(client, s3path, expiry_seconds = 3600) {
+  bucket <- sub("s3://(.*?)/.*", "\\1", s3path)
+  prefix <- sub("s3://(.*?)/(.*)", "\\2", s3path)
+  client$generate_presigned_url(
+    client_method = "get_object",
+    params = list(Bucket = bucket, Key = prefix),
+    expires_in = expiry_seconds
+  )
+}
+
+#' Search AWS S3 Objects
+#'
+#' Searches for the given pattern in the UMCCR `umccr-primary-data-prod` AWS S3
+#' bucket.
+#'
+#' @param pat Pattern to search for (e.g. 'multiqc_data.json').
+#' @param rows Max number of rows to return.
+#'
+#' @return Tibble with S3 path, object size, date modified, id, unique hash.
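+#' @note Shells out to the `awscurl` command-line tool, so it requires `awscurl` to be
+#' installed and credentials that can query the UMCCR portal API.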
+#' +#' @examples +#' \dontrun{ +#' pat <- "qc_summary.tsv.gz" +#' s3_search(pat, 10) +#' } +#' @export +s3_search <- function(pat, rows) { + au_tz <- "Australia/Melbourne" + utc_tz <- "UTC" + base_url <- "https://api.portal.prod.umccr.org/iam/s3" + url1 <- utils::URLencode(glue("{base_url}?rowsPerPage={rows}&search={pat}")) + awscurl_cmd <- glue( + "awscurl '{url1}' ", + "--header 'Accept: application/json'" + ) + message(glue("Running {awscurl_cmd}")) + j <- system(awscurl_cmd, intern = TRUE) + date_fmt <- "%Y-%m-%dT%H:%M:%S" + d <- j |> + jsonlite::fromJSON() |> + purrr::pluck("results") |> + tibble::as_tibble() + d |> + dplyr::mutate( + date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), + date_aest = lubridate::with_tz(.data$date1, tz = au_tz), + path = glue("s3://{bucket}/{key}"), + size = fs::as_fs_bytes(.data$size) + ) |> + dplyr::select("path", "size", "date_aest", "id", "unique_hash") +} diff --git a/R/ica.R b/R/ica.R index 0838ef0..951095b 100644 --- a/R/ica.R +++ b/R/ica.R @@ -1,28 +1,3 @@ -#' List Relevant Files In GDS Directory -#' -#' Lists relevant files in a GDS directory. -#' -#' @param gdsdir GDS directory. -#' @param token ICA access token. -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). -#' @param page_size Page size (def: 100). -#' @param regexes Tibble with regex and function name. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. -#' @export -gds_files_list_filter_relevant <- function(gdsdir, token, pattern = NULL, include_url = FALSE, page_size = 100, regexes = DR_FILE_REGEX) { - pattern <- pattern %||% ".*" # keep all recognisable files by default - cols_sel <- c("type", "bname", "size", "file_id", "path", "presigned_url") - d <- dracarys::gds_files_list(gdsdir, token, include_url = include_url, page_size = page_size) |> - dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(dplyr::any_of(cols_sel)) - d -} - #' List FASTQs In GDS Directory #' #' @param gdsdir GDS directory. @@ -48,12 +23,9 @@ gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = ~regex, ~fun, "fastq\\.gz$", "FASTQ" ) - g <- gds_files_list_filter_relevant( - gdsdir = gdsdir, token = token, pattern = NULL, include_url = include_url, - page_size = page_size, regexes = fq_regex - ) - assertthat::assert_that( - all(colnames(g) == c("type", "bname", "size", "file_id", "path")) + g <- gds_list_files_filter_relevant( + gdsdir = gdsdir, pattern = NULL, regexes = fq_regex, + token = token, page_size = page_size, include_url = include_url ) g |> dplyr::mutate( @@ -61,176 +33,10 @@ gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = size_num = as.numeric(.data$size) ) |> dplyr::select( - "type", "bname", "size", "size_chr", "size_num", "file_id", "path" + "type", "bname", "size", "lastmodified", "size_chr", "size_num", "file_id", "path" ) } -#' GDS File Presigned URL -#' -#' Returns presigned URL of given GDS file. -#' -#' @param gds_fileid GDS file ID. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Presigned URL if valid. 
-#' @export -gds_file_presignedurl <- function(gds_fileid, token) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - url <- glue("{base_url}/files/{gds_fileid}") - res <- httr::GET( - url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] - assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) - presigned_url -} - -#' GDS File Download via API -#' -#' @param gds_fileid GDS file ID. -#' @param out_file Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' -#' @examples -#' \dontrun{ -#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -#' out <- tempfile() -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' } -#' @export -gds_file_download_api <- function(gds_fileid, out_file, token) { - presigned_url <- gds_file_presignedurl(gds_fileid, token) - # keep quiet instead of logging presigned urls - status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) - assertthat::assert_that(status_code == 0) - out_file -} - -#' GDS File Download via CLI -#' -#' @param gds Full path to GDS file. -#' @param out Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @export -gds_file_download <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - token <- ica_token_validate(token) - system(glue("ica files download {gds} {out} --access-token {token}")) -} - -#' GDS Files List -#' -#' List files on ICA GDS filesystem. -#' -#' @param gdsdir Full path to GDS directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10). -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE). -#' @param page_token Page token (def: NULL). Used internally for recursion. -#' @param no_recurse Do not recurse through the file list i.e. just give the first items -#' without recursing further down the list using . -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE). -#' -#' @return Tibble with file basename, file size, file full data path, file dir name. 
-#' @examples -#' \dontrun{ -#' gdsdir <- file.path( -#' "gds://production/primary_data", -#' "240322_A00130_0290_BH5HLLDSXC/20240323f56ec5a5/WGS_TsqNano" -#' ) -#' gdsdir <- file.path( -#' "gds://bssh.acddbfda498038ed99fa94fe79523959/Runs", -#' "240322_A00130_0290_BH5HLLDSXC_r.3TbcOsEKZUyetygkqIOXcg/InterOp" -#' ) -#' gdsdir <- file.path( -#' "gds://production/analysis_data/SBJ00699/umccrise", -#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" -#' ) -#' token <- ica_token_validate() -#' page_size <- 11 -#' include_url <- TRUE -#' page_token <- NULL -#' no_recurse <- TRUE -#' recursive <- NULL -#' gds_files_list(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) -#' } -#' @export -gds_files_list <- function(gdsdir, token, page_size = NULL, include_url = FALSE, - no_recurse = TRUE, page_token = NULL, recursive = NULL) { - assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) - assertthat::assert_that(is.null(recursive) || is.logical(recursive)) - token <- ica_token_validate(token) - assertthat::assert_that(grepl("^gds://", gdsdir)) - gdsdir_original <- gdsdir - if (!grepl("/$", gdsdir)) { - gdsdir <- glue("{gdsdir}/") - } - base_url <- "https://aps2.platform.illumina.com/v1" - volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) - path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) - page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) - query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") - if (include_url) { - query_url <- glue("{query_url}&include=PresignedUrl") - } - if (!is.null(page_token)) { - query_url <- glue("{query_url}&pageToken={page_token}") - } - if (!is.null(recursive)) { - # without specifying recursive, it's true by default - recursive <- ifelse(recursive, "true", "false") - query_url <- glue("{query_url}&recursive={recursive}") - } - query_res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - if (j[["itemCount"]] == 0) { - if (likely_file(gdsdir_original)) { - cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") - } - # if there is a nextPageToken then abort, else continue - if (!is.null(j[["nextPageToken"]])) { - msg <- paste0( - "{date_log()} ERROR: ", - "No GDS files listed in the input directory. Please confirm you can ", - "access the following GDS input directory with your token: ", - "{.file {gdsdir_original}}" - ) - cli::cli_abort(msg) - } - } # endif - d <- j[["items"]] |> - purrr::map(\(x) c(file_id = x[["id"]], path = x[["path"]], size = x[["sizeInBytes"]], presigned_url = x[["presignedUrl"]])) |> - dplyr::bind_rows() - if (nrow(d) == 0) { - # We've iterated through all available items, and the next page has 0 items. - # So dplyr::bind_rows(d, NULL) will return d. 
- return(NULL) - } - res <- d |> - dplyr::mutate( - size = fs::as_fs_bytes(.data$size), - bname = basename(.data$path), - path = glue("gds://{volname}{.data$path}"), - dname = basename(dirname(.data$path)) - ) |> - dplyr::select(dplyr::any_of(c("file_id", "bname", "size", "path", "dname", "presigned_url"))) - if (!is.null(j[["nextPageToken"]]) && !no_recurse) { - res2 <- gds_files_list( - gdsdir = gdsdir, token = token, page_size = NULL, - include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], - recursive = NULL - ) - res <- dplyr::bind_rows(res, res2) - } - res -} - #' List GDS Volumes #' #' Lists GDS volumes accessible by the provided ICA token. @@ -255,52 +61,6 @@ gds_volumes_list <- function(token, page_size = 10) { } -#' dracarys GDS Download -#' -#' Download only GDS files that can be processed by dracarys. -#' -#' @param gdsdir Full path to GDS directory. -#' @param outdir Path to output directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 100). -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param dryrun If TRUE, just list the files that will be downloaded (don't -#' download them). -#' @param regexes Tibble with regex and function name. -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory (TRUE), or _only directly in_ the specified GDS directory (FALSE) (def: TRUE). -#' -#' @export -dr_gds_download <- function(gdsdir, outdir, token, page_size = 100, pattern = NULL, - dryrun = FALSE, regexes = DR_FILE_REGEX, recursive = NULL) { - e <- emojifont::emoji - fs::dir_create(outdir) - d <- gds_files_list( - gdsdir = gdsdir, token = token, page_size = page_size, - no_recurse = FALSE, recursive = recursive - ) |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::select("file_id", "dname", "type", "size", "path", "bname") - - # download recognisable dracarys files to outdir/{bname} - pattern <- pattern %||% ".*" # keep all recognisable files - d_filt <- d |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::mutate(out = file.path(outdir, .data$bname)) - if (!dryrun) { - cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {gdsdir}}") - d_filt |> - dplyr::rowwise() |> - dplyr::mutate(out_dl = gds_file_download_api(.data$file_id, .data$out, token)) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") - d_filt |> - dplyr::select("path", "type", "size") |> - as.data.frame() |> - print() - } -} - #' Validate ICA access token #' #' Validates ICA access token by parsing it and checking its expiration date. 
@@ -339,7 +99,7 @@ ica_token_exp <- function(token = Sys.getenv("ICA_ACCESS_TOKEN")) { structure(l$payload$exp, class = c("POSIXct", "POSIXt")) } -likely_file <- function(x) { +gds_likely_file <- function(x) { e <- c( "txt", "tsv", "csv", "html", "json", "stdout", "stderr", "stdouterr", "log", "vcf", "gz", "bam", "bai" diff --git a/R/regex.R b/R/regex.R index 25a2f61..34f4079 100644 --- a/R/regex.R +++ b/R/regex.R @@ -61,11 +61,11 @@ DR_FILE_REGEX <- tibble::tribble( "multiqc_data\\.json", "MultiqcFile", "somatic\\.pcgr\\.json\\.gz$", "PcgrJsonFile", "somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile", - "chord\\.tsv\\.gz$", "UmChordTsvFile", - "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", - "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", - "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", - "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", + # "chord\\.tsv\\.gz$", "UmChordTsvFile", + # "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", + # "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", + # "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", + # "-qc_summary\\.tsv\\.gz$", "UmQcSumFile", "bcftools_stats\\.txt$", "BcftoolsStatsFile" ) @@ -83,6 +83,8 @@ FILES_DOWNLOAD_BUT_IGNORE <- c( #' @param f Name of function to evaluate. #' @param v Character vector of strings evaluating to functions. By default, #' this points to the functions in the DR_FILE_REGEX dracarys tibble. +#' @param envir the environment in which to evaluate the function e.g. use `self` +#' when using inside R6 classes. #' #' @return Evaluated function. #' @examples @@ -94,13 +96,13 @@ FILES_DOWNLOAD_BUT_IGNORE <- c( #' expect_equal(mean_1_to_10, base::mean(1:10)) #' expect_null(dr_func_eval("foo")) #' @export -dr_func_eval <- function(f, v = NULL) { +dr_func_eval <- function(f, v = NULL, envir = parent.frame()) { v <- v %||% DR_FILE_REGEX[["fun"]] if (!f %in% v) { return(NULL) } # evaluate string - eval(parse(text = f)) + eval(parse(text = f), envir = envir) } #' Get dracarys `DR_FILE_REGEX` diff --git a/R/s3.R b/R/s3.R deleted file mode 100644 index 6b6daaa..0000000 --- a/R/s3.R +++ /dev/null @@ -1,156 +0,0 @@ -#' List Relevant Files In AWS S3 Directory -#' -#' Lists relevant files in an AWS S3 directory. -#' -#' @param s3dir S3 directory. -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param page_size The size of each page to get in the AWS service call (def: 1000). -#' @param max_items The total number of items to return in the command’s output (def: 1000). -#' @param presign Include presigned URLs (def: FALSE). -#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). -#' -#' @return A tibble with path, date, file size, file type, and presigned URL if requested. 
-#' @examples -#' \dontrun{ -#' s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables" -#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) -#' } -#' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) { - assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) - pattern <- pattern %||% ".*" # keep all recognisable files by default - b <- sub("s3://(.*?)/.*", "\\1", s3dir) - p <- sub("s3://(.*?)/(.*)", "\\2", s3dir) - cmd <- glue( - "aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ", - "--max-items {max_items} --page-size {page_size}" - ) - l <- system(cmd, intern = TRUE) - j <- jsonlite::fromJSON(l) - assertthat::assert_that("Contents" %in% names(j)) - d <- j[["Contents"]] |> - tibble::as_tibble() |> - dplyr::mutate( - path = glue("s3://{b}/{.data$Key}"), - date_utc = .data$LastModified, - size = fs::as_fs_bytes(.data$Size) - ) |> - dplyr::rowwise() |> - dplyr::mutate( - bname = basename(.data$path), - type = purrr::map_chr(.data$bname, match_regex) - ) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("path", "bname", "date_utc", "size", "type") - - if (presign) { - d <- d |> - dplyr::rowwise() |> - dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |> - dplyr::ungroup() - } - d -} - -s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { - p <- system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE) - p -} - -#' Search AWS S3 Objects -#' -#' Searches for the given pattern in the UMCCR `umccr-primary-data-prod` AWS S3 -#' bucket. -#' -#' @param pat Pattern to search for (e.g. 'multiqc_data.json'). -#' @param rows Max number of rows to return. -#' -#' @return Tibble with S3 path, object size, date modified, id, unique hash. -#' -#' @examples -#' \dontrun{ -#' pat <- "qc_summary.tsv.gz" -#' s3_search(pat, 10) -#' } -#' @export -s3_search <- function(pat, rows) { - au_tz <- "Australia/Melbourne" - utc_tz <- "UTC" - base_url <- "https://api.portal.prod.umccr.org/iam/s3" - url1 <- utils::URLencode(glue("{base_url}?rowsPerPage={rows}&search={pat}")) - awscurl_cmd <- glue( - "awscurl '{url1}' ", - "--header 'Accept: application/json'" - ) - message(glue("Running {awscurl_cmd}")) - j <- system(awscurl_cmd, intern = TRUE) - date_fmt <- "%Y-%m-%dT%H:%M:%S" - d <- j |> - jsonlite::fromJSON() |> - purrr::pluck("results") |> - tibble::as_tibble() - d |> - dplyr::mutate( - date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), - date_aest = lubridate::with_tz(.data$date1, tz = au_tz), - path = glue("s3://{bucket}/{key}"), - size = fs::as_fs_bytes(.data$size) - ) |> - dplyr::select("path", "size", "date_aest", "id", "unique_hash") -} - -#' dracarys S3 Download -#' -#' Download only S3 files that can be processed by dracarys. -#' -#' @param s3dir Full path to S3 directory. -#' @param outdir Path to output directory. -#' @param page_size Page size (def: 100). -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param regexes Tibble with regex and function name. -#' @param dryrun If TRUE, just list the files that will be downloaded (don't -#' download them). 
-#' @examples -#' s3dir <- file.path( -#' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", -#' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" -#' ) -#' outdir <- sub("s3:/", "~/s3", s3dir) -#' -#' @export -dr_s3_download <- function(s3dir, outdir, page_size = 100, pattern = NULL, regexes = DR_FILE_REGEX, dryrun = FALSE) { - s3 <- paws.storage::s3() - e <- emojifont::emoji - fs::dir_create(outdir) - d <- s3_files_list_filter_relevant(s3dir, pattern = NULL, page_size = page_size, max_items = 1000, presign = FALSE, expiry_sec = 43200) - d <- d |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> - dplyr::select("type", "size", "path", "bname") - - # download recognisable dracarys files to outdir/{bname} - pattern <- pattern %||% ".*" # keep all recognisable files - d_filt <- d |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::mutate(out = file.path(outdir, .data$bname)) - if (!dryrun) { - cli::cli_alert_info("{date_log()} {e('arrow_heading_down')} Downloading files from {.file {s3dir}}") - d_filt |> - dplyr::rowwise() |> - dplyr::mutate( - s3bucket = sub("s3://(.*?)/.*", "\\1", .data$path), - s3key = sub("s3://(.*?)/(.*)", "\\2", .data$path), - dl = list( - s3$download_file( - Bucket = .data$s3bucket, Key = .data$s3key, Filename = .data$out - ) - ) - ) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {s3dir}}") - d_filt |> - dplyr::select("path", "type", "size") |> - as.data.frame() |> - print() - } -} diff --git a/R/tidy.R b/R/tidy.R index a1b9313..06fac82 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -1,3 +1,30 @@ +#' Tidy Files +#' +#' @param x Tibble with `localpath` to file and the function `type` to parse it. +#' @param envir the environment in which to evaluate the function e.g. use `self` +#' when using inside R6 classes. +#' +#' @return Tibble with parsed data in a `data` list-column. +#' @examples +#' \dontrun{ +#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" +#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz" +#' p <- file.path(p1, p2) +#' x <- tibble::tibble(type = "readr::read_tsv", localpath = p) +#' tidy_files(x) +#' } +#' +#' @export +tidy_files <- function(x, envir = parent.frame()) { + assertthat::assert_that(is.data.frame(x)) + assertthat::assert_that(all(c("type", "localpath") %in% colnames(x))) + x |> + dplyr::rowwise() |> + dplyr::mutate( + data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath)) + ) +} + #' Tidy UMCCR Results #' #' Tidies UMCCR workflow results into a list of tibbles and writes individual diff --git a/R/umccrise.R b/R/umccrise.R index e358d2d..ffbe5c5 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -1,80 +1,156 @@ -#' UmccriseCanRepTables R6 Class +#' Wf_umccrise R6 Class #' #' @description -#' Reads and writes tidy versions of files within the `cancer_report_tables` directory -#' output from the `umccrise` workflow. 
+#' Reads and writes tidy versions of files from the `umccrise` workflow #' #' @examples #' \dontrun{ -#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" -#' p <- file.path(p1, p2) -#' obj <- UmccriseCanRepTables$new(p) -#' obj$path -#' obj$contents -#' d <- obj$read() -#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") +#' +#' #---- LOCAL ----# +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' p1_local <- "~/icav1/g/production/analysis_data" +#' p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um1$list_files(max_files = 10) +#' um1$list_files_filter_relevant() +#' d <- um1$download_files(max_files = 1000, dryrun = F) +#' d_tidy <- um1$tidy_files(d) +#' d_write <- um1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) +#' +#' #---- GDS ----# +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' p1_gds <- "gds://production/analysis_data" +#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' um2$list_files(max_files = 8) +#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) +#' d <- um2$download_files( +#' outdir = outdir, ica_token = token, +#' max_files = 1000, dryrun = F +#' ) +#' d_tidy <- um2$tidy_files(d) +#' d_write <- um2$write( +#' d_tidy, +#' outdir = file.path(outdir, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) #' } #' #' @export -UmccriseCanRepTables <- R6::R6Class( - "UmccriseCanRepTables", +Wf_umccrise <- R6::R6Class( + "Wf_umccrise", + inherit = Wf, public = list( - #' @field path Path to the `cancer_report_tables` directory. - #' @field contents Tibble with file path, basename, and size. - path = NULL, - contents = NULL, - #' @description Create a new UmccriseCanRepTables object. - #' @param path Path to the `cancer_report_tables` directory. - initialize = function(path = NULL) { - stopifnot(is.character(path), length(path) == 1) - self$path <- normalizePath(path) - self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |> - dplyr::mutate( - bname = basename(.data$path), - size = as.character(trimws(.data$size)) - ) |> - dplyr::select("path", "bname", "size") + #' @field SubjectID The SubjectID of the sample (needed for path lookup). + #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + SubjectID = NULL, + SampleID_tumor = NULL, + #' @description Create a new Wf_umccrise object. + #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). + #' @param SubjectID The SubjectID of the sample (needed for path lookup). + #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). 
+ initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { + wname <- "umccrise" + regexes <- tibble::tribble( + ~regex, ~fun, + "-chord\\.tsv\\.gz$", "chordtsv", + "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", + "-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", + "-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", + "-dbs\\.tsv\\.gz$", "sigsdbstsv", + "-indel\\.tsv\\.gz$", "sigsindeltsv", + "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", + "multiqc_conpair\\.txt$", "conpairmultiqc", + "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" + ) |> + dplyr::mutate(fun = paste0("read_", .data$fun)) + + super$initialize(path = path, wname = wname, regexes = regexes) + self$SubjectID <- SubjectID + self$SampleID_tumor <- SampleID_tumor }, - #' @description Print details about the cancer_report_tables directory. + #' @description Print details about the Workflow. #' @param ... (ignored). print = function(...) { - bnames <- self$contents |> - dplyr::mutate( - low = tolower(.data$bname), - ) |> - dplyr::arrange(.data$low) |> - dplyr::mutate( - n = dplyr::row_number(), - bn = glue("{.data$n}. {.data$bname} ({.data$size})") - ) |> - dplyr::pull("bn") - cat("#--- UmccriseCanRepTables ---#\n") - cat(glue("Path: {self$path}"), "\n") - cat("Contents:\n") - cat(bnames, sep = "\n") + res <- tibble::tribble( + ~var, ~value, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem, + "SubjectID", self$SubjectID, + "SampleID_tumor", self$SampleID_tumor + ) + print(res) invisible(self) }, - #' @description Returns file with given pattern from the cancer_report_tables directory. - #' @param pat File pattern to look for. - grep_file = function(pat) { - x <- self$contents |> - dplyr::filter(grepl(pat, .data$path)) |> - dplyr::pull(.data$path) - if (length(x) > 1) { - fnames <- paste(x, collapse = ", ") - cli::cli_abort("More than 1 match found for {pat} ({fnames}). Aborting.") - } - if (length(x) == 0) { - return("") # file.exists("") returns FALSE - } - return(x) + #' @description List dracarys files under given path + #' @param max_files Max number of files to list (for gds/s3 only). + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + path <- self$path + dir_final <- file.path(path, glue("{self$SubjectID}__{self$SampleID_tumor}")) + dir_work <- file.path(path, "work", glue("{self$SubjectID}__{self$SampleID_tumor}")) + dir_work_pcgr <- file.path(dir_work, "pcgr") # for pcgr json + f1 <- super$list_files_filter_relevant(path = dir_final, max_files = 300, ica_token = ica_token) + f2 <- super$list_files_filter_relevant(path = dir_work_pcgr, max_files = 50, ica_token = ica_token) + f_all <- dplyr::bind_rows(f1, f2) + return(f_all) }, - - #' @description Read `chord.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `chord.tsv.gz` file. + #' @description Download files from GDS/S3 to local filesystem. + #' @param outdir Path to output directory. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param max_files Max number of files to list. + #' @param dryrun If TRUE, just list the files that will be downloaded (don't + #' download them). 
+ #' @param recursive Should files be returned recursively _in and under_ the specified + #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). + #' @param list_filter_fun Function to filter relevant files. + download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL) { + super$download_files( + outdir = outdir, ica_token = ica_token, max_files = max_files, + dryrun = dryrun, recursive = recursive, + list_filter_fun = self$list_files_filter_relevant + ) + }, + #' @description Read `pcgr.json.gz` file. + #' @param x Path to file. + read_pcgrjson = function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) + }, + #' @description Read `chord.tsv.gz` cancer report file. + #' @param x Path to file. read_chordtsv = function(x) { ct <- readr::cols_only( p_hrd = "d", @@ -85,10 +161,8 @@ UmccriseCanRepTables <- R6::R6Class( ) read_tsvgz(x, col_types = ct) }, - #' @description Read `hrdetect.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `hrdetect.tsv.gz` file. + #' @description Read `hrdetect.tsv.gz` cancer report file. + #' @param x Path to file. read_hrdetecttsv = function(x) { ct <- readr::cols( .default = "d", @@ -97,24 +171,37 @@ UmccriseCanRepTables <- R6::R6Class( read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) }, - - - #' @description Read `snv_20XX.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `snv_20XX.tsv.gz` file. - read_sigs = function(x) { + #' @description Read signature cancer report file. + #' @param x Path to file. + read_sigstsv = function(x) { ct <- readr::cols( .default = "d", Signature = "c" ) read_tsvgz(x, col_types = ct) }, - - #' @description Read `qc_summary.tsv.gz` file output from umccrise. - #' - #' @param x (`character(1)`)\cr - #' Path to `qc_summary.tsv.gz` file. + #' @description Read `snv_2015.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2015tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `snv_2020.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2020tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `dbs.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsdbstsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `indel.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsindeltsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `qc_summary.tsv.gz` cancer report file. + #' @param x Path to file. read_qcsummarytsv = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) d |> @@ -147,48 +234,92 @@ UmccriseCanRepTables <- R6::R6Class( "hypermutated", "bpi_enabled" ) }, - #' @description - #' Reads contents of `cancer_report_tables` directory output by umccrise. - #' - #' @return A list of tibbles. 
- #' @export - read = function() { - # now return all as list elements - list( - chord = self$grep_file("-chord\\.tsv\\.gz$") |> self$read_chordtsv(), - hrdetect = self$grep_file("-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(), - sigs2015 = self$grep_file("-snv_2015\\.tsv\\.gz$") |> self$read_sigs(), - sigs2020 = self$grep_file("-snv_2020\\.tsv\\.gz$") |> self$read_sigs(), - sigsdbs = self$grep_file("-dbs\\.tsv\\.gz$") |> self$read_sigs(), - sigsindel = self$grep_file("-indel\\.tsv\\.gz$") |> self$read_sigs(), - qcsum = self$grep_file("-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv() + #' @description Read multiqc_conpair.txt file. + #' @param x Path to file. + read_conpairmultiqc = function(x) { + um_ref_samples <- c("Alice", "Bob", "Chen", "Elon", "Dakota") + um_ref_samples <- paste0(um_ref_samples, rep(c("_T", "_B", ""), each = length(um_ref_samples))) + cnames <- list( + old = c( + "Sample", "concordance_concordance", "concordance_used_markers", + "concordance_total_markers", "concordance_marker_threshold", + "concordance_min_mapping_quality", "concordance_min_base_quality", + "contamination" + ), + new = c( + "sampleid", "contamination", "concordance", "markers_used", + "markers_total", "marker_threshold", + "mapq_min", "baseq_min" + ) ) - }, - - #' @description - #' Writes tidied contents of `cancer_report_tables` directory output by umccrise. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) + ctypes <- list( + old = c("cddddddd"), + new = c("cddddddd") + ) + if (!file.exists(x)) { + return(empty_tbl(cnames$new, ctypes$new)) } - d_write <- d |> - tibble::enframe(name = "section") |> - dplyr::rowwise() |> - dplyr::mutate( - section_low = tolower(.data$section), - p = glue("{prefix}_{.data$section_low}"), - out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) - ) |> - dplyr::ungroup() |> - dplyr::select("section", "value") |> - tibble::deframe() - invisible(d_write) + d1 <- readr::read_tsv(x, col_types = readr::cols(.default = "d", Sample = "c")) + assertthat::assert_that(all(colnames(d1) == cnames$old)) + d1 |> + dplyr::filter(!.data$Sample %in% um_ref_samples) |> + dplyr::relocate("contamination", .after = "Sample") |> + rlang::set_names(cnames$new) } - ) + ) # end public ) + +#' umccrise Download Tidy and Write +#' +#' Downloads files from the `umccrise` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' local filesystem). +#' @param SubjectID The SubjectID of the sample (needed for path lookup). +#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param outdir Path to output directory. +#' @param max_files Max number of files to list. +#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @param format Format of output files. +#' @return List where each element is a tidy tibble of a umccrise file. 
+#' +#' @examples +#' \dontrun{ +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' d <- Wf_umccrise_download_tidy_write( +#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' outdir = outdir, +#' dryrun = F +#' ) +#' } +#' @export +Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor, + outdir, format = "rds", max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE) { + um <- Wf_umccrise$new( + path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + ) + d_dl <- um$download_files( + outdir = outdir, ica_token = ica_token, + max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- um$tidy_files(d_dl) + d_write <- um$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = format + ) + return(d_write) + } + return(d_dl) +} diff --git a/R/utils.R b/R/utils.R index f61fec9..378588b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -117,6 +117,34 @@ write_dracarys <- function(obj, prefix, out_format, drid = NULL) { return(invisible(obj)) } +#' Write List of Tidy Tibbles +#' +#' @param list_of_tbls List of tidy tibbles. +#' @param out_dir Output directory. +#' @param prefix Prefix of output file(s). +#' @param out_format Format of output file(s). +#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). +#' +#' @return Tibble with nested objects that have been written to the output directory. +#' @export +write_dracarys_list_of_tbls <- function(list_of_tbls, out_dir = NULL, prefix = NULL, out_format = "tsv", drid = NULL) { + assertthat::assert_that(!is.null(prefix)) + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + d_write <- list_of_tbls |> + tibble::enframe(name = "section") |> + dplyr::rowwise() |> + dplyr::mutate( + section_low = tolower(.data$section), + p = glue("{prefix}_{.data$section_low}"), + out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) + ) |> + dplyr::ungroup() |> + dplyr::select("section", "value") |> + tibble::deframe() + invisible(d_write) +} #' Create Empty Tibble Given Column Names #' @@ -165,7 +193,24 @@ read_jsongz_rjsonio <- function(x, ...) { RJSONIO::fromJSON(x, ...) } - +#' Grep File Pattern +#' +#' @param path Path to look for file. +#' @param regexp A regular expression (e.g. [.]csv$) passed on to `grep()` to filter paths. +#' +#' @return The path to the file or an empty string if no match is found. +#' @export +grep_file <- function(path = ".", regexp) { + x <- fs::dir_ls(path, recurse = TRUE, type = "file", regexp = regexp) + if (length(x) > 1) { + fnames <- paste(x, collapse = ", ") + cli::cli_abort("More than 1 match found for {regexp} ({fnames}). 
Aborting.") + } + if (length(x) == 0) { + return("") # file.exists("") returns FALSE + } + return(x) +} #' @noRd dummy1 <- function() { diff --git a/inst/rmd/umccr_workflows/umccrise/.gitignore b/inst/rmd/umccr_workflows/umccrise/.gitignore new file mode 100644 index 0000000..0182e8f --- /dev/null +++ b/inst/rmd/umccr_workflows/umccrise/.gitignore @@ -0,0 +1,3 @@ +nogit + +/.quarto/ diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R new file mode 100755 index 0000000..3b5ede3 --- /dev/null +++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R @@ -0,0 +1,79 @@ +#!/usr/bin/env Rscript + +{ + require(dplyr) + require(assertthat, include.only = "assert_that") + require(dracarys, include.only = "Wf_umccrise_download_tidy_write") + require(glue, include.only = "glue") + require(here, include.only = "here") + require(rportal, include.only = c("portaldb_query_workflow")) + require(tidyr, include.only = "separate_wider_delim") +} + +query_workflow_umccrise <- function(start_date, end_date) { + q1 <- glue( + "WHERE \"type_name\" = 'umccrise'", + "AND \"start\" >= date(\'{start_date}\')", + "AND \"end\" <= date(\'{end_date}\')", + "ORDER BY \"start\" DESC;" + ) + rportal::portaldb_query_workflow(q1) +} + +query_limsrow_libids <- function(libids) { + assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) + libids <- unique(libids) |> + paste(collapse = "|") + q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") + rportal::portaldb_query_limsrow(q1) +} + +# first read in the workflows table, extract metadata, then join with lims +start_date <- "2024-08-29" +end_date <- "2024-09-01" +meta_raw <- query_workflow_umccrise(start_date, end_date) +meta <- meta_raw |> + rportal::meta_umccrise() +lims_raw <- query_limsrow_libids(meta$LibraryID_tumor) +lims <- lims_raw |> + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" + ) |> + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, external_sample_id, + project_name, project_owner, + source, quality, workflow + ) |> + distinct() +table(lims$library_id %in% meta$LibraryID_tumor) # double-check + +meta_lims <- meta |> + left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> + mutate(rownum = row_number()) |> + select( + rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, + SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +meta_lims |> + saveRDS(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/meta/{start_date}_{end_date}.rds"))) + +d <- meta_lims |> + rowwise() |> + mutate( + indir = .data$gds_outdir_umccrise, + outdir = file.path(sub("gds://", "", .data$indir)), + outdir = file.path(normalizePath("~/icav1/g"), .data$outdir), + res = list( + dracarys::Wf_umccrise_download_tidy_write( + path = .data$indir, SubjectID = .data$SubjectID, SampleID_tumor = .data$SampleID_tumor, + outdir = .data$outdir, max_files = 1000, dryrun = FALSE + ) + ) + ) |> + ungroup() + +d |> + saveRDS(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/results_{start_date}_{end_date}.rds"))) diff --git a/inst/rmd/umccr_workflows/umccrise/render.sh b/inst/rmd/umccr_workflows/umccrise/render.sh new file mode 100644 index 0000000..bfff2c7 --- /dev/null +++ b/inst/rmd/umccr_workflows/umccrise/render.sh @@ -0,0 +1,9 @@ 
+date_start="2024-08-29" +date_end="2024-09-01" +out="umccrise_${date_start}_${date_end}.html" + +quarto render summary_umccrise.qmd \ + -P date_start:${date_start} \ + -P date_end:${date_end} \ + -o ${out} \ + --output-dir nogit/html diff --git a/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd new file mode 100644 index 0000000..0f9390e --- /dev/null +++ b/inst/rmd/umccr_workflows/umccrise/summary_umccrise.qmd @@ -0,0 +1,369 @@ +--- +title: "{{< meta params.title >}}" +subtitle: "Period: `r paste(params$date_start, ' to ', params$date_end)`" +author: "UMCCR - Genomics Platform Group" +date: now +date-format: "YYYY-MM-DD HH:mm Z" +execute: + echo: false +format: + html: + toc: true + toc-expand: 1 + toc-title: Contents + toc-location: body + highlight-style: github + number-sections: false + link-external-icon: true + link-external-newwindow: true + embed-resources: true + code-copy: true + code-link: true + code-fold: true + code-block-border-left: true + smooth-scroll: true + grid: + body-width: 1300px +params: + title: "UMCCR umccrise Workflow Summary" + date_start: "XXXX-XX-XX" + date_end: "XXXX-XX-XX" +--- + +```{r} +#| label: pkg_load +#| message: false +{ + require(dplyr) # import all dplyr funcs + require(readr, include.only = c("read_rds")) + require(purrr, include.only = c("map")) + require(tidyr, include.only = c("unnest_wider")) + require(dracarys, include.only = c("session_info_kable")) + require(glue, include.only = "glue") + require(here, include.only = "here") + require(knitr, include.only = "kable") + require(reactable, include.only = "reactable") + require(ggplot2, include.only = c("ggplot", "aes")) + require(lubridate, include.only = c("as_datetime")) + require(plotly, include.only = c("ggplotly")) +} +set.seed(42) +``` + +```{r} +#| label: load_data +date_start <- "2024-08-29" +date_end <- "2024-09-01" +d_raw <- readr::read_rds(here(glue("inst/rmd/umccr_workflows/umccrise/nogit/results_{date_start}_{date_end}.rds"))) +``` + +```{r} +#| label: funcs +dt_view <- function(x, id, ...) { + htmltools::browsable( + htmltools::tagList( + htmltools::tags$button( + htmltools::tagList(fontawesome::fa("download"), "CSV"), + onclick = glue("Reactable.downloadDataCSV('{id}', '{id}.csv')") + ), + x |> + reactable::reactable( + bordered = TRUE, + filterable = TRUE, + fullWidth = TRUE, + height = 800, + highlight = TRUE, + pagination = FALSE, + resizable = TRUE, + searchable = TRUE, + sortable = TRUE, + striped = TRUE, + wrap = FALSE, + elementId = id, + ... 
+ ) + ) + ) +} +``` + +## Metadata + +```{r} +#| label: metadata +meta <- d_raw |> + select( + rownum, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +dt_view(meta, id = "metadata") +``` + +## Results + +```{r} +#| label: process +# one row per file type - not all samples have sigsdbstsv +d <- d_raw |> + tidyr::unnest_longer(res, indices_to = "filetype") +# main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor") +main_cols <- c("portal_run_id") +``` + +```{r} +#| label: qcsum +qcsum <- d |> + filter(filetype == "qcsummarytsv") |> + select(all_of(main_cols), res) |> + unnest_wider(res) +``` + +```{r} +#| label: pcgr +pcgr <- d |> + filter(filetype == "pcgrjson") |> + select(all_of(main_cols), res) |> + unnest_wider(res) |> + rename( + msi_fraction_indels_pcgr = "fracIndels", + msi_pcgr = "predicted_class", + tmb_pcgr = "tmb_estimate", + n_tmb_pcgr = "n_tmb" + ) |> + mutate(msi_pcgr = sub(" \\(.*\\)", "", msi_pcgr)) +``` + +```{r} +#| label: conpair +sampleids <- d_raw |> + select(SampleID_tumor, SampleID_normal) |> + tidyr::pivot_longer(everything(), values_to = "sampleid") |> + mutate(phenotype = sub("SampleID_", "", .data$name)) |> + select(sampleid, phenotype) |> + distinct() +conpair_raw <- d |> + filter(filetype == "conpairmultiqc") |> + select(all_of(main_cols), res) |> + tidyr::unnest(res) |> + left_join(sampleids, by = "sampleid") |> + select(-sampleid) +conpair_tumor <- conpair_raw |> + filter(phenotype == "tumor") +conpair_normal <- conpair_raw |> + filter(phenotype == "normal") |> + select(portal_run_id, contamination) +conpair <- conpair_tumor |> + left_join(conpair_normal, by = "portal_run_id", suffix = c("_tumor", "_normal")) |> + select(portal_run_id, + contamination_tumor_conpair = "contamination_tumor", + contamination_normal_conpair = "contamination_normal", + concordance_conpair = "concordance" + ) +``` + +```{r} +#| label: hrd +hrd_chord <- d |> + filter(filetype == "chordtsv") |> + unnest_wider(res) |> + select(all_of(main_cols), + chord_p_hrd = "p_hrd", + chord_hr_status = "hr_status", + chord_hrd_type = "hrd_type", + chord_p_BRCA1 = "p_BRCA1", + chord_p_BRCA2 = "p_BRCA2" + ) +hrd_hrdetect <- d |> + filter(filetype == "hrdetecttsv") |> + unnest_wider(res) |> + select(all_of(main_cols), hrdetect_prob = "Probability") +``` + +```{r} +#| label: sigs +sigs_snv2015 <- d |> + filter(filetype == "sigssnv2015tsv") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_snv2020 <- d |> + filter(filetype == "sigssnv2020tsv") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_dbs <- d |> + filter(filetype == "sigsdbstsv") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_indel <- d |> + filter(filetype == "sigsindeltsv") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +dsig <- bind_rows( + list( + snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel + ), + .id = "Sig_group" +) + +# keep top two ranked sigs +dsig_filt <- dsig |> + group_by(Sig_group, portal_run_id) |> + mutate(tot_sig_vars = sum(Contribution)) |> + arrange(Rank) |> + slice_head(n = 2) 
|> + # some sigs have same Rank so use explicit sig_rank + mutate(sig_rank = row_number()) |> + ungroup() |> + mutate( + sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") + ) |> + select(Sig_group, portal_run_id, sig_rank, sig_summary) |> + tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> + mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> + select(Sig_group, portal_run_id, sig_top2) |> + tidyr::pivot_wider(names_from = Sig_group, values_from = sig_top2) |> + select(portal_run_id, snv2015, snv2020, dbs, indel) +``` + +```{r} +#| label: qc_all +dall <- d_raw |> + select( + rownum, + date_analysed = "start", portal_run_id, + SubjectID, LibraryID_tumor, SampleID_tumor, + external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) |> + left_join(qcsum, by = "portal_run_id") |> + left_join(hrd_chord, by = "portal_run_id") |> + left_join(hrd_hrdetect, by = "portal_run_id") |> + left_join(pcgr, by = "portal_run_id") |> + left_join(conpair, by = "portal_run_id") +``` + +### Summary Metrics + +```{r} +#| label: summary_metrics +dt_view(dall, "summary_metrics") +``` + +### HRD Plot + +```{r} +#| label: hrd_plot +#| fig-width: 15 +#| fig-height: 15 +p1 <- dall |> + mutate(sbj = glue("{SubjectID}_{LibraryID_tumor}")) |> + select(date_analysed, sbj, chord = hrd_chord, hrdetect = hrd_hrdetect) |> + tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") |> + ggplot2::ggplot(aes(x = date_analysed, y = probability, label = sbj)) + + ggplot2::geom_point(aes(colour = method)) + + ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) + + ggplot2::theme_bw() + + ggplot2::ggtitle("CHORD vs. HRDetect per SubjectID") + +plotly::ggplotly(p1) +``` + +### Signatures + +#### All (SNV, Indel, DBS) + +```{r} +#| label: sig_results_all +dsig |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + dt_view("sig_results_all") +``` + +#### Top 2 + +```{r} +#| label: sig_results_top2 +dsig_filt |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + arrange(rownum) |> + dt_view("sig_results_top2") +``` + +#### Top 3 SNV2015 + +```{r} +#| label: sig_results +#| fig-width: 15 +#| fig-height: 65 + +sig_order2015 <- paste0("Sig", 1:30) +# sig_order2020 <- paste0( +# "SBS", +# c( +# 1:6, +# paste0(7, c("a", "b", "c", "d")), +# 8:9, +# paste0(10, c("a", "b", "c", "d")), +# 11:16, +# paste0(17, c("a", "b")), +# 18:60, +# 84:94 +# ) +# ) + +p2_prep <- dsig |> + filter( + Sig_group == "snv2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> + mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> + select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> + mutate(Signature = factor(Signature, levels = sig_order2015)) +p2 <- p2_prep |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::theme_bw(base_size = 7) + +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) +``` + +## Metadata Summary + +::: {.panel-tabset .nav-pills} + +### Project Name/Owner + +```{r} +#| label: project_owner_name 
+dall |> + count(project_name, project_owner) |> + knitr::kable() +``` + +### Source / Quality + +```{r} +#| label: source_quality +count(dall, source, quality) |> knitr::kable() +``` + +### Workflow + +```{r} +#| label: workflow_summary +count(dall, workflow) |> knitr::kable() +``` + +::: + + diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R index 6602d87..ea4d2f4 100644 --- a/inst/scripts/umccrise_run.R +++ b/inst/scripts/umccrise_run.R @@ -1,74 +1,62 @@ require(dracarys) -require(here) -require(glue) +require(rportal, include.only = "portaldb_query_workflow") +require(here, include.only = "here") +require(glue, include.only = "glue") +require(readr, include.only = "read_rds") require(dplyr) -require(readr) - -#---- GDS ----# -# read last 1000 umccrise runs from portal -# 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded -date1 <- "2023-09-04" -pmeta_raw_rds <- here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_raw.rds")) -# pmeta_raw <- dracarys::portal_meta_read(rows = 1000, params = "&type_name=umccrise") -# saveRDS(pmeta_raw, file = pmeta_raw_rds) -pmeta <- readr::read_rds(pmeta_raw_rds) |> - dracarys::meta_umccrise(status = "Succeeded") -lims_raw_rds <- here(glue("nogit/umccrise/rds/lims/{date1}_lims_raw.rds")) -# lims_raw <- dracarys::glims_read() +require(tidyr, include.only = "separate_wider_delim") + +start_date <- "2024-08-29" +query_workflow_umccrise <- function(start_date) { + q1 <- glue( + "WHERE \"type_name\" = 'umccrise' AND \"start\" > date(\'{start_date}\') ", + "ORDER BY \"start\" DESC;" + ) + rportal::portaldb_query_workflow(q1) +} + +query_limsrow_libids <- function(libids) { + assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) + libids <- unique(libids) |> + paste(collapse = "|") + q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") + rportal::portaldb_query_limsrow(q1) +} + +# p_raw <- query_workflow_umccrise(start_date) +p_raw_rds <- here(glue("nogit/data_portal/workflows/{start_date}.rds")) +# saveRDS(p_raw, file = p_raw_rds) +p_raw <- readr::read_rds(p_raw_rds) + +p <- p_raw |> + rportal::meta_umccrise(status = "Succeeded") +# lims_raw <- query_limsrow_libids(p$LibraryID_tumor) +lims_raw_rds <- here(glue("nogit/data_portal/lims/{start_date}.rds")) # saveRDS(lims_raw, file = lims_raw_rds) +# L2100192 is L2100192_rerun in the lims, 15 libs are rerun/topup/topup2 lims_raw <- readr::read_rds(lims_raw_rds) lims <- lims_raw |> - filter(Type == "WGS") |> - filter(LibraryID %in% c(pmeta$LibraryID_normal, pmeta$LibraryID_tumor)) -table(pmeta$LibraryID_tumor %in% lims$LibraryID) -table(pmeta$LibraryID_normal %in% lims$LibraryID) - -# The final results sit under gds_outdir_umccrise/__/ -# We need to get the SampleID_tumor for runs before 2023-04-07. We can do that -# by using the LibraryID_tumor to match up with the glims. 
-missing_tumor_sampleid <- pmeta |> - filter(end < "2023-04-07") |> - pull(LibraryID_tumor) - -table(missing_tumor_sampleid %in% lims$LibraryID) -libid2sampid <- lims |> - filter(LibraryID %in% missing_tumor_sampleid) |> - select(LibraryID_tumor = LibraryID, SampleID_tumor = SampleID) - -d <- pmeta |> - left_join(libid2sampid, by = "LibraryID_tumor") |> - mutate(SampleID_tumor = if_else(is.na(SampleID_tumor.x), SampleID_tumor.y, SampleID_tumor.x)) |> - select(-c(SampleID_tumor.x, SampleID_tumor.y)) |> - relocate(SampleID_tumor, .before = SampleID_normal) |> - mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) -d - -# final portal meta for umccrise runs -# columns: -# "id", "wfr_name", "wfr_id", "version", "end_status", "start", "end", "portal_run_id", -# "SubjectID", "LibraryID_tumor", "LibraryID_normal", "SampleID_tumor", "SampleID_normal", -# "gds_outdir_umccrise", "gds_indir_dragen_somatic", "gds_indir_dragen_germline", "gds_infile_genomes_tar" -saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) - -#---- S3 ----# -pat <- "qc_summary.tsv.gz" -rows <- 1000 -d_s3_raw <- dracarys::s3_search(pat = pat, rows = rows) - -d_s3 <- d_s3_raw |> - arrange(desc(date_aest)) |> - mutate( - bname = basename(path), - dir1 = dirname(path), # path/to/dirA/cancer_report_tables - dir2 = basename(dirname(dir1)), # dirA - sbj_samp_lib = sub(".*__(.*)", "\\1", dir2), - SubjectID = sub("(SBJ[0-9]{5})_.*", "\\1", sbj_samp_lib), - SampleID_tumor = sub("SBJ.*?_(.*?)_.*", "\\1", sbj_samp_lib), - LibraryID_tumor = sub("SBJ.*?_.*?_(.*)", "\\1", sbj_samp_lib), - rerun = grepl("rerun", .data$LibraryID_tumor) + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" ) |> - select(dir1, SubjectID, LibraryID_tumor, SampleID_tumor, date = date_aest, rerun) + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, external_sample_id, + project_name, project_owner, + source, quality + ) |> + distinct() +table(lims$library_id %in% p$LibraryID_tumor) # double-check + +d <- p |> + left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> + mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) |> + select( + wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, + SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, external_subject_id, external_sample_id, + project_owner, project_name, source, quality + ) +d -date2 <- "2023-09-12" -saveRDS(d_s3, file = here(glue("nogit/umccrise/rds/portal_meta/{date2}_pmeta_s3.rds"))) -# now we have S3 paths and metadata, so all we need is to generate presigned URLs and read the data +saveRDS(d, file = here(glue("nogit/data_portal/workflows/umccrise_tidy_{start_date}.rds"))) diff --git a/man/BclconvertReports375.Rd b/man/BclconvertReports375.Rd index 01aba8b..e644a84 100644 --- a/man/BclconvertReports375.Rd +++ b/man/BclconvertReports375.Rd @@ -10,8 +10,8 @@ https://support-docs.illumina.com/SW/dragen_v42/Content/SW/DRAGEN/OutputFiles.ht } \examples{ \dontrun{ -p1 <- "240816_A01052_0220_AHM7VHDSXC/202408195d4f2fc4/Reports" -b <- here::here("nogit/bcl_convert", p1) |> +p1 <- "nogit/bcl_convert/WGS_TsqNano/Reports" +b <- here::here(p1) |> BclconvertReports375$new() b$path b$contents diff --git a/man/UmccriseCanRepTables.Rd b/man/UmccriseCanRepTables.Rd deleted file mode 100644 index 
67e20f3..0000000 --- a/man/UmccriseCanRepTables.Rd +++ /dev/null @@ -1,231 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/umccrise.R -\name{UmccriseCanRepTables} -\alias{UmccriseCanRepTables} -\title{UmccriseCanRepTables R6 Class} -\description{ -Reads and writes tidy versions of files within the \code{cancer_report_tables} directory -output from the \code{umccrise} workflow. -} -\examples{ -\dontrun{ -p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043" -p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables" -p <- file.path(p1, p2) -obj <- UmccriseCanRepTables$new(p) -obj$path -obj$contents -d <- obj$read() -obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv") -} - -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{cancer_report_tables} directory.} - -\item{\code{contents}}{Tibble with file path, basename, and size.} -} -\if{html}{\out{
}} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-UmccriseCanRepTables-new}{\code{UmccriseCanRepTables$new()}} -\item \href{#method-UmccriseCanRepTables-print}{\code{UmccriseCanRepTables$print()}} -\item \href{#method-UmccriseCanRepTables-grep_file}{\code{UmccriseCanRepTables$grep_file()}} -\item \href{#method-UmccriseCanRepTables-read_chordtsv}{\code{UmccriseCanRepTables$read_chordtsv()}} -\item \href{#method-UmccriseCanRepTables-read_hrdetecttsv}{\code{UmccriseCanRepTables$read_hrdetecttsv()}} -\item \href{#method-UmccriseCanRepTables-read_sigs}{\code{UmccriseCanRepTables$read_sigs()}} -\item \href{#method-UmccriseCanRepTables-read_qcsummarytsv}{\code{UmccriseCanRepTables$read_qcsummarytsv()}} -\item \href{#method-UmccriseCanRepTables-read}{\code{UmccriseCanRepTables$read()}} -\item \href{#method-UmccriseCanRepTables-write}{\code{UmccriseCanRepTables$write()}} -\item \href{#method-UmccriseCanRepTables-clone}{\code{UmccriseCanRepTables$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-new}{}}} -\subsection{Method \code{new()}}{ -Create a new UmccriseCanRepTables object. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$new(path = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Path to the \code{cancer_report_tables} directory.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-print}{}}} -\subsection{Method \code{print()}}{ -Print details about the cancer_report_tables directory. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$print(...)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{...}}{(ignored).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-grep_file}{}}} -\subsection{Method \code{grep_file()}}{ -Returns file with given pattern from the cancer_report_tables directory. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$grep_file(pat)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{pat}}{File pattern to look for.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_chordtsv}{}}} -\subsection{Method \code{read_chordtsv()}}{ -Read \code{chord.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_chordtsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{chord.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_hrdetecttsv}{}}} -\subsection{Method \code{read_hrdetecttsv()}}{ -Read \code{hrdetect.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_hrdetecttsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{hrdetect.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_sigs}{}}} -\subsection{Method \code{read_sigs()}}{ -Read \code{snv_20XX.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_sigs(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{snv_20XX.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read_qcsummarytsv}{}}} -\subsection{Method \code{read_qcsummarytsv()}}{ -Read \code{qc_summary.tsv.gz} file output from umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read_qcsummarytsv(x)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{(\code{character(1)})\cr -Path to \code{qc_summary.tsv.gz} file.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-read}{}}} -\subsection{Method \code{read()}}{ -Reads contents of \code{cancer_report_tables} directory output by umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -A list of tibbles. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-write}{}}} -\subsection{Method \code{write()}}{ -Writes tidied contents of \code{cancer_report_tables} directory output by umccrise. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-UmccriseCanRepTables-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmccriseCanRepTables$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/Wf.Rd b/man/Wf.Rd index d5c99ae..f785399 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -2,38 +2,83 @@ % Please edit documentation in R/Wf.R \name{Wf} \alias{Wf} -\title{Workflow R6 Class} +\title{Workflow} \description{ Workflow is a base R6 class representing a bioinformatic workflow run from a UMCCR workflow manager. + +A workflow has: +\itemize{ +\item a directory path with all the raw output files (either on GDS, S3 or +local filesystem) +\item a subset of files that are of interest for ingestion +\itemize{ +\item tibble with full path and basename columns +} +\item a set of parsers that can parse and tidy those files +\itemize{ +\item each parser takes a path and returns a tidy tibble +} +\item a list of tidy tibbles (or a tibble with nested tibbles) +} } \examples{ -p1 <- system.file("extdata/portaldb_workflow_top4.rds", package = "rportal") |> - readRDS() |> - dplyr::filter(type_name == "umccrise") |> - dplyr::slice(1) -w <- Wf$new( - prid = p1$portal_run_id, type = p1$type_name, start = p1$start, end = p1$end, - status = p1$end_status, input = p1$input, output = p1$output +\dontrun{ +regexes <- tibble::tribble( + ~regex, ~fun, + "-chord\\\\.tsv\\\\.gz$", "UmChordTsvFile", + "-hrdetect\\\\.tsv\\\\.gz$", "UmHrdetectTsvFile", + "-snv_2015\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-snv_2020\\\\.tsv\\\\.gz$", "UmSigsSnvFile", + "-dbs\\\\.tsv\\\\.gz$", "UmSigsDbsFile", + "-indel\\\\.tsv\\\\.gz$", "UmSigsIndelFile", + "-qc_summary\\\\.tsv\\\\.gz$", "UmQcSumFile", +) + +#---- LOCAL ----# +p1_local <- "~/icav1/g/production/analysis_data" +p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565") +um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) +um1$list_files(max_files = 10) +um1$list_files_filter_relevant(max_files = 10) + +#---- GDS ----# +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) +um2$list_files(max_files = 10) +um2$list_files_filter_relevant(ica_token = token, max_files = 500) +d <- um2$download_files( + outdir = outdir, ica_token = token, + max_files = 1000, dryrun = T ) -w +d_tidy <- um2$tidy_files(d) + +#---- S3 ----# +p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" +p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" +p <- file.path(p1_s3, p2_s3) +outdir <- sub("s3:/", "~/s3", p) +um3 <- Wf$new(path = p, wname = "sash", regexes = regexes) +um3$list_files(max_files = 10) +um3$list_files_filter_relevant(max_files = 50) +d <- um3$download_files(outdir = outdir, regexes = regexes, max_files = 50, dryrun = F) +} + } \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{prid}}{Portal run ID.} - -\item{\code{type}}{Workflow type.} - -\item{\code{start}}{Workflow start datetime.} +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} -\item{\code{end}}{Workflow end datetime.} +\item{\code{wname}}{Name of workflow (e.g. umccrise, sash).} -\item{\code{status}}{Workflow end status.} +\item{\code{filesystem}}{Filesystem of \code{path} (gds/s3/local).} -\item{\code{input}}{Workflow input JSON string.} - -\item{\code{output}}{Workflow output JSON string.} +\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{
}} } @@ -42,6 +87,11 @@ w \itemize{ \item \href{#method-Wf-new}{\code{Wf$new()}} \item \href{#method-Wf-print}{\code{Wf$print()}} +\item \href{#method-Wf-list_files}{\code{Wf$list_files()}} +\item \href{#method-Wf-list_files_filter_relevant}{\code{Wf$list_files_filter_relevant()}} +\item \href{#method-Wf-download_files}{\code{Wf$download_files()}} +\item \href{#method-Wf-tidy_files}{\code{Wf$tidy_files()}} +\item \href{#method-Wf-write}{\code{Wf$write()}} \item \href{#method-Wf-clone}{\code{Wf$clone()}} } } @@ -51,33 +101,17 @@ w \subsection{Method \code{new()}}{ Create a new Workflow object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf$new( - prid = NULL, - type = NULL, - start = NULL, - end = NULL, - status = NULL, - input = NULL, - output = NULL -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf$new(path = NULL, wname = NULL, regexes = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{prid}}{Portal run ID.} - -\item{\code{type}}{Workflow type.} +\item{\code{path}}{Path to directory with raw workflow results.} -\item{\code{start}}{Workflow start datetime.} +\item{\code{wname}}{Name of workflow.} -\item{\code{end}}{Workflow end datetime.} - -\item{\code{status}}{Workflow end status.} - -\item{\code{input}}{Workflow input JSON string.} - -\item{\code{output}}{Workflow output JSON string.} +\item{\code{regexes}}{Tibble with file \code{regex} and \code{fun}ction to parse it.} } \if{html}{\out{
}} } @@ -100,6 +134,144 @@ Print details about the Workflow. } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-list_files}{}}} +\subsection{Method \code{list_files()}}{ +List all files under given path. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$list_files( + path = self$path, + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path with raw results.} + +\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to \code{gds_list_files_dir} function.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$list_files_filter_relevant( + path = self$path, + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path with raw results.} + +\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} +\subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$download_files( + path = self$path, + outdir, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL, + list_filter_fun = NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path with raw results.} + +\item{\code{outdir}}{Path to output directory.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Max number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{\code{list_filter_fun}}{Function to filter relevant files.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-tidy_files}{}}} +\subsection{Method \code{tidy_files()}}{ +Tidy given files. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$tidy_files(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Tibble with \code{localpath} to file and the function \code{type} to parse it.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf-write}{}}} +\subsection{Method \code{write()}}{ +Write tidy data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf$write(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Tibble with tidy \code{data} and file \code{type}.} + +\item{\code{outdir}}{Directory path to output tidy files.} + +\item{\code{prefix}}{Prefix of output files.} + +\item{\code{format}}{Format of output files.} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd new file mode 100644 index 0000000..5fd5ade --- /dev/null +++ b/man/Wf_umccrise.Rd @@ -0,0 +1,383 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/umccrise.R +\name{Wf_umccrise} +\alias{Wf_umccrise} +\title{Wf_umccrise R6 Class} +\description{ +Reads and writes tidy versions of files from the \code{umccrise} workflow +} +\examples{ +\dontrun{ + +#---- LOCAL ----# +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +prefix <- glue("{SubjectID}__{SampleID_tumor}") +p1_local <- "~/icav1/g/production/analysis_data" +p <- file.path(p1_local, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +um1 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um1$list_files(max_files = 10) +um1$list_files_filter_relevant() +d <- um1$download_files(max_files = 1000, dryrun = F) +d_tidy <- um1$tidy_files(d) +d_write <- um1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) + +#---- GDS ----# +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +prefix <- glue("{SubjectID}__{SampleID_tumor}") +p1_gds <- "gds://production/analysis_data" +p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +um2 <- Wf_umccrise$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +um2$list_files(max_files = 8) +um2$list_files_filter_relevant(ica_token = token, max_files = 500) +d <- um2$download_files( + outdir = outdir, ica_token = token, + max_files = 1000, dryrun = F +) +d_tidy <- um2$tidy_files(d) +d_write <- um2$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) +} + +} +\section{Super class}{ +\code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_umccrise} +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} +\item \href{#method-Wf_umccrise-print}{\code{Wf_umccrise$print()}} +\item \href{#method-Wf_umccrise-list_files_filter_relevant}{\code{Wf_umccrise$list_files_filter_relevant()}} +\item \href{#method-Wf_umccrise-download_files}{\code{Wf_umccrise$download_files()}} +\item \href{#method-Wf_umccrise-read_pcgrjson}{\code{Wf_umccrise$read_pcgrjson()}} +\item \href{#method-Wf_umccrise-read_chordtsv}{\code{Wf_umccrise$read_chordtsv()}} +\item \href{#method-Wf_umccrise-read_hrdetecttsv}{\code{Wf_umccrise$read_hrdetecttsv()}} +\item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} +\item \href{#method-Wf_umccrise-read_sigssnv2015tsv}{\code{Wf_umccrise$read_sigssnv2015tsv()}} +\item \href{#method-Wf_umccrise-read_sigssnv2020tsv}{\code{Wf_umccrise$read_sigssnv2020tsv()}} +\item \href{#method-Wf_umccrise-read_sigsdbstsv}{\code{Wf_umccrise$read_sigsdbstsv()}} +\item \href{#method-Wf_umccrise-read_sigsindeltsv}{\code{Wf_umccrise$read_sigsindeltsv()}} +\item \href{#method-Wf_umccrise-read_qcsummarytsv}{\code{Wf_umccrise$read_qcsummarytsv()}} +\item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} +\item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} +} +} +\if{html}{\out{ +
Inherited methods + +
+}} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-new}{}}} +\subsection{Method \code{new()}}{ +Create a new Wf_umccrise object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} + +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-print}{}}} +\subsection{Method \code{print()}}{ +Print details about the Workflow. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$list_files_filter_relevant( + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-download_files}{}}} +\subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$download_files( + outdir, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outdir}}{Path to output directory.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Max number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} + +\item{\code{list_filter_fun}}{Function to filter relevant files.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgrjson}{}}} +\subsection{Method \code{read_pcgrjson()}}{ +Read \code{pcgr.json.gz} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgrjson(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_chordtsv}{}}} +\subsection{Method \code{read_chordtsv()}}{ +Read \code{chord.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_chordtsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdetecttsv}{}}} +\subsection{Method \code{read_hrdetecttsv()}}{ +Read \code{hrdetect.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdetecttsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigstsv}{}}} +\subsection{Method \code{read_sigstsv()}}{ +Read signature cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2015tsv}{}}} +\subsection{Method \code{read_sigssnv2015tsv()}}{ +Read \code{snv_2015.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2015tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2020tsv}{}}} +\subsection{Method \code{read_sigssnv2020tsv()}}{ +Read \code{snv_2020.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2020tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsdbstsv}{}}} +\subsection{Method \code{read_sigsdbstsv()}}{ +Read \code{dbs.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsdbstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsindeltsv}{}}} +\subsection{Method \code{read_sigsindeltsv()}}{ +Read \code{indel.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsindeltsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsummarytsv}{}}} +\subsection{Method \code{read_qcsummarytsv()}}{ +Read \code{qc_summary.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsummarytsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_conpairmultiqc}{}}} +\subsection{Method \code{read_conpairmultiqc()}}{ +Read multiqc_conpair.txt file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_conpairmultiqc(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_umccrise$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd
new file mode 100644
index 0000000..05ca0bf
--- /dev/null
+++ b/man/Wf_umccrise_download_tidy_write.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/umccrise.R
+\name{Wf_umccrise_download_tidy_write}
+\alias{Wf_umccrise_download_tidy_write}
+\title{umccrise Download Tidy and Write}
+\usage{
+Wf_umccrise_download_tidy_write(
+  path,
+  SubjectID,
+  SampleID_tumor,
+  outdir,
+  format = "rds",
+  max_files = 1000,
+  ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
+  dryrun = FALSE
+)
+}
+\arguments{
+\item{path}{Path to directory with raw workflow results (from GDS, S3, or
+local filesystem).}
+
+\item{SubjectID}{The SubjectID of the sample (needed for path lookup).}
+
+\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).}
+
+\item{outdir}{Path to output directory.}
+
+\item{format}{Format of output files.}
+
+\item{max_files}{Max number of files to list.}
+
+\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}
+
+\item{dryrun}{If TRUE, just list the files that will be downloaded (don't
+download them).}
+}
+\value{
+List where each element is a tidy tibble of a umccrise file.
+}
+\description{
+Downloads files from the \code{umccrise} workflow and writes them in a tidy format.
+}
+\examples{
+\dontrun{
+SubjectID <- "SBJ03043"
+SampleID_tumor <- "PRJ230004"
+p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise")
+p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063")
+outdir <- file.path(sub("gds:/", "~/icav1/g", p))
+token <- Sys.getenv("ICA_ACCESS_TOKEN")
+d <- Wf_umccrise_download_tidy_write(
+  path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor,
+  outdir = outdir,
+  dryrun = F
+)
+}
+}
diff --git a/man/dr_func_eval.Rd b/man/dr_func_eval.Rd
index 2dfedf1..0697c7f 100644
--- a/man/dr_func_eval.Rd
+++ b/man/dr_func_eval.Rd
@@ -4,13 +4,16 @@
 \alias{dr_func_eval}
 \title{Evaluate dracarys Function}
 \usage{
-dr_func_eval(f, v = NULL)
+dr_func_eval(f, v = NULL, envir = parent.frame())
 }
 \arguments{
 \item{f}{Name of function to evaluate.}

 \item{v}{Character vector of strings evaluating to functions. By default,
 this points to the functions in the DR_FILE_REGEX dracarys tibble.}
+
+\item{envir}{The environment in which to evaluate the function, e.g. use \code{self}
+when calling from inside R6 classes.}
 }
 \value{
 Evaluated function.
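The new `envir` argument to `dr_func_eval()` matters when the parser functions live on an R6 object rather than in the package namespace. A short sketch: the first call mirrors the roxytest example at the end of this changeset, while the `self` usage is an assumption based only on the argument description above.

# Resolve a function by name from an allow-list, then call it.
mean_1_to_10 <- dr_func_eval("mean", v = c("mean", "sd"))(1:10)
# Inside an R6 method, a parser defined on the class could be resolved against
# the object itself (assumed usage, per the envir description):
# parsed <- dr_func_eval("read_qcsummarytsv", v = "read_qcsummarytsv", envir = self)(x)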
diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd
index 087a7ed..61aa7f8 100644
--- a/man/dr_gds_download.Rd
+++ b/man/dr_gds_download.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ica.R
+% Please edit documentation in R/fs_icav1.R
 \name{dr_gds_download}
 \alias{dr_gds_download}
 \title{dracarys GDS Download}
@@ -7,33 +7,49 @@
 dr_gds_download(
   gdsdir,
   outdir,
-  token,
-  page_size = 100,
+  token = Sys.getenv("ICA_ACCESS_TOKEN"),
   pattern = NULL,
+  page_size = 100,
   dryrun = FALSE,
   regexes = DR_FILE_REGEX,
-  recursive = NULL
+  recursive = NULL,
+  list_filter_fun = gds_list_files_filter_relevant
 )
 }
 \arguments{
 \item{gdsdir}{Full path to GDS directory.}

-\item{outdir}{Path to output directory.}
+\item{outdir}{Local output directory.}

 \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}

-\item{page_size}{Page size (def: 100).}
-
 \item{pattern}{Pattern to further filter the returned file type tibble.}

+\item{page_size}{Page size (def: 10 via ICA API).}
+
 \item{dryrun}{If TRUE, just list the files that will be downloaded (don't
 download them).}

-\item{regexes}{Tibble with regex and function name.}
+\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).}

 \item{recursive}{Should files be returned recursively \emph{in and under} the specified
-GDS directory (TRUE), or \emph{only directly in} the specified GDS directory (FALSE) (def: TRUE).}
+GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).}
+
+\item{list_filter_fun}{Function to filter relevant GDS files.}
 }
 \description{
 Download only GDS files that can be processed by dracarys.
 }
+\examples{
+\dontrun{
+gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
+outdir <- sub("gds:/", "~/icav1/g", gdsdir)
+regexes <- tibble::tribble(
+  ~regex, ~fun,
+  "multiqc_data\\\\.json$", "MultiqcJsonFile",
+  "-somatic\\\\.pcgr\\\\.json\\\\.gz$", "pcgrjson"
+)
+dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T)
+}
+
+}
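The new `list_filter_fun` argument makes the listing/filtering step of `dr_gds_download()` pluggable. A minimal sketch passing the default filter explicitly, under the signature documented above (the GDS/output paths reuse the example run and are otherwise assumptions):

regexes <- tibble::tribble(
  ~regex, ~fun,
  "multiqc_data\\.json$", "MultiqcJsonFile"
)
dr_gds_download(
  gdsdir = "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565",
  outdir = "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565",
  regexes = regexes,
  list_filter_fun = gds_list_files_filter_relevant,
  dryrun = TRUE # list only, don't download
)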
diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd
index 76f0339..674ed93 100644
--- a/man/dr_s3_download.Rd
+++ b/man/dr_s3_download.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/s3.R
+% Please edit documentation in R/fs_s3.R
 \name{dr_s3_download}
 \alias{dr_s3_download}
 \title{dracarys S3 Download}
@@ -7,34 +7,43 @@
 dr_s3_download(
   s3dir,
   outdir,
-  page_size = 100,
+  max_objects = 100,
   pattern = NULL,
   regexes = DR_FILE_REGEX,
-  dryrun = FALSE
+  dryrun = FALSE,
+  list_filter_fun = s3_list_files_filter_relevant
 )
 }
 \arguments{
-\item{s3dir}{Full path to S3 directory.}
+\item{s3dir}{S3 directory.}

 \item{outdir}{Path to output directory.}

-\item{page_size}{Page size (def: 100).}
+\item{max_objects}{Maximum objects returned.}

 \item{pattern}{Pattern to further filter the returned file type tibble.}

-\item{regexes}{Tibble with regex and function name.}
+\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.}

 \item{dryrun}{If TRUE, just list the files that will be downloaded (don't
 download them).}
+
+\item{list_filter_fun}{Function to filter relevant S3 files.}
 }
 \description{
 Download only S3 files that can be processed by dracarys.
 }
 \examples{
-s3dir <- file.path(
-  "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596",
-  "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067"
+\dontrun{
+p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
+p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220"
+s3dir <- file.path(p1, p2)
+regexes <- tibble::tribble(
+  ~regex, ~fun,
+  "multiqc_data\\\\.json$", "MultiqcJsonFile",
+  "pcgr.*\\\\.json\\\\.gz$", "pcgrjson"
 )
 outdir <- sub("s3:/", "~/s3", s3dir)
-
+dr_s3_download(s3dir = s3dir, outdir = outdir, max_objects = 500, regexes = regexes, dryrun = F)
+}
 }
diff --git a/man/gds_file_download_api.Rd b/man/gds_file_download_api.Rd
index 1397860..b10a438 100644
--- a/man/gds_file_download_api.Rd
+++ b/man/gds_file_download_api.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ica.R
+% Please edit documentation in R/fs_icav1.R
 \name{gds_file_download_api}
 \alias{gds_file_download_api}
 \title{GDS File Download via API}
diff --git a/man/gds_file_download.Rd b/man/gds_file_download_cli.Rd
similarity index 61%
rename from man/gds_file_download.Rd
rename to man/gds_file_download_cli.Rd
index 10ab7b5..f5946fb 100644
--- a/man/gds_file_download.Rd
+++ b/man/gds_file_download_cli.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ica.R
-\name{gds_file_download}
-\alias{gds_file_download}
+% Please edit documentation in R/fs_icav1.R
+\name{gds_file_download_cli}
+\alias{gds_file_download_cli}
 \title{GDS File Download via CLI}
 \usage{
-gds_file_download(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN"))
+gds_file_download_cli(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN"))
 }
 \arguments{
 \item{gds}{Full path to GDS file.}
diff --git a/man/gds_file_presignedurl.Rd b/man/gds_file_presignedurl.Rd
index 064adb5..841db4a 100644
--- a/man/gds_file_presignedurl.Rd
+++ b/man/gds_file_presignedurl.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ica.R
+% Please edit documentation in R/fs_icav1.R
 \name{gds_file_presignedurl}
 \alias{gds_file_presignedurl}
 \title{GDS File Presigned URL}
diff --git a/man/gds_files_list_filter_relevant.Rd b/man/gds_files_list_filter_relevant.Rd
deleted file mode 100644
index 784a393..0000000
--- a/man/gds_files_list_filter_relevant.Rd
+++ /dev/null
@@ -1,34 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ica.R
-\name{gds_files_list_filter_relevant}
-\alias{gds_files_list_filter_relevant}
-\title{List Relevant Files In GDS Directory}
-\usage{
-gds_files_list_filter_relevant(
-  gdsdir,
-  token,
-  pattern = NULL,
-  include_url = FALSE,
-  page_size = 100,
-  regexes = DR_FILE_REGEX
-)
-}
-\arguments{
-\item{gdsdir}{GDS directory.}
-
-\item{token}{ICA access token.}
-
-\item{pattern}{Pattern to further filter the returned file type tibble.}
-
-\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE).}
-
-\item{page_size}{Page size (def: 100).}
-
-\item{regexes}{Tibble with regex and function name.}
-}
-\value{
-A tibble with type, bname, size, file_id, path, and presigned URL.
-}
-\description{
-Lists relevant files in a GDS directory.
-}
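The renamed `gds_file_download_cli()` (see the hunk above) keeps the same three-argument signature. A minimal single-file sketch; the GDS object and output path are hypothetical:

# Download one GDS file via the ICA CLI wrapper.
gds_file_download_cli(
  gds = "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565/multiqc_data.json",
  out = "~/icav1/multiqc_data.json",
  token = Sys.getenv("ICA_ACCESS_TOKEN")
)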
diff --git a/man/gds_files_list.Rd b/man/gds_list_files_dir.Rd
similarity index 59%
rename from man/gds_files_list.Rd
rename to man/gds_list_files_dir.Rd
index 30e3bd9..d60b546 100644
--- a/man/gds_files_list.Rd
+++ b/man/gds_list_files_dir.Rd
@@ -1,12 +1,12 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ica.R
-\name{gds_files_list}
-\alias{gds_files_list}
-\title{GDS Files List}
+% Please edit documentation in R/fs_icav1.R
+\name{gds_list_files_dir}
+\alias{gds_list_files_dir}
+\title{List Files in ICAv1 GDS Directory}
 \usage{
-gds_files_list(
+gds_list_files_dir(
   gdsdir,
-  token,
+  token = Sys.getenv("ICA_ACCESS_TOKEN"),
   page_size = NULL,
   include_url = FALSE,
   no_recurse = TRUE,
@@ -19,9 +19,9 @@

 \item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}

-\item{page_size}{Page size (def: 10).}
+\item{page_size}{Page size (def: 10 via ICA API).}

-\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE).}
+\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).}

 \item{no_recurse}{Do not recurse through the file list i.e. just give the first items
 without recursing further down the list using .}
@@ -29,34 +29,27 @@ without recursing further down the list using .}

 \item{page_token}{Page token (def: NULL). Used internally for recursion.}

 \item{recursive}{Should files be returned recursively \emph{in and under} the specified
-GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE).}
+GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).}
 }
 \value{
-Tibble with file basename, file size, file full data path, file dir name.
+A tibble with file ID, basename, size, last modified timestamp,
+full GDS path, and presigned URL if requested.
 }
 \description{
-List files on ICA GDS filesystem.
+Lists files in a GDS directory.
 }
 \examples{
 \dontrun{
-gdsdir <- file.path(
-  "gds://production/primary_data",
-  "240322_A00130_0290_BH5HLLDSXC/20240323f56ec5a5/WGS_TsqNano"
-)
-gdsdir <- file.path(
-  "gds://bssh.acddbfda498038ed99fa94fe79523959/Runs",
-  "240322_A00130_0290_BH5HLLDSXC_r.3TbcOsEKZUyetygkqIOXcg/InterOp"
-)
 gdsdir <- file.path(
   "gds://production/analysis_data/SBJ00699/umccrise",
   "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage"
 )
 token <- ica_token_validate()
 page_size <- 11
-include_url <- TRUE
+include_url <- F
 page_token <- NULL
 no_recurse <- TRUE
 recursive <- NULL
-gds_files_list(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive)
+gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive)
 }
 }
diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd
new file mode 100644
index 0000000..1e280b4
--- /dev/null
+++ b/man/gds_list_files_filter_relevant.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fs_icav1.R
+\name{gds_list_files_filter_relevant}
+\alias{gds_list_files_filter_relevant}
+\title{List Relevant Files In ICAv1 GDS Directory}
+\usage{
+gds_list_files_filter_relevant(
+  gdsdir,
+  pattern = NULL,
+  regexes = DR_FILE_REGEX,
+  token = Sys.getenv("ICA_ACCESS_TOKEN"),
+  page_size = 100,
+  include_url = FALSE,
+  no_recurse = TRUE,
+  page_token = NULL,
+  recursive = NULL
+)
+}
+\arguments{
+\item{gdsdir}{Full path to GDS directory.}
+
+\item{pattern}{Pattern to further filter the returned file type tibble.}
+
+\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).}
+
+\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).}
+
+\item{page_size}{Page size (def: 10 via ICA API).}
+
+\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).}
+
+\item{no_recurse}{Do not recurse through the file list i.e. just give the first items
+without recursing further down the list using .}
+
+\item{page_token}{Page token (def: NULL). Used internally for recursion.}
+
+\item{recursive}{Should files be returned recursively \emph{in and under} the specified
+GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).}
+}
+\value{
+A tibble with file type, basename, size, last modified timestamp, file_id, full path,
+and presigned URL if requested.
+}
+\description{
+Lists relevant files in a GDS directory.
+}
+\examples{
+\dontrun{
+regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile")
+gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
+gds_list_files_filter_relevant(gdsdir)
+}
+}
diff --git a/man/grep_file.Rd b/man/grep_file.Rd
new file mode 100644
index 0000000..d0c588e
--- /dev/null
+++ b/man/grep_file.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{grep_file}
+\alias{grep_file}
+\title{Grep File Pattern}
+\usage{
+grep_file(path = ".", regexp)
+}
+\arguments{
+\item{path}{Path to look for file.}
+
+\item{regexp}{A regular expression (e.g. \code{[.]csv$}) passed on to \code{grep()} to filter paths.}
+}
+\value{
+The path to the file or an empty string if no match is found.
+}
+\description{
+Grep File Pattern
+}
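`grep_file()` above ships without an example; a minimal sketch using only the documented `path`/`regexp` arguments (the pattern is hypothetical):

# Returns the matching path under the current directory, or "" if nothing matches.
qc_path <- grep_file(path = ".", regexp = "-qc_summary\\.tsv\\.gz$")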
diff --git a/man/local_list_files_dir.Rd b/man/local_list_files_dir.Rd
new file mode 100644
index 0000000..d77ca63
--- /dev/null
+++ b/man/local_list_files_dir.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fs_local.R
+\name{local_list_files_dir}
+\alias{local_list_files_dir}
+\title{List Files in Local Directory}
+\usage{
+local_list_files_dir(localdir, max_files = NULL)
+}
+\arguments{
+\item{localdir}{Path to local directory.}
+
+\item{max_files}{Max files returned.}
+}
+\value{
+A tibble with file basename, size, last modification timestamp
+and full path.
+}
+\description{
+Lists files in a local directory.
+}
+\examples{
+localdir <- system.file("R", package = "dracarys")
+x <- local_list_files_dir(localdir)
+}
diff --git a/man/local_list_files_filter_relevant.Rd b/man/local_list_files_filter_relevant.Rd
new file mode 100644
index 0000000..ac62065
--- /dev/null
+++ b/man/local_list_files_filter_relevant.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fs_local.R
+\name{local_list_files_filter_relevant}
+\alias{local_list_files_filter_relevant}
+\title{List Relevant Files In Local Directory}
+\usage{
+local_list_files_filter_relevant(
+  localdir,
+  regexes = DR_FILE_REGEX,
+  max_files = NULL
+)
+}
+\arguments{
+\item{localdir}{Path to local directory.}
+
+\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).}
+
+\item{max_files}{Max files returned.}
+}
+\value{
+A tibble with file type, basename, size, last modified timestamp, and
+path.
+}
+\description{
+Lists relevant files in a local directory.
+}
+\examples{
+localdir <- system.file("extdata/tso", package = "dracarys")
+regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcFile")
+x <- local_list_files_filter_relevant(localdir, regexes)
+}
diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd
new file mode 100644
index 0000000..79eb7d6
--- /dev/null
+++ b/man/s3_file_presignedurl.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fs_s3.R
+\name{s3_file_presignedurl}
+\alias{s3_file_presignedurl}
+\title{S3 Generate Presigned URL}
+\usage{
+s3_file_presignedurl(client, s3path, expiry_seconds = 3600)
+}
+\arguments{
+\item{client}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).}
+
+\item{s3path}{Full path to S3 object.}
+
+\item{expiry_seconds}{Number of seconds the presigned URL is valid for (3600 = 1 hour).}
+}
+\value{
+An S3 presigned URL.
+}
+\description{
+S3 Generate Presigned URL
+}
+\examples{
+\dontrun{
+client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4"))
+s3path <- "s3://bucket1/path/to/file.tsv"
+s3_file_presignedurl(client, s3path)
+}
+
+}
diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd
deleted file mode 100644
index 7fc45f3..0000000
--- a/man/s3_files_list_filter_relevant.Rd
+++ /dev/null
@@ -1,40 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/s3.R
-\name{s3_files_list_filter_relevant}
-\alias{s3_files_list_filter_relevant}
-\title{List Relevant Files In AWS S3 Directory}
-\usage{
-s3_files_list_filter_relevant(
-  s3dir,
-  pattern = NULL,
-  page_size = 1000,
-  max_items = 1000,
-  presign = FALSE,
-  expiry_sec = 43200
-)
-}
-\arguments{
-\item{s3dir}{S3 directory.}
-
-\item{pattern}{Pattern to further filter the returned file type tibble.}
-
-\item{page_size}{The size of each page to get in the AWS service call (def: 1000).}
-
-\item{max_items}{The total number of items to return in the command’s output (def: 1000).}
-
-\item{presign}{Include presigned URLs (def: FALSE).}
-
-\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).}
-}
-\value{
-A tibble with path, date, file size, file type, and presigned URL if requested.
-}
-\description{
-Lists relevant files in an AWS S3 directory.
-}
-\examples{
-\dontrun{
-s3dir <- "s3://umccr-primary-data-prod/cancer_report_tables"
-s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE)
-}
-}
diff --git a/man/s3_list_files_dir.Rd b/man/s3_list_files_dir.Rd
new file mode 100644
index 0000000..820da1d
--- /dev/null
+++ b/man/s3_list_files_dir.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fs_s3.R
+\name{s3_list_files_dir}
+\alias{s3_list_files_dir}
+\title{List Objects in AWS S3 Directory}
+\usage{
+s3_list_files_dir(s3dir, max_objects = 1000)
+}
+\arguments{
+\item{s3dir}{S3 directory.}
+
+\item{max_objects}{Maximum objects returned.}
+}
+\value{
+A tibble with object basename, size, last modified timestamp, and
+full S3 path.
+}
+\description{
+Returns some or all (up to 1,000) of the objects in an S3 directory.
+}
+\examples{
+\dontrun{
+p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
+p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220"
+s3dir <- file.path(p1, p2, "cancer_report/cancer_report_tables")
+s3_list_files_dir(s3dir, max_objects = 15)
+}
+}
diff --git a/man/s3_list_files_filter_relevant.Rd b/man/s3_list_files_filter_relevant.Rd
new file mode 100644
index 0000000..1716825
--- /dev/null
+++ b/man/s3_list_files_filter_relevant.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fs_s3.R
+\name{s3_list_files_filter_relevant}
+\alias{s3_list_files_filter_relevant}
+\title{List Relevant Files In AWS S3 Directory}
+\usage{
+s3_list_files_filter_relevant(
+  s3dir,
+  pattern = NULL,
+  regexes = DR_FILE_REGEX,
+  max_objects = 100,
+  presign = FALSE,
+  expiry_sec = 3600
+)
+}
+\arguments{
+\item{s3dir}{S3 directory.}
+
+\item{pattern}{Pattern to further filter the returned file type tibble.}
+
+\item{regexes}{Tibble with \code{regex} and \code{fun}ction name.}
+
+\item{max_objects}{Maximum objects returned.}
+
+\item{presign}{Include presigned URLs (def: FALSE).}
+
+\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated).}
+}
+\value{
+A tibble with file type, basename, size, last modified timestamp,
+full path, and presigned URL if requested.
+}
+\description{
+Lists relevant files in an AWS S3 directory.
+}
+\examples{
+\dontrun{
+p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
+p2 <- "20240707becde493/L2401018_L2401017/SBJ05373_MDX240220"
+s3dir <- file.path(p1, p2)
+regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile")
+s3_list_files_filter_relevant(s3dir = s3dir, regexes = regexes, max_objects = 300)
+}
+}
diff --git a/man/s3_search.Rd b/man/s3_search.Rd
index c0d9f64..c4a1db2 100644
--- a/man/s3_search.Rd
+++ b/man/s3_search.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/s3.R
+% Please edit documentation in R/fs_s3.R
 \name{s3_search}
 \alias{s3_search}
 \title{Search AWS S3 Objects}
diff --git a/man/tidy_files.Rd b/man/tidy_files.Rd
new file mode 100644
index 0000000..8d1849b
--- /dev/null
+++ b/man/tidy_files.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tidy.R
+\name{tidy_files}
+\alias{tidy_files}
+\title{Tidy Files}
+\usage{
+tidy_files(x, envir = parent.frame())
+}
+\arguments{
+\item{x}{Tibble with \code{localpath} to file and the function \code{type} to parse it.}
+
+\item{envir}{The environment in which to evaluate the function, e.g. use \code{self}
+when calling from inside R6 classes.}
+}
+\value{
+Tibble with parsed data in a \code{data} list-column.
+}
+\description{
+Tidy Files
+}
+\examples{
+\dontrun{
+p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
+p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz"
+p <- file.path(p1, p2)
+x <- tibble::tibble(type = "readr::read_tsv", localpath = p)
+tidy_files(x)
+}
+
+}
diff --git a/man/write_dracarys_list_of_tbls.Rd b/man/write_dracarys_list_of_tbls.Rd
new file mode 100644
index 0000000..e6aa8be
--- /dev/null
+++ b/man/write_dracarys_list_of_tbls.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{write_dracarys_list_of_tbls}
+\alias{write_dracarys_list_of_tbls}
+\title{Write List of Tidy Tibbles}
+\usage{
+write_dracarys_list_of_tbls(
+  list_of_tbls,
+  out_dir = NULL,
+  prefix = NULL,
+  out_format = "tsv",
+  drid = NULL
+)
+}
+\arguments{
+\item{list_of_tbls}{List of tidy tibbles.}
+
+\item{out_dir}{Output directory.}
+
+\item{prefix}{Prefix of output file(s).}
+
+\item{out_format}{Format of output file(s).}
+
+\item{drid}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).}
+}
+\value{
+Tibble with nested objects that have been written to the output directory.
+}
+\description{
+Write List of Tidy Tibbles
+}
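`write_dracarys_list_of_tbls()` above has no example. A minimal sketch under the documented signature; the tibbles, output directory and on-disk file naming are assumptions for illustration, not verified package behaviour:

# Each list element is written under out_dir using the given prefix and out_format.
tbls <- list(
  qcsum = tibble::tibble(SubjectID = "SBJ03043", purity = 0.78),
  hrdetect = tibble::tibble(SubjectID = "SBJ03043", Probability = 0.02)
)
write_dracarys_list_of_tbls(
  list_of_tbls = tbls,
  out_dir = "~/dracarys_tidy",
  prefix = "SBJ03043_PRJ230004",
  out_format = "tsv"
)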
diff --git a/tests/testthat/test-roxytest-testexamples-fs_local.R b/tests/testthat/test-roxytest-testexamples-fs_local.R
new file mode 100644
index 0000000..764c4aa
--- /dev/null
+++ b/tests/testthat/test-roxytest-testexamples-fs_local.R
@@ -0,0 +1,20 @@
+# Generated by roxytest: do not edit by hand!
+
+# File R/fs_local.R: @testexamples
+
+test_that("Function local_list_files_dir() @ L16", {
+
+  localdir <- system.file("R", package = "dracarys")
+  x <- local_list_files_dir(localdir)
+  expect_equal(names(x), c("bname", "size", "lastmodified", "path"))
+})
+
+
+test_that("Function local_list_files_filter_relevant() @ L47", {
+
+  localdir <- system.file("extdata/tso", package = "dracarys")
+  regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcFile")
+  x <- local_list_files_filter_relevant(localdir, regexes)
+  expect_equal(nrow(x), 1)
+})
+
diff --git a/tests/testthat/test-roxytest-testexamples-regex.R b/tests/testthat/test-roxytest-testexamples-regex.R
index 91a7f26..05787ff 100644
--- a/tests/testthat/test-roxytest-testexamples-regex.R
+++ b/tests/testthat/test-roxytest-testexamples-regex.R
@@ -2,7 +2,7 @@

 # File R/regex.R: @testexamples

-test_that("Function dr_func_eval() @ L97", {
+test_that("Function dr_func_eval() @ L99", {

   mean_1_to_10 <- dr_func_eval("mean", v = c("mean", "sd"))(1:10)
   x <- system.file("extdata/tso/sample705.fragment_length_hist.json.gz", package = "dracarys")