diff --git a/.Rbuildignore b/.Rbuildignore index fcacee5..fbcf8b7 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -23,6 +23,7 @@ ^setup\.cfg$ ^setup\.py$ ^vignettes$ +inst/reports/wgts-qc/nogit inst/rmd/umccr_portal/html inst/rmd/umccr_workflows/alignment_qc/nogit inst/rmd/umccr_workflows/bcl_convert/html diff --git a/DESCRIPTION b/DESCRIPTION index fb57955..c2bc5cd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,8 +31,6 @@ Imports: ggrepel, glue, here, - httr, - jose, jsonlite, knitr, lubridate, diff --git a/NAMESPACE b/NAMESPACE index 0463659..3628f56 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,7 +18,6 @@ export(bcftools_parse_vcf) export(bcftools_parse_vcf_regions) export(date_log) export(dr_func_eval) -export(dr_gds_download) export(dr_output_format_valid) export(dr_s3_download) export(dragen_cnv_metrics_read) @@ -35,15 +34,7 @@ export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) export(empty_tbl) export(file_regex_getter) -export(gds_file_download_api) -export(gds_file_download_cli) -export(gds_file_presignedurl) -export(gds_files_list_fastq) -export(gds_list_files_dir) -export(gds_list_files_filter_relevant) -export(gds_volumes_list) export(grep_file) -export(ica_token_validate) export(local_list_files_dir) export(local_list_files_filter_relevant) export(match_regex) diff --git a/R/Wf.R b/R/Wf.R index b6ff9c3..863086c 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -5,7 +5,7 @@ #' #' A workflow has: #' -#' - a directory path with all the raw output files (either on GDS, S3 or +#' - a directory path with all the raw output files (either on S3 or #' local filesystem) #' - a subset of files that are of interest for ingestion #' - tibble with full path and basename columns @@ -33,20 +33,6 @@ #' um1$list_files(max_files = 10) #' um1$list_files_filter_relevant(max_files = 10) #' -#' #---- GDS ----# -#' p1_gds <- "gds://production/analysis_data" -#' p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") -#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) -#' um2$list_files(max_files = 10) -#' um2$list_files_filter_relevant(ica_token = token, max_files = 500) -#' d <- um2$download_files( -#' outdir = outdir, ica_token = token, -#' max_files = 1000, dryrun = T -#' ) -#' d_tidy <- um2$tidy_files(d) -#' #' #---- S3 ----# #' p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" #' p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" @@ -110,7 +96,6 @@ Wf <- R6::R6Class( private$.path <- path private$.wname <- wname private$.filesystem <- dplyr::case_when( - grepl("^gds://", path) ~ "gds", grepl("^s3://", path) ~ "s3", .default = "local" ) @@ -135,16 +120,9 @@ Wf <- R6::R6Class( }, #' @description List all files under given path. #' @param path Path with raw results. - #' @param max_files Max number of files to list (for gds/s3 only). - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param ... Passed on to `gds_list_files_dir` function. - list_files = function(path = private$.path, max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - if (private$.filesystem == "gds") { - d <- gds_list_files_dir( - gdsdir = path, token = ica_token, page_size = max_files, ... - ) - } else if (private$.filesystem == "s3") { + #' @param max_files Max number of files to list. 
+ list_files = function(path = private$.path, max_files = 1000) { + if (private$.filesystem == "s3") { d <- s3_list_files_dir(s3dir = path, max_objects = max_files) } else { d <- local_list_files_dir(localdir = path, max_files = max_files) @@ -153,19 +131,12 @@ Wf <- R6::R6Class( }, #' @description List dracarys files under given path #' @param path Path with raw results. - #' @param max_files Max number of files to list (for gds/s3 only). - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param ... Passed on to the `gds_list_files_filter_relevant` or - #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(path = private$.path, max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + #' @param max_files Max number of files to list. + #' @param ... Passed on to `s3_list_files_filter_relevant`. + list_files_filter_relevant = function(path = private$.path, max_files = 1000, ...) { regexes <- private$.regexes assertthat::assert_that(!is.null(regexes)) - if (private$.filesystem == "gds") { - d <- gds_list_files_filter_relevant( - gdsdir = path, regexes = regexes, token = ica_token, page_size = max_files, ... - ) - } else if (private$.filesystem == "s3") { + if (private$.filesystem == "s3") { d <- s3_list_files_filter_relevant( s3dir = path, regexes = regexes, max_objects = max_files, ... ) @@ -185,29 +156,16 @@ Wf <- R6::R6Class( data = list(tibble::tibble(input_path = x)) ) }, - #' @description Download files from GDS/S3 to local filesystem. + #' @description Download files from S3 to local filesystem. #' @param path Path with raw results. #' @param outdir Path to output directory. - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param max_files Max number of files to list. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). - #' @param recursive Should files be returned recursively _in and under_ the specified - #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). 
- download_files = function(path = private$.path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL) { + download_files = function(path = private$.path, outdir, max_files = 1000, dryrun = FALSE) { regexes <- private$.regexes assertthat::assert_that(!is.null(regexes)) - if (private$.filesystem == "gds") { - d <- dr_gds_download( - gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, - page_size = max_files, dryrun = dryrun, recursive = recursive - ) - if (!dryrun) { - private$.filesystem <- "local" - private$.path <- outdir - } - } else if (private$.filesystem == "s3") { + if (private$.filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, max_objects = max_files, dryrun = dryrun diff --git a/R/dragen.R b/R/dragen.R index 4cba49b..6bb9bea 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -902,24 +902,6 @@ dtw_Wf_dragen <- function(path, prefix, outdir, #' prefix = prefix, #' format = "tsv" #' ) -#' #---- GDS ----# -#' prefix <- "PRJ222358" -#' p <- file.path( -#' "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", -#' "20241108fc293a38/L2201805_L2201797_dragen_somatic" -#' ) -#' outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case -#' d1 <- Wf_dragen$new(path = p, prefix = prefix) -#' d1$list_files(max_files = 100) -#' d1$list_files_filter_relevant(max_files = 300) -#' d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) -#' d_tidy <- d1$tidy_files(d) -#' d_write <- d1$write( -#' d_tidy, -#' outdir = file.path(p, "dracarys_tidy"), -#' prefix = prefix, -#' format = "tsv" -#' ) #' } #' @export Wf_dragen <- R6::R6Class( diff --git a/R/fs_icav1.R b/R/fs_icav1.R deleted file mode 100644 index 803aa7d..0000000 --- a/R/fs_icav1.R +++ /dev/null @@ -1,279 +0,0 @@ -#' List Files in ICAv1 GDS Directory -#' -#' Lists files in a GDS directory. -#' -#' @param gdsdir Full path to GDS directory. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10 via ICA API). -#' @param include_url Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API). -#' @param page_token Page token (def: NULL). Used internally for recursion. -#' @param no_recurse Do not recurse through the file list i.e. just give the first items -#' without recursing further down the list using . -#' @param recursive Should files be returned recursively _in and under_ the specified -#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). -#' -#' @return A tibble with file ID, basename, size, last modified timestamp, -#' full GDS path, and presigned URL if requested. 
-#' @examples -#' \dontrun{ -#' gdsdir <- file.path( -#' "gds://production/analysis_data/SBJ00699/umccrise", -#' "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" -#' ) -#' token <- ica_token_validate() -#' page_size <- 11 -#' include_url <- F -#' page_token <- NULL -#' no_recurse <- TRUE -#' recursive <- NULL -#' gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) -#' } -#' @export -gds_list_files_dir <- function(gdsdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), page_size = NULL, - include_url = FALSE, no_recurse = TRUE, page_token = NULL, - recursive = NULL) { - assertthat::assert_that(is.logical(no_recurse), is.logical(include_url)) - assertthat::assert_that(is.null(recursive) || is.logical(recursive)) - token <- ica_token_validate(token) - assertthat::assert_that(grepl("^gds://", gdsdir)) - gdsdir_original <- gdsdir - if (!grepl("/$", gdsdir)) { - gdsdir <- glue("{gdsdir}/") - } - base_url <- "https://aps2.platform.illumina.com/v1" - volname <- sub("gds://(.*?)/.*", "\\1", gdsdir) - path2 <- sub("gds://(.*?)/(.*)", "\\2", gdsdir) - page_size <- ifelse(is.null(page_size), "", glue("&pageSize={page_size}")) - query_url <- glue("{base_url}/files?volume.name={volname}&path=/{path2}*{page_size}") - if (include_url) { - query_url <- glue("{query_url}&include=PresignedUrl") - } - if (!is.null(page_token)) { - query_url <- glue("{query_url}&pageToken={page_token}") - } - if (!is.null(recursive)) { - # without specifying recursive, it's true by default - recursive <- ifelse(recursive, "true", "false") - query_url <- glue("{query_url}&recursive={recursive}") - } - query_res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = query_res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - if (j[["itemCount"]] == 0) { - if (gds_likely_file(gdsdir_original)) { - cli::cli_abort("{date_log()} ERROR: Is the input directory a file perhaps?\n{.file {gdsdir_original}}") - } - # if there is a nextPageToken then abort, else continue - if (!is.null(j[["nextPageToken"]])) { - msg <- paste0( - "{date_log()} ERROR: ", - "No GDS files listed in the input directory. Please confirm you can ", - "access the following GDS input directory with your token: ", - "{.file {gdsdir_original}}" - ) - cli::cli_abort(msg) - } - } # endif - d <- j[["items"]] |> - purrr::map(\(x) c( - file_id = x[["id"]], - path = x[["path"]], - size = x[["sizeInBytes"]], - lastmodified = x[["timeModified"]], - presigned_url = x[["presignedUrl"]] - )) |> - dplyr::bind_rows() - if (nrow(d) == 0) { - # We've iterated through all available items, and the next page has 0 items. - # So dplyr::bind_rows(d, NULL) will return d. - return(NULL) - } - res <- d |> - dplyr::mutate( - size = fs::as_fs_bytes(.data$size), - bname = basename(.data$path), - path = glue("gds://{volname}{.data$path}") - ) |> - dplyr::select(dplyr::any_of(c("bname", "size", "lastmodified", "file_id", "path", "presigned_url"))) - - if (!is.null(j[["nextPageToken"]]) && !no_recurse) { - res2 <- gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = NULL, - include_url = include_url, no_recurse = FALSE, page_token = j[["nextPageToken"]], - recursive = NULL - ) - res <- dplyr::bind_rows(res, res2) - } - res -} - -#' List Relevant Files In ICAv1 GDS Directory -#' -#' Lists relevant files in a GDS directory. 
-#' -#' @inheritParams gds_list_files_dir -#' @param pattern Pattern to further filter the returned file type tibble. -#' @param regexes Tibble with `regex` and `fun`ction name (see example). -#' @return A tibble with file type, basename, size, last modified timestamp, file_id, full path, -#' and presigned URL if requested. -#' @examples -#' \dontrun{ -#' regexes <- tibble::tibble(regex = "multiqc_data\\.json$", fun = "MultiqcJsonFile") -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' gds_list_files_filter_relevant(gdsdir) -#' } -#' @export -gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_FILE_REGEX, - token = Sys.getenv("ICA_ACCESS_TOKEN"), - page_size = 100, include_url = FALSE, - no_recurse = TRUE, page_token = NULL, - recursive = NULL) { - pattern <- pattern %||% ".*" # keep all recognisable files by default - assertthat::assert_that(all(c("regex", "fun") %in% colnames(regexes))) - cols_sel <- c("type", "bname", "size", "lastmodified", "file_id", "path", "presigned_url") - d <- dracarys::gds_list_files_dir( - gdsdir = gdsdir, token = token, page_size = page_size, include_url = include_url, - no_recurse = no_recurse, page_token = page_token, recursive = recursive - ) |> - dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes))) |> - dplyr::ungroup() |> - dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(dplyr::any_of(cols_sel)) - d -} - -#' dracarys GDS Download -#' -#' Download only GDS files that can be processed by dracarys. -#' -#' @inheritParams gds_list_files_dir -#' @inheritParams gds_list_files_filter_relevant -#' @param outdir Local output directory. -#' @param dryrun If TRUE, just list the files that will be downloaded (don't -#' download them). -#' @examples -#' \dontrun{ -#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -#' outdir <- sub("gds:/", "~/icav1/g", gdsdir) -#' regexes <- tibble::tribble( -#' ~regex, ~fun, -#' "multiqc_data\\.json$", "MultiqcJsonFile", -#' "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" -#' ) -#' dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) -#' } -#' -#' @export -dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL) { - e <- emojifont::emoji - fs::dir_create(outdir) - d <- gds_list_files_filter_relevant( - gdsdir = gdsdir, pattern = pattern, regexes = regexes, - token = token, page_size = page_size, include_url = FALSE, - no_recurse = FALSE, page_token = NULL, - recursive = recursive - ) - msg <- glue( - "GDS input path is: {gdsdir}", - "\nNo relevant files found under there.", - "\nPlease check that path with `ica files list`, and try to adjust page size." 
- ) - assertthat::assert_that(nrow(d) > 0, msg = msg) - d <- d |> - dplyr::mutate( - gdspath_minus_gdsdir = sub(glue("{gdsdir}/"), "", .data$path), - gdspath_minus_gdsdir_outdir = file.path(outdir, dirname(.data$gdspath_minus_gdsdir)) |> - fs::dir_create() |> - normalizePath(), - localpath = file.path(.data$gdspath_minus_gdsdir_outdir, .data$bname), - gdspath = .data$path - ) |> - dplyr::select("type", "bname", "size", "lastmodified", "file_id", "localpath", "gdspath") - tot_size <- d |> - dplyr::summarise(tot_size = sum(.data$size)) |> - dplyr::pull(tot_size) - # download recognisable dracarys files to outdir//{bname} - if (!dryrun) { - txt <- paste0( - "{e('arrow_heading_down')} {nrow(d)} files ({tot_size}): {.file {gdsdir}}\n" - ) - cli::cli_alert_info(txt) - res <- d |> - dplyr::rowwise() |> - dplyr::mutate( - dl = gds_file_download_api( - gds_fileid = .data$file_id, out_file = .data$localpath, token = token - ), - localpath = normalizePath(.data$localpath) - ) |> - dplyr::ungroup() |> - dplyr::select("type", "bname", "size", "lastmodified", "localpath", "gdspath", "file_id") - return(res) - } else { - cli::cli_alert_info("{date_log()} {e('camera')} Just list relevant files from {.file {gdsdir}}") - d |> - dplyr::select("type", "bname", "size", "lastmodified", "gdspath", "file_id", localpath2be = "localpath") |> - as.data.frame() |> - print() - } -} - -#' GDS File Presigned URL -#' -#' Returns presigned URL of given GDS file. -#' -#' @param gds_fileid GDS file ID. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Presigned URL if valid. -#' @export -gds_file_presignedurl <- function(gds_fileid, token) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - url <- glue("{base_url}/files/{gds_fileid}") - res <- httr::GET( - url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - presigned_url <- jsonlite::fromJSON(httr::content(x = res, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)[["presignedUrl"]] - assertthat::assert_that(grepl("^https://stratus-gds-aps2.s3.ap-southeast-2.amazonaws.com", presigned_url)) - presigned_url -} - -#' GDS File Download via API -#' -#' @param gds_fileid GDS file ID. -#' @param out_file Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' -#' @examples -#' \dontrun{ -#' gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -#' out <- tempfile() -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' } -#' @export -gds_file_download_api <- function(gds_fileid, out_file, token) { - presigned_url <- gds_file_presignedurl(gds_fileid, token) - # keep quiet instead of logging presigned urls - status_code <- utils::download.file(url = presigned_url, destfile = out_file, quiet = TRUE) - assertthat::assert_that(status_code == 0) - normalizePath(out_file) -} - -#' GDS File Download via CLI -#' -#' @param gds Full path to GDS file. -#' @param out Path to output file. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @export -gds_file_download_cli <- function(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) { - token <- ica_token_validate(token) - system(glue("ica files download {gds} {out} --access-token {token}")) -} diff --git a/R/ica.R b/R/ica.R deleted file mode 100644 index 951095b..0000000 --- a/R/ica.R +++ /dev/null @@ -1,108 +0,0 @@ -#' List FASTQs In GDS Directory -#' -#' @param gdsdir GDS directory. -#' @param token ICA access token. 
-#' @param include_url Include presigned URLs to all files within the GDS directory. -#' @param page_size Page size. -#' -#' @return A tibble with type, bname, size, file_id, path, and presigned URL. -#' -#' @examples -#' \dontrun{ -#' prim <- "gds://production/primary_data" -#' run <- "240719_A00130_0323_BHMCYHDSXC/202407205bad380d/BiModal_BM-5L" -#' gdsdir <- file.path(prim, run) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' include_url <- F -#' page_size <- 100 -#' gds_files_list_fastq(gdsdir, token, include_url, page_size) -#' } -#' @export -gds_files_list_fastq <- function(gdsdir, token, include_url = FALSE, page_size = 100) { - fq_regex <- tibble::tribble( - ~regex, ~fun, - "fastq\\.gz$", "FASTQ" - ) - g <- gds_list_files_filter_relevant( - gdsdir = gdsdir, pattern = NULL, regexes = fq_regex, - token = token, page_size = page_size, include_url = include_url - ) - g |> - dplyr::mutate( - size_chr = as.character(.data$size), - size_num = as.numeric(.data$size) - ) |> - dplyr::select( - "type", "bname", "size", "lastmodified", "size_chr", "size_num", "file_id", "path" - ) -} - -#' List GDS Volumes -#' -#' Lists GDS volumes accessible by the provided ICA token. -#' -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @param page_size Page size (def: 10). -#' -#' @return A tibble with vol name and vol id. -#' @export -gds_volumes_list <- function(token, page_size = 10) { - token <- ica_token_validate(token) - base_url <- "https://aps2.platform.illumina.com/v1" - query_url <- glue("{base_url}/volumes?pageSize={page_size}") - - res <- httr::GET( - query_url, - httr::add_headers(Authorization = glue("Bearer {token}")), - httr::accept_json() - ) - j <- jsonlite::fromJSON(httr::content(x = res, type = "text", encoding = "UTF-8"), simplifyVector = FALSE) - purrr::map_df(j[["items"]], function(x) c(name = x[["name"]], id = x[["id"]])) -} - - -#' Validate ICA access token -#' -#' Validates ICA access token by parsing it and checking its expiration date. -#' @param token ICA access token (def: $ICA_ACCESS_TOKEN env var). -#' @return Returns the token if valid, or else errors out. -#' @export -ica_token_validate <- function(token = Sys.getenv("ICA_ACCESS_TOKEN")) { - # https://github.com/r-lib/jose/blob/429a46/R/jwt.R#L171 - .ica_token_check_expiration_time <- function(payload) { - if (length(payload$exp)) { - stopifnot("exp claim is a number" = is.numeric(payload$exp)) - expdate <- structure(payload$exp, class = c("POSIXct", "POSIXt")) - if (expdate < (Sys.time() - 60)) { - stop(paste("Token has expired on", expdate), call. = FALSE) - } - } - if (length(payload$nbf)) { - stopifnot("nbf claim is a number" = is.numeric(payload$nbf)) - nbfdate <- structure(payload$nbf, class = c("POSIXct", "POSIXt")) - if (nbfdate > (Sys.time() + 60)) { - stop(paste("Token is not valid before", nbfdate), call. 
= FALSE) - } - } - } - # giving a friendlier error msg in case this isn't even valid jwt - tmp <- strsplit(token, ".", fixed = TRUE)[[1]] - msg <- "The input token is not a valid JWT" - assertthat::assert_that(length(tmp) %in% c(2, 3), msg = msg) - l <- jose::jwt_split(token) - .ica_token_check_expiration_time(l[["payload"]]) - token -} - -ica_token_exp <- function(token = Sys.getenv("ICA_ACCESS_TOKEN")) { - l <- jose::jwt_split(token) - structure(l$payload$exp, class = c("POSIXct", "POSIXt")) -} - -gds_likely_file <- function(x) { - e <- c( - "txt", "tsv", "csv", "html", "json", "stdout", "stderr", "stdouterr", - "log", "vcf", "gz", "bam", "bai" - ) - tolower(tools::file_ext(x)) %in% e -} diff --git a/R/sash.R b/R/sash.R index 883d659..b326a15 100644 --- a/R/sash.R +++ b/R/sash.R @@ -75,7 +75,7 @@ Wf_sash <- R6::R6Class( SampleID_tumor = NULL, SampleID_normal = NULL, #' @description Create a new Wf_sash object. - #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. @@ -229,7 +229,7 @@ Wf_sash <- R6::R6Class( #' #' Downloads files from the `sash` workflow and writes them in a tidy format. #' -#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. @@ -237,7 +237,6 @@ Wf_sash <- R6::R6Class( #' @param outdir Path to output directory. #' @param format Format of output files. #' @param max_files Max number of files to list. -#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @param regexes Tibble with file `regex` and `fun`ction to parse it. Use only @@ -253,7 +252,6 @@ Wf_sash <- R6::R6Class( #' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") #' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' d <- Wf_sash_download_tidy_write( #' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, #' outdir = outdir, @@ -263,7 +261,6 @@ Wf_sash <- R6::R6Class( #' @export Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, SampleID_normal, outdir, format = "rds", max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), regexes = NULL, dryrun = FALSE) { s <- Wf_sash$new( path = path, SubjectID = SubjectID, @@ -273,7 +270,7 @@ Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, SampleI s$regexes <- regexes } d_dl <- s$download_files( - outdir = outdir, ica_token = ica_token, + outdir = outdir, max_files = max_files, dryrun = dryrun ) if (!dryrun) { diff --git a/R/tidy.R b/R/tidy.R index cb5e2f0..0e1d07e 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -64,13 +64,12 @@ tidy_files <- function(x, envir = parent.frame()) { #' Tidies UMCCR workflow results into a list of tibbles and writes individual #' tibbles to TSV, Parquet, SparkDF, or RDS format. #' -#' @param in_dir Directory path to UMCCR workflow results (can be GDS, S3 or local). +#' @param in_dir Directory path to UMCCR workflow results (can be S3 or local). #' @param prefix Prefix of output file(s). 
#' @param out_dir Output directory. -#' @param local_dir If `indir` is a GDS or S3 directory, 'recognisable' files -#' will be first downloaded to this directory (def: /dracarys__sync). +#' @param local_dir If `indir` is a S3 directory, 'recognisable' files +#' will be first downloaded to this directory (def: /dracarys_s3_sync). #' @param dryrun Just list the files that will be downloaded (def: FALSE). -#' @param token ICA access token (by default uses $ICA_ACCESS_TOKEN env var). #' @param out_format Format of output (tsv, parquet, both) (def: tsv). #' @param pattern Pattern to further filter the returned file type tibble (see #' `name` column in the `DR_FILE_REGEX` tibble). @@ -83,15 +82,9 @@ tidy_files <- function(x, envir = parent.frame()) { #' "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", #' "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" #' ) -#' in_dir <- paste0( -#' "gds://production/analysis_data/SBJ01639/tso_ctdna_tumor_only/", -#' "202204045ad5743c/L2200214/Results/PRJ220425_L2200214" -#' ) -#' o1 <- sub("^gds://", "", in_dir) #' o1 <- sub("s3:/", "~/s3", in_dir) #' out_dir <- o1 #' out_dir <- file.path(fs::path_home(), "icav1/g", o1) -#' # in_dir <- file.path(out_dir, "dracarys_gds_sync") #' prefix <- "SBJ01639" #' prefix <- "PTC_SSqCMM05pc_L2100067" #' out_format <- "rds" @@ -99,7 +92,6 @@ tidy_files <- function(x, envir = parent.frame()) { #' #' in_dir <- here::here(glue("nogit/tso/2022-12-13/SBJ02858/dracarys_gds_sync")) #' out_dir <- file.path(in_dir, "../out") -#' gds_local_dir <- NULL #' prefix <- "SBJ02858" #' dryrun <- F #' umccr_tidy(in_dir = in_dir, out_dir = out_dir, prefix = prefix, dryrun = F) @@ -107,35 +99,27 @@ tidy_files <- function(x, envir = parent.frame()) { #' @export umccr_tidy <- function(in_dir = NULL, out_dir = NULL, prefix = NULL, local_dir = NULL, out_format = "tsv", - dryrun = FALSE, token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, regexes = DR_FILE_REGEX) { + dryrun = FALSE, pattern = NULL, regexes = DR_FILE_REGEX) { assertthat::assert_that(!is.null(in_dir), !is.null(out_dir), !is.null(prefix)) dr_output_format_valid(out_format) e <- emojifont::emoji - if (grepl("^gds://", in_dir) || grepl("^s3://", in_dir)) { - # in_dir is gds/s3 - cloud_type <- ifelse(grepl("^gds://", in_dir), "gds", "s3") + if (grepl("^s3://", in_dir)) { + # in_dir is s3 + cloud_type <- "s3" local_dir <- local_dir %||% file.path(out_dir, glue("dracarys_{cloud_type}_sync")) pat <- pattern %||% ".*" # keep all recognisable files - if (cloud_type == "gds") { - dr_gds_download( - gdsdir = in_dir, outdir = local_dir, token = token, - pattern = pat, dryrun = dryrun - ) - } else { - dr_s3_download( - s3dir = in_dir, outdir = local_dir, - pattern = pat, dryrun = dryrun - ) - } + dr_s3_download( + s3dir = in_dir, outdir = local_dir, + pattern = pat, dryrun = dryrun + ) # Now use the downloaded results in_dir <- local_dir } else { - # in_dir is not gds or s3 + # in_dir is not s3 if (!is.null(local_dir)) { cli::cli_warn(glue( - "You have specified the 'local_dir' option to download GDS or S3 results, ", + "You have specified the 'local_dir' option to download S3 results, ", "but your input directory is local. Ignoring that option." 
)) } diff --git a/R/tso.R b/R/tso.R index 6df0cde..5eeb15a 100644 --- a/R/tso.R +++ b/R/tso.R @@ -23,29 +23,6 @@ #' prefix = prefix, #' format = "tsv" #' ) -#' -#' #---- GDS ----# -#' p <- file.path( -#' "gds://production/analysis_data/SBJ05563/tso_ctdna_tumor_only", -#' "20240914d41300cd/L2401388/Results" -#' ) -#' prefix <- "PRJ241446_L2401388" -#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' t2 <- Wf_tso_ctdna_tumor_only$new(path = p, prefix = prefix) -#' t2$list_files(max_files = 100, ica_token = token) -#' t2$list_files_filter_relevant(max_files = 100, ica_token = token) -#' d <- t2$download_files( -#' outdir = outdir, ica_token = token, -#' max_files = 100, dryrun = F -#' ) -#' d_tidy <- t2$tidy_files(d) -#' d_write <- t2$write( -#' d_tidy, -#' outdir = file.path(outdir, "dracarys_tidy"), -#' prefix = prefix, -#' format = "tsv" -#' ) #' } #' @export Wf_tso_ctdna_tumor_only <- R6::R6Class( @@ -55,7 +32,7 @@ Wf_tso_ctdna_tumor_only <- R6::R6Class( #' @field prefix The SampleID_LibraryID prefix of the tumor sample (needed for path lookup). prefix = NULL, #' @description Create a new Wf_tso_ctdna_tumor_only object. - #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). #' @param prefix The SampleID_LibraryID prefix of the tumor sample (needed for path lookup). initialize = function(path = NULL, prefix = NULL) { @@ -194,14 +171,13 @@ Wf_tso_ctdna_tumor_only <- R6::R6Class( #' #' Downloads files from the `tso_ctdna_tumor_only` workflow and writes them in a tidy format. #' -#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). #' @param prefix The SubjectID_LibraryID prefix of the sample (needed for path lookup). #' @param outdir Path to output directory with raw files. #' @param outdir_tidy Path to output directory with tidy files. #' @param format Format of output files. #' @param max_files Max number of files to list. -#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @return Tibble of tidy tibbles. 
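[Editor's note] With the GDS examples removed, a minimal S3-only usage sketch may help here. This is an illustration, not part of the patch: the s3:// path is a hypothetical placeholder, and only arguments that remain in the new signature (path, prefix, outdir, format, max_files, dryrun) are used.
# Hypothetical S3 results path; substitute a real tso_ctdna_tumor_only output directory.
p <- "s3://some-bucket/analysis_data/SBJ05563/tso_ctdna_tumor_only/20240914d41300cd/L2401388/Results"
prefix <- "PRJ241446_L2401388"
outdir <- sub("s3:/", "~/s3", p) # mirror the S3 layout locally
d <- dtw_Wf_tso_ctdna_tumor_only(
  path = p, prefix = prefix, outdir = outdir,
  format = "tsv", max_files = 1000,
  dryrun = TRUE # preview the files before downloading
)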
@@ -214,7 +190,6 @@ Wf_tso_ctdna_tumor_only <- R6::R6Class( #' ) #' prefix <- "PRJ241446_L2401388" #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' d <- dtw_Wf_tso_ctdna_tumor_only( #' path = p, prefix = prefix, outdir = outdir, #' format = "tsv", @@ -226,11 +201,10 @@ dtw_Wf_tso_ctdna_tumor_only <- function(path, prefix, outdir, outdir_tidy = file.path(outdir, "dracarys_tidy"), format = "rds", max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), dryrun = FALSE) { obj <- Wf_tso_ctdna_tumor_only$new(path = path, prefix = prefix) d_dl <- obj$download_files( - outdir = outdir, ica_token = ica_token, + outdir = outdir, max_files = max_files, dryrun = dryrun ) if (!dryrun) { diff --git a/R/tso_dragen.R b/R/tso_dragen.R deleted file mode 100644 index e69de29..0000000 diff --git a/R/umccrise.R b/R/umccrise.R index 21894d3..b14e944 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -26,32 +26,6 @@ #' prefix = glue("{SubjectID}__{SampleID_tumor}"), #' format = "tsv" #' ) -#' -#' #---- GDS ----# -#' SubjectID <- "SBJ04662" -#' SampleID_tumor <- "PRJ240647" -#' SampleID_normal <- "PRJ240646" -#' p1_gds <- "gds://production/analysis_data" -#' p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") -#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") -#' um2 <- Wf_umccrise$new( -#' path = p, SubjectID = SubjectID, -#' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal -#' ) -#' um2$list_files(max_files = 8) -#' um2$list_files_filter_relevant(ica_token = token, max_files = 1000) -#' d <- um2$download_files( -#' outdir = outdir, ica_token = token, -#' max_files = 1000, dryrun = F -#' ) -#' d_tidy <- um2$tidy_files(d) -#' d_write <- um2$write( -#' d_tidy, -#' outdir = file.path(outdir, "dracarys_tidy"), -#' prefix = glue("{SubjectID}__{SampleID_tumor}"), -#' format = "tsv" -#' ) #' } #' #' @export @@ -66,7 +40,7 @@ Wf_umccrise <- R6::R6Class( SampleID_tumor = NULL, SampleID_normal = NULL, #' @description Create a new Wf_umccrise object. - #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. @@ -243,7 +217,7 @@ Wf_umccrise <- R6::R6Class( #' #' Downloads files from the `umccrise` workflow and writes them in a tidy format. #' -#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). #' @param SubjectID The SubjectID of the sample. #' @param SampleID_tumor The SampleID of the tumor sample. @@ -251,7 +225,6 @@ Wf_umccrise <- R6::R6Class( #' @param outdir Path to output directory. #' @param format Format of output files. #' @param max_files Max number of files to list. -#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). #' @return Tibble of tidy data as list-cols. 
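[Editor's note] The same S3-only pattern applies to umccrise; a hedged sketch of the post-change Wf_umccrise_download_tidy_write() call follows. The s3:// path is a hypothetical placeholder; the sample identifiers are reused from the removed GDS example above.
SubjectID <- "SBJ04662"
SampleID_tumor <- "PRJ240647"
SampleID_normal <- "PRJ240646"
# Hypothetical S3 results path; no ICA token is needed any more.
p <- "s3://some-bucket/analysis_data/SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239"
outdir <- sub("s3:/", "~/s3", p)
d <- Wf_umccrise_download_tidy_write(
  path = p, SubjectID = SubjectID,
  SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal,
  outdir = outdir, format = "tsv", dryrun = TRUE
)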
@@ -264,7 +237,6 @@ Wf_umccrise <- R6::R6Class( #' p1_gds <- "gds://production/analysis_data" #' p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") #' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -#' token <- Sys.getenv("ICA_ACCESS_TOKEN") #' d <- Wf_umccrise_download_tidy_write( #' path = p, SubjectID = SubjectID, #' SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal, @@ -276,15 +248,13 @@ Wf_umccrise <- R6::R6Class( Wf_umccrise_download_tidy_write <- function(path, SubjectID, SampleID_tumor, SampleID_normal, outdir, format = "rds", max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), dryrun = FALSE) { um <- Wf_umccrise$new( path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal ) d_dl <- um$download_files( - outdir = outdir, ica_token = ica_token, - max_files = max_files, dryrun = dryrun + outdir = outdir, max_files = max_files, dryrun = dryrun ) if (!dryrun) { d_tidy <- um$tidy_files(d_dl) diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml index 9835ae8..5ae753e 100644 --- a/conda/recipe/meta.yaml +++ b/conda/recipe/meta.yaml @@ -29,8 +29,6 @@ requirements: - r-ggrepel - r-glue - r-here - - r-httr - - r-jose - r-jsonlite - r-knitr - r-lubridate @@ -61,8 +59,6 @@ requirements: - r-ggrepel - r-glue - r-here - - r-httr - - r-jose - r-jsonlite - r-knitr - r-lubridate diff --git a/inst/cli/tidy.R b/inst/cli/tidy.R index a5c7680..ac47a4d 100644 --- a/inst/cli/tidy.R +++ b/inst/cli/tidy.R @@ -1,10 +1,10 @@ tidy_add_args <- function(subp) { tidy <- subp$add_parser("tidy", help = "Tidy UMCCR Workflow Outputs") - tidy$add_argument("-i", "--in_dir", help = glue("{emoji('snowman')} Directory with untidy UMCCR workflow results. Can be GDS, S3 or local."), required = TRUE) + tidy$add_argument("-i", "--in_dir", help = glue("{emoji('snowman')} Directory with untidy UMCCR workflow results. Can be S3 or local."), required = TRUE) tidy$add_argument("-o", "--out_dir", help = glue("{emoji('fire')} Directory to output tidy results."), required = TRUE) tidy$add_argument("-p", "--prefix", help = glue("{emoji('violin')} Prefix string used for all results."), required = TRUE) - tidy$add_argument("-t", "--token", help = glue("{emoji('see_no_evil')} ICA access token. Default: ICA_ACCESS_TOKEN env var."), default = Sys.getenv("ICA_ACCESS_TOKEN")) - tidy$add_argument("-l", "--local_dir", help = glue("{emoji('inbox_tray')} If input is a GDS/S3 directory, download the recognisable files to this directory. Default: '/dracarys__sync'.")) + # tidy$add_argument("-t", "--token", help = glue("{emoji('see_no_evil')} ICA access token. Default: ICA_ACCESS_TOKEN env var."), default = Sys.getenv("ICA_ACCESS_TOKEN")) + tidy$add_argument("-l", "--local_dir", help = glue("{emoji('inbox_tray')} If input is an S3 directory, download the recognisable files to this directory. Default: '/dracarys_s3_sync'.")) tidy$add_argument("-f", "--format", help = glue("{emoji('art')} Format of output. 
Default: %(default)s."), default = "tsv") tidy$add_argument("-n", "--dryrun", help = glue("{emoji('camel')} Dry run - just show files to be tidied."), action = "store_true") tidy$add_argument("-q", "--quiet", help = glue("{emoji('sleeping')} Shush all the logs."), action = "store_true") @@ -22,7 +22,7 @@ tidy_parse_args <- function(args) { rds_dir <- normalizePath(args$rds_dir) rds_path <- file.path(rds_dir, glue("{args$prefix}_dracarys_data.rds")) } - token <- ica_token_validate(args$token) + # token <- ica_token_validate(args$token) tidy_args <- list( in_dir = args$in_dir, @@ -31,7 +31,7 @@ tidy_parse_args <- function(args) { local_dir = local_dir, out_format = args$format, dryrun = args$dryrun, - token = token, + # token = token, pattern = args$pattern ) diff --git a/inst/rmd/umccr_workflows/alignment_qc/.gitignore b/inst/reports/wgts-qc/.gitignore similarity index 68% rename from inst/rmd/umccr_workflows/alignment_qc/.gitignore rename to inst/reports/wgts-qc/.gitignore index fcd32d6..63bdc92 100644 --- a/inst/rmd/umccr_workflows/alignment_qc/.gitignore +++ b/inst/reports/wgts-qc/.gitignore @@ -1,3 +1,5 @@ /nogit/ /html/ /params/ + +/.quarto/ diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/reports/wgts-qc/dl_and_tidy.R similarity index 95% rename from inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R rename to inst/reports/wgts-qc/dl_and_tidy.R index c1977d4..55663b9 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/reports/wgts-qc/dl_and_tidy.R @@ -18,12 +18,11 @@ c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION") |> token <- rportal::orca_jwt() |> rportal::jwt_validate() dates <- c( - "2024-11-23", - "2024-11-24" + "2024-12-01" ) |> stringr::str_remove_all("-") |> paste(collapse = "|") -wf0 <- rportal::orca_workflow_list(wf_name = "wgts-qc", token = token, page_size = 500) +wf0 <- rportal::orca_workflow_list(wf_name = "wgts-qc", token = token, page_size = 50) # get pld wf1 <- wf0 |> filter(grepl(dates, .data$portalRunId)) |> @@ -121,7 +120,7 @@ data_tidy <- wf_lims |> ) |> ungroup() -outdir1 <- fs::dir_create("inst/rmd/umccr_workflows/alignment_qc/nogit/tidy_data_rds") -date1 <- "2024-11-24" +outdir1 <- fs::dir_create("inst/reports/wgts-qc/nogit/tidy_data_rds") +date1 <- "2024-12-03" data_tidy |> saveRDS(here(glue("{outdir1}/{date1}_wgts.rds"))) diff --git a/inst/reports/wgts-qc/render.sh b/inst/reports/wgts-qc/render.sh new file mode 100644 index 0000000..480d75d --- /dev/null +++ b/inst/reports/wgts-qc/render.sh @@ -0,0 +1,8 @@ +date1="${1:-$(date +%Y-%m-%d)}" +out="alignqc_${date1}.html" +tidy_data="nogit/tidy_data_rds/${date1}_wgts.rds" + +quarto render summary.qmd \ + -P tidy_data:${tidy_data} \ + -o ${out} \ + --output-dir "nogit/html" diff --git a/inst/reports/wgts-qc/summary.qmd b/inst/reports/wgts-qc/summary.qmd index 88ceafd..13e042f 100644 --- a/inst/reports/wgts-qc/summary.qmd +++ b/inst/reports/wgts-qc/summary.qmd @@ -24,7 +24,7 @@ format: grid: body-width: 1300px params: - tidy_data: "~/projects/dracarys/inst/rmd/umccr_workflows/alignment_qc/nogit/tidy_data_rds/2024-11-24_wgts.rds" + tidy_data: "~/projects/dracarys/inst/reports/wgts-qc/nogit/tidy_data_rds/2024-12-03_wgts.rds" --- ```{r} @@ -207,6 +207,7 @@ d0 |> ```{r} #| label: mapmetrics d_map <- dr_unnest("mapmetrics") |> + filter(RG == "Total") |> arrange(desc(umccrId), type) |> mutate( umccrId = get_lib_url(lid = .data$libraryId, text = .data$umccrId), @@ -461,8 +462,8 @@ gc_data_plot <- gc_data |> title = "Read GC Content", subtitle = glue("Total number 
of reads with each GC content\npercentile between 0% and 100%") ) -# plotly::ggplotly(gc_data_plot) -gc_data_plot +plotly::ggplotly(gc_data_plot) +# gc_data_plot ``` ### GC Content Quality ('GC Content Mean Quality Scores') @@ -489,8 +490,8 @@ f1_plot <- ggplot() + title = "GC Content Quality", subtitle = glue("Average Phred-scale read mean quality for reads with\neach GC content percentile between 0% and 100%.") ) -# plotly::ggplotly(f1_plot) -f1_plot +plotly::ggplotly(f1_plot) +# f1_plot ``` ### Positional Base Content ('Per-Position Sequence Content') @@ -602,7 +603,7 @@ read_len_plot ```{r} #| label: fqc_sequencePositions -#| fig-height: 42 +#| fig-height: 80 f1 <- dr_unnest("fqc_sequencePositions") f1 |> ggplot(aes(x = bp, y = value, colour = seq)) + diff --git a/inst/scripts/ica_download.R b/inst/scripts/ica_download.R deleted file mode 100644 index 7bc63a9..0000000 --- a/inst/scripts/ica_download.R +++ /dev/null @@ -1,113 +0,0 @@ -# List/Download files from ICA for testing -require(fs) -require(here) -require(httr) -require(jsonlite) -require(tidyverse) -require(dracarys) - -#---- functions ----# - -dr_download_multiqc <- function(gdsdir, outdir, token = Sys.getenv("ICA_TOKEN_PROD")) { - fs::dir_create(outdir) - d <- gds_files_list(gdsdir = gdsdir, token = token) |> - dplyr::mutate(type = purrr::map_chr(bname, dracarys::match_regex)) |> - dplyr::select(.data$dname, .data$type, .data$size, .data$path, .data$bname) - - # download dracarys files to outdir/{dname}.json - d |> - dplyr::filter(.data$type == "multiqc") |> - dplyr::mutate(out = file.path(outdir, glue("{dname}.json"))) |> - dplyr::rowwise() |> - dplyr::mutate(cmd = gds_file_download(path, out, token)) -} - -#---- download ----# -#---- dragen wgs validation ----# -samples_wgs <- c("2016.249.18.WH.P025", "SBJ00303") # SEQC50 -for (sname in samples_wgs) { - dr_download( - gdsdir = glue("gds://development/validation_data/wgs/{sname}/analysis/dragen_somatic/"), - outdir = here(glue("nogit/wgs/dragen/{sname}")), - token = Sys.getenv("ICA_TOKEN_PROD") - ) -} - -#---- tso ----# -samples_tso <- c( - # "SBJ00595", "SBJ00998", "SBJ00999" ,"SBJ01001", "SBJ01003", "SBJ01032", - # "SBJ01040", "SBJ01043", "SBJ01046", "SBJ01059", "SBJ01131", "SBJ01132", - # "SBJ01133", "SBJ01134", "SBJ01135", "SBJ01136", "SBJ01137", "SBJ01138", - # "SBJ01139", "SBJ01140", "SBJ01141", "SBJ01142", "SBJ01143", "SBJ01144", - # "SBJ01146", "SBJ01150", "SBJ01156", "SBJ01157", "SBJ01158", "SBJ01159", - # "SBJ01160", "SBJ01162", "SBJ01163", "SBJ01164", "SBJ01195", "SBJ01196", - # "SBJ01197", "SBJ01211", "SBJ01212", "SBJ01213", "SBJ01214", "SBJ01215", - # "SBJ01216", "SBJ01217", "SBJ01218", "SBJ01219", "SBJ01220", "SBJ01221", - # "SBJ01222", "SBJ01223", "SBJ01224", "SBJ02679", "SBJ02299", "SBJ02298", - # "SBJ02096", "SBJ02067", "SBJ02559", "SBJ02558" -) -for (sname in samples_tso) { - dr_download( - gdsdir = glue("gds://production/analysis_data/{sname}/tso_ctdna_tumor_only/"), - outdir = here(glue("nogit/tso/{sname}")), - token = Sys.getenv("ICA_TOKEN_PROD") - ) -} - -#---- dragen wts ----# -samples_wts <- c( - "SBJ02298", "SBJ02299", "SBJ02300" -) -for (sname in samples_wts) { - dr_download( - gdsdir = glue("gds://production/analysis_data/{sname}/wts_tumor_only/"), - outdir = here(glue("nogit/wts/dragen/{sname}")), - token = Sys.getenv("DRACARYS_TOKEN_PROD") - ) -} - -#---- multiqc ----# -samples <- c( - "SBJ02402", - "SBJ02403", - "SBJ02404", - "SBJ02405", - "SBJ02406", - "SBJ02407" -) - -for (sname in samples) { - workflow <- "wts_tumor_only" - 
dr_download_multiqc( - gdsdir = glue("gds://production/analysis_data/{sname}/{workflow}/"), - outdir = here(glue("nogit/multiqc/dragen/{workflow}/{sname}")), - token = Sys.getenv("DRACARYS_TOKEN_PROD") - ) -} - -for (sname in samples) { - workflow <- "wgs_alignment_qc" - dr_download_multiqc( - gdsdir = glue("gds://production/analysis_data/{sname}/{workflow}/"), - outdir = here(glue("nogit/multiqc/dragen/{workflow}/{sname}")), - token = Sys.getenv("DRACARYS_TOKEN_PROD") - ) -} - -for (sname in samples) { - workflow <- "wgs_tumor_normal" - dr_download_multiqc( - gdsdir = glue("gds://production/analysis_data/{sname}/{workflow}/"), - outdir = here(glue("nogit/multiqc/dragen/{workflow}/{sname}")), - token = Sys.getenv("DRACARYS_TOKEN_PROD") - ) -} - -for (sname in samples) { - workflow <- "umccrise" - dr_download_multiqc( - gdsdir = glue("gds://production/analysis_data/{sname}/{workflow}/"), - outdir = here(glue("nogit/multiqc/dragen/{workflow}/{sname}")), - token = Sys.getenv("DRACARYS_TOKEN_PROD") - ) -} diff --git a/inst/scripts/multiqc_run.R b/inst/scripts/multiqc_run.R deleted file mode 100644 index c665364..0000000 --- a/inst/scripts/multiqc_run.R +++ /dev/null @@ -1,47 +0,0 @@ -require(dracarys) -require(here) -require(glue) -require(dplyr) -require(readr) - -# SQL -# select * from data_portal.data_portal_gdsfile where regexp_like(path, 'multiqc_data.json') order by time_created desc; -d <- glue("nogit/multiqc/sql/2023-05-01_gds_multiqcjson_query_dd36b739-af05-4954-8818-3ada048d7394.csv") |> - here() |> - read_csv(col_names = TRUE) - -wf <- c("umccrise", "wgs_alignment_qc", "wgs_tumor_normal") -wf <- "wts_tumor_only" - -x <- d |> - filter(!grepl("bclconvert|interop", path)) |> - mutate( - sbj = sub("/analysis_data/(SBJ.*?)/.*", "\\1", path), - workflow = sub("/analysis_data/SBJ.*/(.*?)/.*", "\\1", path), - dir = dirname(path), - gds_indir = glue("gds://{volume_name}{dir}/"), - unique_hash = substr(unique_hash, 1, 6), - time_created = as.Date(time_created) - ) |> - select(sbj, workflow, gds_indir, time_created, unique_hash) |> - # filter(time_created >= "2023-04-30") |> - filter(workflow %in% wf) |> - mutate( - outdir = here(glue("nogit/warehouse/{workflow}/{sbj}/{time_created}_{unique_hash}")), - local_indir = file.path(outdir, "dracarys_gds_sync") - ) |> - arrange(sbj, time_created) |> - select(sbj, gds_indir, outdir, local_indir, time_created) - - -token <- Sys.getenv("ICA_ACCESS_TOKEN_PRO") -dryrun <- TRUE -dryrun <- FALSE - -for (i in seq_len(nrow(x))) { - print(i) - print(x$gds_indir[i]) - dracarys::umccr_tidy(in_dir = x$gds_indir[i], out_dir = x$outdir[i], prefix = x$sbj[i], dryrun = dryrun, token = token) - # print(x$local_indir[i]) - # dracarys::umccr_tidy(in_dir = x$local_indir[i], out_dir = x$outdir[i], prefix = x$sbj[i], dryrun = dryrun, token = token) -} diff --git a/inst/scripts/pcgr_run.R b/inst/scripts/pcgr_run.R deleted file mode 100644 index e518bbc..0000000 --- a/inst/scripts/pcgr_run.R +++ /dev/null @@ -1,56 +0,0 @@ -require(dracarys) -require(here) -require(dplyr) -require(readr) - -# SQL -# select * from data_portal.data_portal_gdsfile where (regexp_like(path, '-somatic.pcgr.json.gz') AND NOT regexp_like(path, 'pcgr_run')) order by time_created desc; - -d <- here("nogit/pcgr/sql/279893c1-79e4-44d7-808e-e34635bd9e50_2023-01-16.csv") |> - read_csv(col_names = TRUE) - - -x <- d |> - mutate( - sbj = sub("/analysis_data/(SBJ.*?)/.*", "\\1", path), - dir = dirname(path), - gds_indir = glue("gds://{volume_name}{dir}/") - ) |> - group_by(sbj) |> - # TODO: check if results 
already exist for same sbj - mutate( - n_samp = n(), - sbj2 = if_else(n_samp > 1, glue("{sbj}_{dplyr::row_number()}"), sbj) - ) |> - ungroup() |> - select(sbj, sbj2, gds_indir, date = time_created) |> - arrange(sbj2) |> - mutate( - outdir = here(glue("nogit/pcgr/2023-01-16/{sbj2}")), - local_indir = file.path(outdir, "dracarys_gds_sync") - ) |> - select(sbj2, gds_indir, outdir, local_indir, date) - - -token <- Sys.getenv("ICA_ACCESS_TOKEN_PROD") -dryrun <- F -for (i in 101:276) { - print(i) - # print(x$gds_indir[i]) - # print(x$local_indir[i]) - umccr_tidy(in_dir = x$gds_indir[i], out_dir = x$outdir[i], prefix = x$sbj2[i], dryrun = dryrun, token = token) - # umccr_tidy(in_dir = x$local_indir[i], out_dir = x$outdir[i], prefix = x$sbj2[i], dryrun = FALSE, token = token) -} - -# x |> -# mutate( -# cmd = glue("./dracarys.R tso -i {gds_indir} -o {outdir} -r {outdir}/report_dir -p {sbj2} --rds_dir {outdir}/rds_dir --quiet_rmd") -# ) |> -# select(cmd) |> -# write_tsv(here("inst/cli/run.sh"), col_names = FALSE) - -res <- read_rds(here("nogit/pcgr/rds/res_2023-01-17.rds")) - -dplyr::left_join(x, res, by = c("sbj2" = "sbj")) |> - dplyr::select(sbj = sbj2, date, fracIndels, predicted_class, tmb_estimate, n_tmb, gds_indir) |> - readr::write_tsv(here("nogit/pcgr/res_2023-01-17.tsv")) diff --git a/inst/scripts/portal.R b/inst/scripts/portal.R deleted file mode 100644 index 6acb527..0000000 --- a/inst/scripts/portal.R +++ /dev/null @@ -1,36 +0,0 @@ -require(tidyverse) -require(jsonlite) -require(here) -require(glue) - -pmeta <- "nogit/data_portal/2023-05-07_workflows_5f43da12-0b6a-41ce-86c9-9ee62df8792e.csv" - -#---- cttso (succeeded)----# - -lims <- read_tsv("~/Downloads/Google LIMS - Sheet1.tsv") -table(wf1$LibraryID_w_rerun %in% lims$LibraryID) -lims |> filter(LibraryID %in% wf1$LibraryID_w_rerun) - -x <- wf1 |> - mutate( - year1 = lubridate::year(start), - month1 = sprintf("%02d", lubridate::month(start)), - outdir = here(glue("nogit/warehouse/cttso/{year1}/{month1}/{wfr_id}")), - local_indir = file.path(outdir, "dracarys_gds_sync") - ) |> - select( - -c(year1, month1) - ) - -token <- Sys.getenv("ICA_ACCESS_TOKEN_PRO") -dryrun <- TRUE -dryrun <- FALSE - -# for (i in seq_len(nrow(x))) { -for (i in 11:20) { - print(i) - print(x$gds_indir[i]) - dracarys::umccr_tidy(in_dir = x$gds_indir[i], out_dir = x$outdir[i], prefix = x$LibraryID_w_rerun[i], dryrun = dryrun, token = token) - # print(x$local_indir[i]) - # dracarys::umccr_tidy(in_dir = x$local_indir[i], out_dir = x$outdir[i], prefix = x$sbj[i], dryrun = dryrun, token = token) -} diff --git a/inst/scripts/summarise.R b/inst/scripts/summarise.R deleted file mode 100644 index 61b8fdb..0000000 --- a/inst/scripts/summarise.R +++ /dev/null @@ -1,41 +0,0 @@ -require(ggforce) -require(tidyverse) -require(arrow) - -# Summarise and plot TMB results - -d <- - tibble( - x = list.files( - here::here("nogit/tso"), - pattern = "tmb\\.json\\.gz$", - recursive = TRUE, full.names = TRUE - ) - ) |> - rowwise() |> - mutate( - obj = list(TsoTmbFile$new(x)), - sample = sub(".tmb.json.gz", "", obj$bname()), - y = list(read(obj)) - ) |> - unnest(y) |> - select(-c(x, obj)) |> - pivot_longer(TmbPerMb:CodingRegionSizeMb) - -theme_set(theme_bw()) -d |> - ggplot(aes(x = "", y = value, label = sample, colour = sample)) + - geom_violin(fill = "transparent", colour = "grey80", alpha = 0.4) + - # geom_point() + - ggforce::geom_sina(aes( - group = name, - colour = sample, - ), seed = 42) + - facet_wrap(~name, scales = "free") - -d |> - ggplot(aes(x = value)) + - geom_histogram(fill = 
"purple", colour = "black") + - facet_wrap(~name, scales = "free") - -arrow::read_parquet(here::here("nogit/tso/tmb/56.TMB.parquet")) diff --git a/inst/scripts/tso_run.R b/inst/scripts/tso_run.R deleted file mode 100644 index 516f4ad..0000000 --- a/inst/scripts/tso_run.R +++ /dev/null @@ -1,30 +0,0 @@ -require(tidyverse) -require(jsonlite) -require(here) -require(glue) - -pmeta <- here::here("nogit/data_portal/2023-05-07_workflows_5f43da12-0b6a-41ce-86c9-9ee62df8792e.csv") |> - dracarys::cttso_metadata() - -x <- pmeta |> - dplyr::mutate( - year1 = lubridate::year(start), - month1 = sprintf("%02d", lubridate::month(start)), - outdir = here::here(glue("nogit/warehouse/cttso/{year1}/{month1}/{wfr_id}")), - local_indir = file.path(outdir, "dracarys_gds_sync") - ) |> - dplyr::select( - -c(year1, month1) - ) - -token <- Sys.getenv("ICA_ACCESS_TOKEN_PRO") -dryrun <- TRUE -dryrun <- FALSE - -for (i in seq_len(nrow(x))) { - print(i) - print(x$gds_indir[i]) - dracarys::umccr_tidy(in_dir = x$gds_indir[i], out_dir = x$outdir[i], prefix = x$LibraryID_w_rerun[i], dryrun = dryrun, token = token) - # print(x$local_indir[i]) - # dracarys::umccr_tidy(in_dir = x$local_indir[i], out_dir = x$outdir[i], prefix = x$sbj[i], dryrun = dryrun, token = token) -} diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R deleted file mode 100644 index ea4d2f4..0000000 --- a/inst/scripts/umccrise_run.R +++ /dev/null @@ -1,62 +0,0 @@ -require(dracarys) -require(rportal, include.only = "portaldb_query_workflow") -require(here, include.only = "here") -require(glue, include.only = "glue") -require(readr, include.only = "read_rds") -require(dplyr) -require(tidyr, include.only = "separate_wider_delim") - -start_date <- "2024-08-29" -query_workflow_umccrise <- function(start_date) { - q1 <- glue( - "WHERE \"type_name\" = 'umccrise' AND \"start\" > date(\'{start_date}\') ", - "ORDER BY \"start\" DESC;" - ) - rportal::portaldb_query_workflow(q1) -} - -query_limsrow_libids <- function(libids) { - assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) - libids <- unique(libids) |> - paste(collapse = "|") - q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") - rportal::portaldb_query_limsrow(q1) -} - -# p_raw <- query_workflow_umccrise(start_date) -p_raw_rds <- here(glue("nogit/data_portal/workflows/{start_date}.rds")) -# saveRDS(p_raw, file = p_raw_rds) -p_raw <- readr::read_rds(p_raw_rds) - -p <- p_raw |> - rportal::meta_umccrise(status = "Succeeded") -# lims_raw <- query_limsrow_libids(p$LibraryID_tumor) -lims_raw_rds <- here(glue("nogit/data_portal/lims/{start_date}.rds")) -# saveRDS(lims_raw, file = lims_raw_rds) -# L2100192 is L2100192_rerun in the lims, 15 libs are rerun/topup/topup2 -lims_raw <- readr::read_rds(lims_raw_rds) -lims <- lims_raw |> - tidyr::separate_wider_delim( - library_id, - delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" - ) |> - select( - subject_id, library_id, sample_id, sample_name, - external_subject_id, external_sample_id, - project_name, project_owner, - source, quality - ) |> - distinct() -table(lims$library_id %in% p$LibraryID_tumor) # double-check - -d <- p |> - left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> - mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) |> - select( - wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, - SampleID_tumor, SampleID_normal, gds_outdir_umccrise, gds_indir_dragen_somatic, 
external_subject_id, external_sample_id, - project_owner, project_name, source, quality - ) -d - -saveRDS(d, file = here(glue("nogit/data_portal/workflows/umccrise_tidy_{start_date}.rds"))) diff --git a/man/Wf.Rd b/man/Wf.Rd index d6d30c7..e2ceb03 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -9,7 +9,7 @@ workflow run from a UMCCR workflow manager. A workflow has: \itemize{ -\item a directory path with all the raw output files (either on GDS, S3 or +\item a directory path with all the raw output files (either on S3 or local filesystem) \item a subset of files that are of interest for ingestion \itemize{ @@ -42,20 +42,6 @@ um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) um1$list_files(max_files = 10) um1$list_files_filter_relevant(max_files = 10) -#---- GDS ----# -p1_gds <- "gds://production/analysis_data" -p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063") -outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -token <- Sys.getenv("ICA_ACCESS_TOKEN") -um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes) -um2$list_files(max_files = 10) -um2$list_files_filter_relevant(ica_token = token, max_files = 500) -d <- um2$download_files( - outdir = outdir, ica_token = token, - max_files = 1000, dryrun = T -) -d_tidy <- um2$tidy_files(d) - #---- S3 ----# p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3" p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables" @@ -134,12 +120,7 @@ Print details about the Workflow. \subsection{Method \code{list_files()}}{ List all files under given path. \subsection{Usage}{ -\if{html}{\out{
<div class="r">}}\preformatted{Wf$list_files( - path = private$.path, - max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - ... -)}\if{html}{\out{</div>
}} +\if{html}{\out{<div class="r">
}}\preformatted{Wf$list_files(path = private$.path, max_files = 1000)}\if{html}{\out{</div>
}} } \subsection{Arguments}{ @@ -147,11 +128,7 @@ List all files under given path. \describe{ \item{\code{path}}{Path with raw results.} -\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} - -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{\code{...}}{Passed on to \code{gds_list_files_dir} function.} +\item{\code{max_files}}{Max number of files to list.} } \if{html}{\out{</div>}} } @@ -162,12 +139,7 @@ List all files under given path. \subsection{Method \code{list_files_filter_relevant()}}{ List dracarys files under given path \subsection{Usage}{ -\if{html}{\out{<div class="r">
}}\preformatted{Wf$list_files_filter_relevant( - path = private$.path, - max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - ... -)}\if{html}{\out{</div>
}} +\if{html}{\out{<div class="r">
}}\preformatted{Wf$list_files_filter_relevant(path = private$.path, max_files = 1000, ...)}\if{html}{\out{</div>
}} } \subsection{Arguments}{ @@ -175,12 +147,9 @@ List dracarys files under given path \describe{ \item{\code{path}}{Path with raw results.} -\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} - -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} +\item{\code{max_files}}{Max number of files to list.} -\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or -the \code{s3_list_files_filter_relevant} function.} +\item{\code{...}}{Passed on to \code{s3_list_files_filter_relevant}.} } \if{html}{\out{</div>}} } @@ -208,15 +177,13 @@ For DOWNLOAD_ONLY files, just return the input path. \if{html}{\out{<a id="method-Wf-download_files"></a>}} \if{latex}{\out{\hypertarget{method-Wf-download_files}{}}} \subsection{Method \code{download_files()}}{ -Download files from GDS/S3 to local filesystem. +Download files from S3 to local filesystem. \subsection{Usage}{ \if{html}{\out{<div class="r">
<div class="r">}}\preformatted{Wf$download_files( path = private$.path, outdir, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, - dryrun = FALSE, - recursive = NULL + dryrun = FALSE )}\if{html}{\out{</div>
}} } @@ -227,15 +194,10 @@ Download files from GDS/S3 to local filesystem. \item{\code{outdir}}{Path to output directory.} -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - \item{\code{max_files}}{Max number of files to list.} \item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't download them).} - -\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} } \if{html}{\out{}} } diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index eeeabf1..c4d198c 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -27,24 +27,6 @@ d_write <- d1$write( prefix = prefix, format = "tsv" ) -#---- GDS ----# -prefix <- "PRJ222358" -p <- file.path( - "gds://production/analysis_data/SBJ03001/wgs_tumor_normal", - "20241108fc293a38/L2201805_L2201797_dragen_somatic" -) -outdir <- file.path(sub("gds:/", normalizePath("~/icav1/g"), p)) # for GDS case -d1 <- Wf_dragen$new(path = p, prefix = prefix) -d1$list_files(max_files = 100) -d1$list_files_filter_relevant(max_files = 300) -d <- d1$download_files(max_files = 100, outdir = outdir, dryrun = F) -d_tidy <- d1$tidy_files(d) -d_write <- d1$write( - d_tidy, - outdir = file.path(p, "dracarys_tidy"), - prefix = prefix, - format = "tsv" -) } } \section{Super class}{ diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index 23b54d0..30a77fd 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -125,7 +125,7 @@ Create a new Wf_sash object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +\item{\code{path}}{Path to directory with raw workflow results (from S3 or local filesystem).} \item{\code{SubjectID}}{The SubjectID of the sample.} diff --git a/man/Wf_sash_download_tidy_write.Rd b/man/Wf_sash_download_tidy_write.Rd index 2a3260e..989f843 100644 --- a/man/Wf_sash_download_tidy_write.Rd +++ b/man/Wf_sash_download_tidy_write.Rd @@ -12,13 +12,12 @@ Wf_sash_download_tidy_write( outdir, format = "rds", max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), regexes = NULL, dryrun = FALSE ) } \arguments{ -\item{path}{Path to directory with raw workflow results (from GDS, S3, or +\item{path}{Path to directory with raw workflow results (from S3 or local filesystem).} \item{SubjectID}{The SubjectID of the sample.} @@ -33,8 +32,6 @@ local filesystem).} \item{max_files}{Max number of files to list.} -\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - \item{regexes}{Tibble with file \code{regex} and \code{fun}ction to parse it. Use only if you want to override the default regexes used for this workflow.} @@ -54,7 +51,6 @@ SampleID_tumor <- "PRJ230004" p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -token <- Sys.getenv("ICA_ACCESS_TOKEN") d <- Wf_sash_download_tidy_write( path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, outdir = outdir, diff --git a/man/Wf_tso_ctdna_tumor_only.Rd b/man/Wf_tso_ctdna_tumor_only.Rd index a58245e..81d0297 100644 --- a/man/Wf_tso_ctdna_tumor_only.Rd +++ b/man/Wf_tso_ctdna_tumor_only.Rd @@ -26,29 +26,6 @@ d_write <- t1$write( prefix = prefix, format = "tsv" ) - -#---- GDS ----# -p <- file.path( - "gds://production/analysis_data/SBJ05563/tso_ctdna_tumor_only", - "20240914d41300cd/L2401388/Results" -) -prefix <- "PRJ241446_L2401388" -outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -token <- Sys.getenv("ICA_ACCESS_TOKEN") -t2 <- Wf_tso_ctdna_tumor_only$new(path = p, prefix = prefix) -t2$list_files(max_files = 100, ica_token = token) -t2$list_files_filter_relevant(max_files = 100, ica_token = token) -d <- t2$download_files( - outdir = outdir, ica_token = token, - max_files = 100, dryrun = F -) -d_tidy <- t2$tidy_files(d) -d_write <- t2$write( - d_tidy, - outdir = file.path(outdir, "dracarys_tidy"), - prefix = prefix, - format = "tsv" -) } } \section{Super class}{ @@ -105,7 +82,7 @@ Create a new Wf_tso_ctdna_tumor_only object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +\item{\code{path}}{Path to directory with raw workflow results (from S3 or local filesystem).} \item{\code{prefix}}{The SampleID_LibraryID prefix of the tumor sample (needed for path lookup).} diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index de10726..8baee43 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -29,32 +29,6 @@ d_write <- um1$write( prefix = glue("{SubjectID}__{SampleID_tumor}"), format = "tsv" ) - -#---- GDS ----# -SubjectID <- "SBJ04662" -SampleID_tumor <- "PRJ240647" -SampleID_normal <- "PRJ240646" -p1_gds <- "gds://production/analysis_data" -p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") -outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -token <- Sys.getenv("ICA_ACCESS_TOKEN") -um2 <- Wf_umccrise$new( - path = p, SubjectID = SubjectID, - SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal -) -um2$list_files(max_files = 8) -um2$list_files_filter_relevant(ica_token = token, max_files = 1000) -d <- um2$download_files( - outdir = outdir, ica_token = token, - max_files = 1000, dryrun = F -) -d_tidy <- um2$tidy_files(d) -d_write <- um2$write( - d_tidy, - outdir = file.path(outdir, "dracarys_tidy"), - prefix = glue("{SubjectID}__{SampleID_tumor}"), - format = "tsv" -) } } @@ -115,7 +89,7 @@ Create a new Wf_umccrise object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +\item{\code{path}}{Path to directory with raw workflow results (from S3 or local filesystem).} \item{\code{SubjectID}}{The SubjectID of the sample.} diff --git a/man/Wf_umccrise_download_tidy_write.Rd b/man/Wf_umccrise_download_tidy_write.Rd index 99eeacc..20ee6c4 100644 --- a/man/Wf_umccrise_download_tidy_write.Rd +++ b/man/Wf_umccrise_download_tidy_write.Rd @@ -12,12 +12,11 @@ Wf_umccrise_download_tidy_write( outdir, format = "rds", max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), dryrun = FALSE ) } \arguments{ -\item{path}{Path to directory with raw workflow results (from GDS, S3, or +\item{path}{Path to directory with raw workflow results (from S3 or local filesystem).} \item{SubjectID}{The SubjectID of the sample.} @@ -32,8 +31,6 @@ local filesystem).} \item{max_files}{Max number of files to list.} -\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} } @@ -51,7 +48,6 @@ SampleID_normal <- "PRJ240646" p1_gds <- "gds://production/analysis_data" p <- file.path(p1_gds, "SBJ04662/umccrise/20240302e66750fe/L2400240__L2400239") outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -token <- Sys.getenv("ICA_ACCESS_TOKEN") d <- Wf_umccrise_download_tidy_write( path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, SampleID_normal = SampleID_normal, diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd deleted file mode 100644 index 1faaf8a..0000000 --- a/man/dr_gds_download.Rd +++ /dev/null @@ -1,52 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fs_icav1.R -\name{dr_gds_download} -\alias{dr_gds_download} -\title{dracarys GDS Download} -\usage{ -dr_gds_download( - gdsdir, - outdir, - token = Sys.getenv("ICA_ACCESS_TOKEN"), - pattern = NULL, - page_size = 100, - dryrun = FALSE, - regexes = DR_FILE_REGEX, - recursive = NULL -) -} -\arguments{ -\item{gdsdir}{Full path to GDS directory.} - -\item{outdir}{Local output directory.} - -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{pattern}{Pattern to further filter the returned file type tibble.} - -\item{page_size}{Page size (def: 10 via ICA API).} - -\item{dryrun}{If TRUE, just list the files that will be downloaded (don't -download them).} - -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} - -\item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} -} -\description{ -Download only GDS files that can be processed by dracarys. 
-} -\examples{ -\dontrun{ -gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -outdir <- sub("gds:/", "~/icav1/g", gdsdir) -regexes <- tibble::tribble( - ~regex, ~fun, - "multiqc_data\\\\.json$", "MultiqcJsonFile", - "-somatic\\\\.pcgr\\\\.json\\\\.gz$", "pcgrjson" -) -dr_gds_download(gdsdir = gdsdir, outdir = outdir, regexes = regexes, dryrun = T) -} - -} diff --git a/man/dtw_Wf_tso_ctdna_tumor_only.Rd b/man/dtw_Wf_tso_ctdna_tumor_only.Rd index a5e18d5..f93d2b7 100644 --- a/man/dtw_Wf_tso_ctdna_tumor_only.Rd +++ b/man/dtw_Wf_tso_ctdna_tumor_only.Rd @@ -11,12 +11,11 @@ dtw_Wf_tso_ctdna_tumor_only( outdir_tidy = file.path(outdir, "dracarys_tidy"), format = "rds", max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), dryrun = FALSE ) } \arguments{ -\item{path}{Path to directory with raw workflow results (from GDS, S3, or +\item{path}{Path to directory with raw workflow results (from S3 or local filesystem).} \item{prefix}{The SubjectID_LibraryID prefix of the sample (needed for path lookup).} @@ -29,8 +28,6 @@ local filesystem).} \item{max_files}{Max number of files to list.} -\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} } @@ -48,7 +45,6 @@ p <- file.path( ) prefix <- "PRJ241446_L2401388" outdir <- file.path(sub("gds:/", "~/icav1/g", p)) -token <- Sys.getenv("ICA_ACCESS_TOKEN") d <- dtw_Wf_tso_ctdna_tumor_only( path = p, prefix = prefix, outdir = outdir, format = "tsv", diff --git a/man/gds_file_download_api.Rd b/man/gds_file_download_api.Rd deleted file mode 100644 index b10a438..0000000 --- a/man/gds_file_download_api.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fs_icav1.R -\name{gds_file_download_api} -\alias{gds_file_download_api} -\title{GDS File Download via API} -\usage{ -gds_file_download_api(gds_fileid, out_file, token) -} -\arguments{ -\item{gds_fileid}{GDS file ID.} - -\item{out_file}{Path to output file.} - -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -} -\description{ -GDS File Download via API -} -\examples{ -\dontrun{ -gds_fileid <- "fil.f9aa2ba7af0c4330095d08dadd2e16b0" -out <- tempfile() -token <- Sys.getenv("ICA_ACCESS_TOKEN") -} -} diff --git a/man/gds_file_download_cli.Rd b/man/gds_file_download_cli.Rd deleted file mode 100644 index f5946fb..0000000 --- a/man/gds_file_download_cli.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fs_icav1.R -\name{gds_file_download_cli} -\alias{gds_file_download_cli} -\title{GDS File Download via CLI} -\usage{ -gds_file_download_cli(gds, out, token = Sys.getenv("ICA_ACCESS_TOKEN")) -} -\arguments{ -\item{gds}{Full path to GDS file.} - -\item{out}{Path to output file.} - -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -} -\description{ -GDS File Download via CLI -} diff --git a/man/gds_file_presignedurl.Rd b/man/gds_file_presignedurl.Rd deleted file mode 100644 index 841db4a..0000000 --- a/man/gds_file_presignedurl.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fs_icav1.R -\name{gds_file_presignedurl} -\alias{gds_file_presignedurl} -\title{GDS File Presigned URL} -\usage{ -gds_file_presignedurl(gds_fileid, token) -} -\arguments{ -\item{gds_fileid}{GDS file ID.} - -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -} 
-\value{ -Presigned URL if valid. -} -\description{ -Returns presigned URL of given GDS file. -} diff --git a/man/gds_files_list_fastq.Rd b/man/gds_files_list_fastq.Rd deleted file mode 100644 index c21e76e..0000000 --- a/man/gds_files_list_fastq.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R -\name{gds_files_list_fastq} -\alias{gds_files_list_fastq} -\title{List FASTQs In GDS Directory} -\usage{ -gds_files_list_fastq(gdsdir, token, include_url = FALSE, page_size = 100) -} -\arguments{ -\item{gdsdir}{GDS directory.} - -\item{token}{ICA access token.} - -\item{include_url}{Include presigned URLs to all files within the GDS directory.} - -\item{page_size}{Page size.} -} -\value{ -A tibble with type, bname, size, file_id, path, and presigned URL. -} -\description{ -List FASTQs In GDS Directory -} -\examples{ -\dontrun{ -prim <- "gds://production/primary_data" -run <- "240719_A00130_0323_BHMCYHDSXC/202407205bad380d/BiModal_BM-5L" -gdsdir <- file.path(prim, run) -token <- Sys.getenv("ICA_ACCESS_TOKEN") -include_url <- F -page_size <- 100 -gds_files_list_fastq(gdsdir, token, include_url, page_size) -} -} diff --git a/man/gds_list_files_dir.Rd b/man/gds_list_files_dir.Rd deleted file mode 100644 index d60b546..0000000 --- a/man/gds_list_files_dir.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fs_icav1.R -\name{gds_list_files_dir} -\alias{gds_list_files_dir} -\title{List Files in ICAv1 GDS Directory} -\usage{ -gds_list_files_dir( - gdsdir, - token = Sys.getenv("ICA_ACCESS_TOKEN"), - page_size = NULL, - include_url = FALSE, - no_recurse = TRUE, - page_token = NULL, - recursive = NULL -) -} -\arguments{ -\item{gdsdir}{Full path to GDS directory.} - -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{page_size}{Page size (def: 10 via ICA API).} - -\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).} - -\item{no_recurse}{Do not recurse through the file list i.e. just give the first items -without recursing further down the list using .} - -\item{page_token}{Page token (def: NULL). Used internally for recursion.} - -\item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} -} -\value{ -A tibble with file ID, basename, size, last modified timestamp, -full GDS path, and presigned URL if requested. -} -\description{ -Lists files in a GDS directory. 
-} -\examples{ -\dontrun{ -gdsdir <- file.path( - "gds://production/analysis_data/SBJ00699/umccrise", - "202203277dcf8562/L2200352__L2100146/SBJ00699__MDX220105/coverage" -) -token <- ica_token_validate() -page_size <- 11 -include_url <- F -page_token <- NULL -no_recurse <- TRUE -recursive <- NULL -gds_list_files_dir(gdsdir, token, page_size, include_url, no_recurse, page_token, recursive) -} -} diff --git a/man/gds_list_files_filter_relevant.Rd b/man/gds_list_files_filter_relevant.Rd deleted file mode 100644 index 1e280b4..0000000 --- a/man/gds_list_files_filter_relevant.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fs_icav1.R -\name{gds_list_files_filter_relevant} -\alias{gds_list_files_filter_relevant} -\title{List Relevant Files In ICAv1 GDS Directory} -\usage{ -gds_list_files_filter_relevant( - gdsdir, - pattern = NULL, - regexes = DR_FILE_REGEX, - token = Sys.getenv("ICA_ACCESS_TOKEN"), - page_size = 100, - include_url = FALSE, - no_recurse = TRUE, - page_token = NULL, - recursive = NULL -) -} -\arguments{ -\item{gdsdir}{Full path to GDS directory.} - -\item{pattern}{Pattern to further filter the returned file type tibble.} - -\item{regexes}{Tibble with \code{regex} and \code{fun}ction name (see example).} - -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{page_size}{Page size (def: 10 via ICA API).} - -\item{include_url}{Include presigned URLs to all files within the GDS directory (def: FALSE via ICA API).} - -\item{no_recurse}{Do not recurse through the file list i.e. just give the first items -without recursing further down the list using .} - -\item{page_token}{Page token (def: NULL). Used internally for recursion.} - -\item{recursive}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} -} -\value{ -A tibble with file type, basename, size, last modified timestamp, file_id, full path, -and presigned URL if requested. -} -\description{ -Lists relevant files in a GDS directory. -} -\examples{ -\dontrun{ -regexes <- tibble::tibble(regex = "multiqc_data\\\\.json$", fun = "MultiqcJsonFile") -gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" -gds_list_files_filter_relevant(gdsdir) -} -} diff --git a/man/gds_volumes_list.Rd b/man/gds_volumes_list.Rd deleted file mode 100644 index c2f1e84..0000000 --- a/man/gds_volumes_list.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R -\name{gds_volumes_list} -\alias{gds_volumes_list} -\title{List GDS Volumes} -\usage{ -gds_volumes_list(token, page_size = 10) -} -\arguments{ -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{page_size}{Page size (def: 10).} -} -\value{ -A tibble with vol name and vol id. -} -\description{ -Lists GDS volumes accessible by the provided ICA token. 
-} } diff --git a/man/ica_token_validate.Rd b/man/ica_token_validate.Rd deleted file mode 100644 index dec5324..0000000 --- a/man/ica_token_validate.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ica.R -\name{ica_token_validate} -\alias{ica_token_validate} -\title{Validate ICA access token} -\usage{ -ica_token_validate(token = Sys.getenv("ICA_ACCESS_TOKEN")) -} -\arguments{ -\item{token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} -} -\value{ -Returns the token if valid, or else errors out. -} -\description{ -Validates ICA access token by parsing it and checking its expiration date. -} diff --git a/man/umccr_tidy.Rd b/man/umccr_tidy.Rd index daa531f..cac77e5 100644 --- a/man/umccr_tidy.Rd +++ b/man/umccr_tidy.Rd @@ -11,27 +11,24 @@ umccr_tidy( local_dir = NULL, out_format = "tsv", dryrun = FALSE, - token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, regexes = DR_FILE_REGEX ) } \arguments{ -\item{in_dir}{Directory path to UMCCR workflow results (can be GDS, S3 or local).} +\item{in_dir}{Directory path to UMCCR workflow results (can be S3 or local).} \item{out_dir}{Output directory.} \item{prefix}{Prefix of output file(s).} -\item{local_dir}{If \code{indir} is a GDS or S3 directory, 'recognisable' files -will be first downloaded to this directory (def: /dracarys__sync).} +\item{local_dir}{If \code{indir} is an S3 directory, 'recognisable' files +will be first downloaded to this directory (def: /dracarys_s3_sync).} \item{out_format}{Format of output (tsv, parquet, both) (def: tsv).} \item{dryrun}{Just list the files that will be downloaded (def: FALSE).} -\item{token}{ICA access token (by default uses $ICA_ACCESS_TOKEN env var).} - \item{pattern}{Pattern to further filter the returned file type tibble (see \code{name} column in the \code{DR_FILE_REGEX} tibble).} @@ -50,15 +47,9 @@ in_dir <- file.path( "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596", "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067" ) -in_dir <- paste0( - "gds://production/analysis_data/SBJ01639/tso_ctdna_tumor_only/", - "202204045ad5743c/L2200214/Results/PRJ220425_L2200214" -) -o1 <- sub("^gds://", "", in_dir) o1 <- sub("s3:/", "~/s3", in_dir) out_dir <- o1 out_dir <- file.path(fs::path_home(), "icav1/g", o1) -# in_dir <- file.path(out_dir, "dracarys_gds_sync") prefix <- "SBJ01639" prefix <- "PTC_SSqCMM05pc_L2100067" out_format <- "rds" @@ -66,7 +57,6 @@ umccr_tidy(in_dir = in_dir, out_dir = out_dir, prefix = prefix, out_format = out in_dir <- here::here(glue("nogit/tso/2022-12-13/SBJ02858/dracarys_gds_sync")) out_dir <- file.path(in_dir, "../out") -gds_local_dir <- NULL prefix <- "SBJ02858" dryrun <- F umccr_tidy(in_dir = in_dir, out_dir = out_dir, prefix = prefix, dryrun = F)
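
For reference, a minimal sketch of the S3-only interface this change leaves in place. The S3 paths come from the Wf.Rd example above; the `wname` value and the `regexes` tibble are illustrative assumptions, not taken from the package defaults:

library(dracarys)
# S3 paths from the Wf.Rd example; wname and regexes below are assumed for illustration.
p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3"
p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables"
p <- file.path(p1_s3, p2_s3)
regexes <- tibble::tribble(
  ~regex, ~fun,
  "multiqc_data\\.json$", "MultiqcJsonFile"
)
um <- Wf$new(path = p, wname = "sash", regexes = regexes)
um$list_files(max_files = 10)
um$list_files_filter_relevant(max_files = 100)
# download_files() no longer takes ica_token or recursive arguments:
d <- um$download_files(outdir = tempdir(), max_files = 100, dryrun = TRUE)
d_tidy <- um$tidy_files(d)

Compared with the GDS examples removed above, the only user-visible difference is the dropped ica_token/recursive arguments; listing, downloading, and tidying are otherwise unchanged.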