Merge pull request #94 from umccr/um_hrd_sig

umccrise: support presigned URL parsing
umccr · Sep 5, 2023 · 4436c11 · 4436c11
2 parents 94c1aa1 + 65ee029
commit 4436c11
Show file tree

Hide file tree

Showing 15 changed files with 399 additions and 176 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,6 +30,8 @@ Imports:
     ggplot2,
     ggrepel,
     glue,
+    googledrive,
+    googlesheets4,
     here,
     httr,
     jose,

diff --git a/NAMESPACE b/NAMESPACE
@@ -40,6 +40,7 @@ export(gds_file_presignedurl)
 export(gds_files_list)
 export(gds_files_list_filter_relevant)
 export(gds_volumes_list)
+export(glims_read)
 export(ica_token_validate)
 export(match_regex)
 export(meta_bcl_convert)

diff --git a/R/regex.R b/R/regex.R
@@ -60,8 +60,8 @@ DR_FILE_REGEX <- tibble::tribble(
   "somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile",
   "chord\\.tsv\\.gz$", "UmChordTsvFile",
   "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
-  "snv_2015\\.tsv\\.gz$", "UmSigsSnv2015File",
-  "snv_2020\\.tsv\\.gz$", "UmSigsSnv2020File",
+  "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
+  "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
   "-qc_summary\\.tsv\\.gz$", "UmQcSumFile"
 )
 

diff --git a/R/umccrise.R b/R/umccrise.R
@@ -9,7 +9,7 @@
 #' x <- "/path/to/chord.tsv.gz"
 #' d <- UmChordTsvFile$new(x)
 #' d_parsed <- d$read() # or read(d)
-#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both")
+#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
 #' }
 #' @export
 UmChordTsvFile <- R6::R6Class(
@@ -29,7 +29,7 @@ UmChordTsvFile <- R6::R6Class(
         p_BRCA1 = "d",
         p_BRCA2 = "d"
       )
-      readr::read_tsv(x, col_types = ct)
+      read_tsvgz(x, col_types = ct)
     },
 
     #' @description
@@ -38,12 +38,14 @@ UmChordTsvFile <- R6::R6Class(
     #' @param d Parsed object from `self$read()`.
     #' @param prefix Prefix of output file(s).
     #' @param out_dir Output directory.
-    #' @param out_format Format of output file(s) (one of 'tsv' (def.),
-    #' 'parquet', 'both').
-    write = function(d, out_dir, prefix, out_format = "tsv") {
-      prefix <- file.path(out_dir, prefix)
-      prefix2 <- glue("{prefix}_chord")
-      write_dracarys(obj = d, prefix = prefix2, out_format = out_format)
+    #' @param out_format Format of output file(s).
+    #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
+    write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) {
+      # Prepend the output directory only when the caller supplied one.
+      out_prefix <- if (is.null(out_dir)) prefix else file.path(out_dir, prefix)
+      write_dracarys(
+        obj = d, prefix = out_prefix, out_format = out_format, drid = drid
+      )
     }
   )
 )
@@ -59,7 +61,7 @@ UmChordTsvFile <- R6::R6Class(
 #' x <- "/path/to/hrdetect.tsv.gz"
 #' d <- UmHrdetectTsvFile$new(x)
 #' d_parsed <- d$read() # or read(d)
-#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both")
+#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
 #' }
 #' @export
 UmHrdetectTsvFile <- R6::R6Class(
@@ -76,7 +78,7 @@ UmHrdetectTsvFile <- R6::R6Class(
         .default = "d",
         sample = "c"
       )
-      readr::read_tsv(x, col_types = ct) |>
+      read_tsvgz(x, col_types = ct) |>
         dplyr::select(-c("sample"))
     },
 
@@ -86,12 +88,14 @@ UmHrdetectTsvFile <- R6::R6Class(
     #' @param d Parsed object from `self$read()`.
     #' @param prefix Prefix of output file(s).
     #' @param out_dir Output directory.
-    #' @param out_format Format of output file(s) (one of 'tsv' (def.),
-    #' 'parquet', 'both').
-    write = function(d, out_dir, prefix, out_format = "tsv") {
-      prefix <- file.path(out_dir, prefix)
-      prefix2 <- glue("{prefix}_hrdetect")
-      write_dracarys(obj = d, prefix = prefix2, out_format = out_format)
+    #' @param out_format Format of output file(s).
+    #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
+    write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) {
+      # `out_dir` is optional (default NULL): when omitted, `prefix` is used as is.
+      if (!is.null(out_dir)) {
+        prefix <- file.path(out_dir, prefix)
+      }
+      write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
     }
   )
 )
@@ -107,7 +111,7 @@ UmHrdetectTsvFile <- R6::R6Class(
 #' x <- "/path/to/snv_2015.tsv.gz"
 #' d <- UmSigsSnvFile$new(x)
 #' d_parsed <- d$read() # or read(d)
-#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both")
+#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
 #' }
 #' @export
 UmSigsSnvFile <- R6::R6Class(
@@ -125,10 +129,7 @@ UmSigsSnvFile <- R6::R6Class(
         .default = "d",
         Signature = "c"
       )
-      list(
-        data = readr::read_tsv(x, col_types = ct),
-        version = version
-      )
+      read_tsvgz(x, col_types = ct)
     },
 
     #' @description
@@ -137,13 +138,14 @@ UmSigsSnvFile <- R6::R6Class(
     #' @param d Parsed object from `self$read()`.
     #' @param prefix Prefix of output file(s).
     #' @param out_dir Output directory.
-    #' @param out_format Format of output file(s) (one of 'tsv' (def.),
-    #' 'parquet', 'both').
+    #' @param out_format Format of output file(s).
+    #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
     write = function(d, out_dir, prefix, out_format = "tsv") {
-      prefix <- file.path(out_dir, prefix)
-      version <- d[["version"]]
-      prefix2 <- glue("{prefix}_sigs_snv{version}")
-      write_dracarys(obj = d[["data"]], prefix = prefix2, out_format = out_format)
+      if (!is.null(out_dir)) {
+        prefix <- file.path(out_dir, prefix)
+      }
+      # prefix2 <- glue("{prefix}_sigs_snv")
+      write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
     }
   )
 )
@@ -159,7 +161,7 @@ UmSigsSnvFile <- R6::R6Class(
 #' x <- "/path/to/sample-qc_summary.tsv.gz"
 #' d <- UmQcSumFile$new(x)
 #' d_parsed <- d$read() # or read(d)
-#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both")
+#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
 #' }
 #' @export
 UmQcSumFile <- R6::R6Class(
@@ -172,8 +174,8 @@ UmQcSumFile <- R6::R6Class(
     #' @return A tibble.
     read = function() {
       x <- self$path
-      d <- readr::read_tsv(x, col_types = readr::cols(.default = "c"))
-      d <- d |>
+      d <- read_tsvgz(x, col_types = readr::cols(.default = "c"))
+      d |>
         dplyr::select("variable", "value") |>
         tidyr::pivot_wider(names_from = "variable", values_from = "value") |>
         dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |>
@@ -189,7 +191,9 @@ UmQcSumFile <- R6::R6Class(
           deleted_genes_hmf = as.numeric(.data$DeletedGenes),
           msi_hmf = sub("(.*) \\(.*\\)", "\\1", .data$MSI_mb_tmp),
           tmb_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TMB) |> as.numeric(),
-          tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric()
+          tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric(),
+          hypermutated = ifelse("Hypermutated" %in% d$variable, .data[["Hypermutated"]], NA) |> as.character(),
+          bpi_enabled = ifelse("BPI Enabled" %in% d$variable, .data[["BPI Enabled"]], NA) |> as.character(),
         ) |>
         dplyr::select(
           qc_status_hmf = "QC_Status",
@@ -198,6 +202,7 @@ UmQcSumFile <- R6::R6Class(
           "hrd_chord", "hrd_hrdetect", "contamination_hmf",
           "deleted_genes_hmf", "tmb_hmf", "tml_hmf",
           wgd_hmf = "WGD",
+          hypermutated, bpi_enabled
         )
     },
 
@@ -208,12 +213,14 @@ UmQcSumFile <- R6::R6Class(
     #' @param d Parsed object from `self$read()`.
     #' @param prefix Prefix of output file(s).
     #' @param out_dir Output directory.
-    #' @param out_format Format of output file(s) (one of 'tsv' (def.),
-    #' 'parquet', 'both').
-    write = function(d, out_dir, prefix, out_format = "tsv") {
-      prefix <- file.path(out_dir, prefix)
-      prefix2 <- glue("{prefix}_qc_summary")
-      write_dracarys(obj = d, prefix = prefix2, out_format = out_format)
+    #' @param out_format Format of output file(s).
+    #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
+    write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) {
+      # `out_dir` is optional (default NULL): when omitted, `prefix` is used as is.
+      if (!is.null(out_dir)) {
+        prefix <- file.path(out_dir, prefix)
+      }
+      write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
     }
   )
 )
diff --git a/R/utils.R b/R/utils.R
@@ -127,6 +127,16 @@ empty_tbl <- function(cnames, ctypes = readr::cols(.default = "c")) {
   readr::read_csv("\n", col_names = cnames, col_types = ctypes)
 }
 
+# Read a TSV from a local path or a URL; `...` is passed to readr::read_tsv().
+read_tsvgz <- function(x, ...) {
+  if (!is_url(x)) {
+    return(readr::read_tsv(x, ...))
+  }
+  # URL input: wrap the url() connection in gzcon() before handing it to readr.
+  gz_con <- base::gzcon(base::url(x))
+  readr::read_tsv(gz_con, ...)
+}
+
 read_jsongz_jsonlite <- function(x, ...) {
   if (is_url(x)) {
     # https://github.com/jeroen/jsonlite/issues/414
@@ -149,3 +159,25 @@ read_jsongz_rjsonio <- function(x, ...) {
   }
   RJSONIO::fromJSON(x, ...)
 }
+
+#' Read Google LIMS
+#'
+#' Finds the "Google LIMS" spreadsheet on the "LIMS" shared drive, reads its
+#' `Sheet1` tab with every column as character (treating ".", "" and "-" as
+#' missing), then converts the `Timestamp` column to a date-time.
+#'
+#' @return Tibble with all columns and rows from the Google LIMS spreadsheet.
+#' @export
+glims_read <- function() {
+  sheet_hits <- googledrive::drive_find("^Google LIMS$", shared_drive = "LIMS")
+  raw <- googlesheets4::read_sheet(sheet_hits$id, "Sheet1", na = c(".", "", "-"), col_types = "c")
+  col_spec <- readr::cols(.default = "c", Timestamp = "T")
+  readr::type_convert(raw, col_types = col_spec)
+}
+
+
+#' @noRd
+dummy1 <- function() {
+  # The references below exist to silence the R CMD check note
+  # "Namespaces in Imports field not imported from".
+  argparse::ArgumentParser
+  scales::pretty_breaks
+  here::here
+}
diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
@@ -28,6 +28,8 @@ requirements:
     - r-ggplot2
     - r-ggrepel
     - r-glue
+    - r-googledrive
+    - r-googlesheets4
     - r-here
     - r-httr
     - r-jose
@@ -59,6 +61,8 @@ requirements:
     - r-ggplot2
     - r-ggrepel
     - r-glue
+    - r-googledrive
+    - r-googlesheets4
     - r-here
     - r-httr
     - r-jose

diff --git a/inst/rmd/umccr_portal/portal_summary.Rmd b/inst/rmd/umccr_portal/portal_summary.Rmd
@@ -55,13 +55,6 @@ kable_empty_wf <- function(wf) {
     kableExtra::kable_minimal(full_width = TRUE, position = "left")
 }
 
-glims_read <- function() {
-  lims_key <- googledrive::drive_find("^Google LIMS$", shared_drive = "LIMS")$id
-  lims <- lims_key |>
-    googlesheets4::read_sheet("Sheet1", na = c(".", "", "-"), col_types = "c")
-  lims |> readr::type_convert(col_types = readr::cols(.default = "c", Timestamp = "T"))
-}
-
 dt_view <- function(x, ...) {
   x |>
     dplyr::mutate(across(where(is.character), as.factor)) |>
@@ -133,7 +126,7 @@ wf_order <- c(
 )
 
 lims_rds <- here(glue("nogit/data_portal/lims/{as.Date(date_end)}.rds"))
-# lims_raw <- glims_read()
+# lims_raw <- dracarys::glims_read()
 # saveRDS(lims_raw, file = lims_rds)
 lims_raw <- readr::read_rds(lims_rds)
 pmeta_rds <- here(glue("nogit/data_portal/workflows/{as.Date(date_end)}.rds"))

diff --git a/inst/rmd/umccr_workflows/bcl_convert/single.Rmd b/inst/rmd/umccr_workflows/bcl_convert/single.Rmd
@@ -9,9 +9,9 @@ output:
   rmdformats::material:
     highlight: kate
 params:
-  title: "UMCCR BCL Convert Report"
+  title: "UMCCR bcl_convert Report"
   gds_outdir: "X"
-description: "UMCCR BCL Convert Report"
+description: "UMCCR bcl_convert Report"
 title: "`r params$title`"
 ---