diff --git a/DESCRIPTION b/DESCRIPTION index 0b4308e..adc925f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,6 +30,8 @@ Imports: ggplot2, ggrepel, glue, + googledrive, + googlesheets4, here, httr, jose, diff --git a/NAMESPACE b/NAMESPACE index cd8425f..2291398 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(gds_file_presignedurl) export(gds_files_list) export(gds_files_list_filter_relevant) export(gds_volumes_list) +export(glims_read) export(ica_token_validate) export(match_regex) export(meta_bcl_convert) diff --git a/R/regex.R b/R/regex.R index 49b8ad4..698ffed 100644 --- a/R/regex.R +++ b/R/regex.R @@ -60,8 +60,8 @@ DR_FILE_REGEX <- tibble::tribble( "somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersFile", "chord\\.tsv\\.gz$", "UmChordTsvFile", "hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile", - "snv_2015\\.tsv\\.gz$", "UmSigsSnv2015File", - "snv_2020\\.tsv\\.gz$", "UmSigsSnv2020File", + "snv_2015\\.tsv\\.gz$", "UmSigsSnvFile", + "snv_2020\\.tsv\\.gz$", "UmSigsSnvFile", "-qc_summary\\.tsv\\.gz$", "UmQcSumFile" ) diff --git a/R/umccrise.R b/R/umccrise.R index faeab24..a8800b1 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -9,7 +9,7 @@ #' x <- "/path/to/chord.tsv.gz" #' d <- UmChordTsvFile$new(x) #' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") #' } #' @export UmChordTsvFile <- R6::R6Class( @@ -29,7 +29,7 @@ UmChordTsvFile <- R6::R6Class( p_BRCA1 = "d", p_BRCA2 = "d" ) - readr::read_tsv(x, col_types = ct) + read_tsvgz(x, col_types = ct) }, #' @description @@ -38,12 +38,14 @@ UmChordTsvFile <- R6::R6Class( #' @param d Parsed object from `self$read()`. #' @param prefix Prefix of output file(s). #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - prefix2 <- glue("{prefix}_chord") - write_dracarys(obj = d, prefix = prefix2, out_format = out_format) + #' @param out_format Format of output file(s). + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + # prefix2 <- glue("{prefix}_chord") + write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) } ) ) @@ -59,7 +61,7 @@ UmChordTsvFile <- R6::R6Class( #' x <- "/path/to/hrdetect.tsv.gz" #' d <- UmHrdetectTsvFile$new(x) #' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") #' } #' @export UmHrdetectTsvFile <- R6::R6Class( @@ -76,7 +78,7 @@ UmHrdetectTsvFile <- R6::R6Class( .default = "d", sample = "c" ) - readr::read_tsv(x, col_types = ct) |> + read_tsvgz(x, col_types = ct) |> dplyr::select(-c("sample")) }, @@ -86,12 +88,14 @@ UmHrdetectTsvFile <- R6::R6Class( #' @param d Parsed object from `self$read()`. #' @param prefix Prefix of output file(s). #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - prefix2 <- glue("{prefix}_hrdetect") - write_dracarys(obj = d, prefix = prefix2, out_format = out_format) + #' @param out_format Format of output file(s). + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + write = function(d, out_dir, prefix, out_format = "tsv", drid = NULL) { + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + # prefix2 <- glue("{prefix}_hrdetect") + write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) } ) ) @@ -107,7 +111,7 @@ UmHrdetectTsvFile <- R6::R6Class( #' x <- "/path/to/snv_2015.tsv.gz" #' d <- UmSigsSnvFile$new(x) #' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") #' } #' @export UmSigsSnvFile <- R6::R6Class( @@ -125,10 +129,7 @@ UmSigsSnvFile <- R6::R6Class( .default = "d", Signature = "c" ) - list( - data = readr::read_tsv(x, col_types = ct), - version = version - ) + read_tsvgz(x, col_types = ct) }, #' @description @@ -137,13 +138,14 @@ UmSigsSnvFile <- R6::R6Class( #' @param d Parsed object from `self$read()`. #' @param prefix Prefix of output file(s). #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). + #' @param out_format Format of output file(s). + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - version <- d[["version"]] - prefix2 <- glue("{prefix}_sigs_snv{version}") - write_dracarys(obj = d[["data"]], prefix = prefix2, out_format = out_format) + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + # prefix2 <- glue("{prefix}_sigs_snv") + write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) } ) ) @@ -159,7 +161,7 @@ UmSigsSnvFile <- R6::R6Class( #' x <- "/path/to/snv_2015.tsv.gz" #' d <- UmQcSumFile$new(x) #' d_parsed <- d$read() # or read(d) -#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") #' } #' @export UmQcSumFile <- R6::R6Class( @@ -172,8 +174,8 @@ UmQcSumFile <- R6::R6Class( #' @return A tibble. read = function() { x <- self$path - d <- readr::read_tsv(x, col_types = readr::cols(.default = "c")) - d <- d |> + d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) + d |> dplyr::select("variable", "value") |> tidyr::pivot_wider(names_from = "variable", values_from = "value") |> dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> @@ -189,7 +191,9 @@ UmQcSumFile <- R6::R6Class( deleted_genes_hmf = as.numeric(.data$DeletedGenes), msi_hmf = sub("(.*) \\(.*\\)", "\\1", .data$MSI_mb_tmp), tmb_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TMB) |> as.numeric(), - tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric() + tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric(), + hypermutated = ifelse("Hypermutated" %in% d$variable, .data[["Hypermutated"]], NA) |> as.character(), + bpi_enabled = ifelse("BPI Enabled" %in% d$variable, .data[["BPI Enabled"]], NA) |> as.character(), ) |> dplyr::select( qc_status_hmf = "QC_Status", @@ -198,6 +202,7 @@ UmQcSumFile <- R6::R6Class( "hrd_chord", "hrd_hrdetect", "contamination_hmf", "deleted_genes_hmf", "tmb_hmf", "tml_hmf", wgd_hmf = "WGD", + hypermutated, bpi_enabled ) }, @@ -208,12 +213,14 @@ UmQcSumFile <- R6::R6Class( #' @param d Parsed object from `self$read()`. #' @param prefix Prefix of output file(s). #' @param out_dir Output directory. - #' @param out_format Format of output file(s) (one of 'tsv' (def.), - #' 'parquet', 'both'). - write = function(d, out_dir, prefix, out_format = "tsv") { - prefix <- file.path(out_dir, prefix) - prefix2 <- glue("{prefix}_qc_summary") - write_dracarys(obj = d, prefix = prefix2, out_format = out_format) + #' @param out_format Format of output file(s). + #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + write = function(d, out_dir, prefix, out_format = "tsv", drid = NULL) { + if (!is.null(out_dir)) { + prefix <- file.path(out_dir, prefix) + } + # prefix2 <- glue("{prefix}_qc_summary") + write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) } ) ) diff --git a/R/utils.R b/R/utils.R index a938ab3..8c4367a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -127,6 +127,16 @@ empty_tbl <- function(cnames, ctypes = readr::cols(.default = "c")) { readr::read_csv("\n", col_names = cnames, col_types = ctypes) } +read_tsvgz <- function(x, ...) { + if (is_url(x)) { + res <- base::url(x) |> + base::gzcon() |> + readr::read_tsv(...) + return(res) + } + readr::read_tsv(x, ...) +} + read_jsongz_jsonlite <- function(x, ...) { if (is_url(x)) { # https://github.com/jeroen/jsonlite/issues/414 @@ -149,3 +159,25 @@ read_jsongz_rjsonio <- function(x, ...) { } RJSONIO::fromJSON(x, ...) } + +#' Read Google LIMS +#' +#' Reads UMCCR's Google LIMS spreadsheet. +#' +#' @return Tibble with all columns and rows from the Google LIMS spreadsheet. +#' @export +glims_read <- function() { + lims_key <- googledrive::drive_find("^Google LIMS$", shared_drive = "LIMS")$id + lims <- lims_key |> + googlesheets4::read_sheet("Sheet1", na = c(".", "", "-"), col_types = "c") + lims |> readr::type_convert(col_types = readr::cols(.default = "c", Timestamp = "T")) +} + + +#' @noRd +dummy1 <- function() { + # Solves R CMD check: Namespaces in Imports field not imported from + scales::pretty_breaks + argparse::ArgumentParser + here::here +} diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml index 9be6dba..fc18a27 100644 --- a/conda/recipe/meta.yaml +++ b/conda/recipe/meta.yaml @@ -28,6 +28,8 @@ requirements: - r-ggplot2 - r-ggrepel - r-glue + - r-googledrive + - r-googlesheets4 - r-here - r-httr - r-jose @@ -59,6 +61,8 @@ requirements: - r-ggplot2 - r-ggrepel - r-glue + - r-googledrive + - r-googlesheets4 - r-here - r-httr - r-jose diff --git a/inst/rmd/umccr_portal/portal_summary.Rmd b/inst/rmd/umccr_portal/portal_summary.Rmd index bfefdeb..42cbb96 100644 --- a/inst/rmd/umccr_portal/portal_summary.Rmd +++ b/inst/rmd/umccr_portal/portal_summary.Rmd @@ -55,13 +55,6 @@ kable_empty_wf <- function(wf) { kableExtra::kable_minimal(full_width = TRUE, position = "left") } -glims_read <- function() { - lims_key <- googledrive::drive_find("^Google LIMS$", shared_drive = "LIMS")$id - lims <- lims_key |> - googlesheets4::read_sheet("Sheet1", na = c(".", "", "-"), col_types = "c") - lims |> readr::type_convert(col_types = readr::cols(.default = "c", Timestamp = "T")) -} - dt_view <- function(x, ...) { x |> dplyr::mutate(across(where(is.character), as.factor)) |> @@ -133,7 +126,7 @@ wf_order <- c( ) lims_rds <- here(glue("nogit/data_portal/lims/{as.Date(date_end)}.rds")) -# lims_raw <- glims_read() +# lims_raw <- dracarys::glims_read() # saveRDS(lims_raw, file = lims_rds) lims_raw <- readr::read_rds(lims_rds) pmeta_rds <- here(glue("nogit/data_portal/workflows/{as.Date(date_end)}.rds")) diff --git a/inst/rmd/umccr_workflows/bcl_convert/single.Rmd b/inst/rmd/umccr_workflows/bcl_convert/single.Rmd index 14e7176..fb7ad37 100644 --- a/inst/rmd/umccr_workflows/bcl_convert/single.Rmd +++ b/inst/rmd/umccr_workflows/bcl_convert/single.Rmd @@ -9,9 +9,9 @@ output: rmdformats::material: highlight: kate params: - title: "UMCCR BCL Convert Report" + title: "UMCCR bcl_convert Report" gds_outdir: "X" -description: "UMCCR BCL Convert Report" +description: "UMCCR bcl_convert Report" title: "`r params$title`" --- diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index fa4ec58..4689ad5 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -5,7 +5,6 @@ output: html_document: toc: true theme: cosmo - css: style.css code_download: true rmdformats::material: highlight: kate @@ -22,109 +21,269 @@ knitr::opts_chunk$set( ) ``` +```{css} +.navbar-brand { + padding: 5px 15px; +} + +.dropdown:hover > .dropdown-menu { + display: block; +} +.dropdown > .dropdown-toggle:active { + pointer-events: none; +} + +.main-container { + max-width: 1900px !important; + margin-left: auto; + margin-right: auto; +} + +.navbar-default { + color: #750075; + background-color: #750075; +} +``` + ```{r load_pkgs} { - require(tidyverse) - # require(dracarys) # only needed for session info + require(dplyr) + require(readr, include.only = c("read_rds")) + require(purrr, include.only = c("map")) + require(tidyr, include.only = c("unnest", "unnest_wider")) + require(dracarys) require(glue, include.only = "glue") require(here, include.only = "here") require(knitr, include.only = "kable") require(DT, include.only = "datatable") require(fs, include.only = c("dir_ls")) + require(ggplot2, include.only = c("ggplot", "aes")) + require(lubridate, include.only = c("as_datetime")) + require(plotly, include.only = c("ggplotly")) } ``` ```{r data_setup, eval=FALSE} options(width = 150) -res <- here("nogit/umccrise/2023-01-18") |> - dir_ls(recurse = TRUE, type = "file") |> - as_tibble_col(column_name = "path") |> - filter(!grepl("dracarys_gds_sync", path)) |> # get rid of synced data +token <- dracarys::ica_token_validate(Sys.getenv("ICA_ACCESS_TOKEN_PRO")) +pmeta <- here("nogit/umccrise/rds/portal_meta/2023-09-04_pmeta_final.rds") |> + readr::read_rds() +gds_map <- pmeta |> + rowwise() |> + mutate( + gds_contents = list(dracarys::gds_files_list_filter_relevant( + gdsdir = .data$gds_outdir_umccrise, token = token, include = "PresignedUrl" + )) + ) |> + ungroup() |> + tidyr::unnest("gds_contents") |> + select( + "portal_run_id", "SubjectID", "LibraryID_tumor", "LibraryID_normal", + "start", "end", "type", "bname", "size", "file_id", "path", "presigned_url" + ) |> + filter(type != "MultiqcFile") + +saveRDS(gds_map, here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) + +parse_files <- function(gds_map, row_slice, rds_out) { + start_time <- Sys.time() + dat1 <- gds_map |> + dplyr::slice(row_slice) |> + dplyr::rowwise() |> + dplyr::mutate( + gen = list(dracarys::dr_func_eval(.data$type)), + obj = list(.data$gen$new(.data$presigned_url)), + objp = list(.data$obj$read()) + ) |> + dplyr::ungroup() + end_time <- Sys.time() + total_time <- end_time - start_time + print(total_time) + readr::write_rds(x = dat1, file = rds_out) +} + +gds_map <- readr::read_rds(here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) +rds_path_out <- here("nogit/umccrise/rds/results") +x0 <- parse_files(gds_map, 1:10, file.path(rds_path_out, "x0.rds")) +x1 <- parse_files(gds_map, 1:500, file.path(rds_path_out, "x1.rds")) +x2 <- parse_files(gds_map, 501:1000, file.path(rds_path_out, "x2.rds")) +x3 <- parse_files(gds_map, 1001:1500, file.path(rds_path_out, "x3.rds")) +x4 <- parse_files(gds_map, 1501:2000, file.path(rds_path_out, "x4.rds")) +x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds")) +``` + +```{r data_load} +lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> + readr::read_rds() +dat1 <- fs::dir_ls(here("nogit/umccrise/rds/results")) |> + purrr::map(readr::read_rds) |> + dplyr::bind_rows() + +o <- dat1 |> mutate( type = case_when( - grepl("chord", path) ~ "chord", - grepl("hrdetect", path) ~ "hrdetect", - grepl("snv2015", path) ~ "sigs2015", - grepl("snv2020", path) ~ "sigs2020", - grepl("qc_summary", path) ~ "qcsum", - TRUE ~ "UNKNOWN" + grepl("snv_2015.tsv.gz", bname) ~ "UmSigsSnvFile2015", + grepl("snv_2020.tsv.gz", bname) ~ "UmSigsSnvFile2020", + .default = .data$type ), - sbj2 = basename(dirname(path)), - sbj = sub("(.*)_.*", "\\1", sbj2) + date_analysed_aest = as.character(.data$end), ) |> - # slice(1:10) |> - rowwise() |> - mutate(dat = list(read_tsv(path, show_col_types = FALSE))) |> - select(sbj, sbj2, type, dat) |> - pivot_wider(names_from = type, values_from = dat) -qc <- res |> - select("sbj", "sbj2", "qcsum") |> - unnest_wider(qcsum) -hrd <- res |> - hoist(chord, hrd_chord1 = "p_hrd") |> - hoist(hrdetect, hrd_hrdetect1 = "Probability") |> - select(sbj, sbj2, hrd_chord1, hrd_hrdetect1) -sigs <- res |> - select(sbj, sbj2, sigs2015, sigs2020) |> - pivot_longer(c(sigs2015, sigs2020), names_to = "version", values_to = "dat") |> - unnest(dat) -pcgr <- read_rds(here("nogit/pcgr/rds/res_2023-01-17.rds")) |> - rename(sbj2 = "sbj", tmb_pcgr = "tmb_estimate", tmb_n_pcgr = "n_tmb") -all <- left_join(pcgr, hrd, by = "sbj2") |> - left_join(qc, by = c("sbj2", "sbj")) |> - select(-c("hrd_chord1", "hrd_hrdetect1")) |> - select(sbj, sbj2, everything()) - -# ids <- read_csv(here("nogit/data_portal/labmetadata_2023-01-19.csv"), col_types = cols(.default = "c")) |> -# # select(subject_id, phenotype, assay, library_id, sample_name, sample_id, external_sample_id, external_subject_id) |> -# filter(subject_id %in% all$sbj, -# type == "WGS") - -# add libids based on sbj2 -sbj2_to_libids <- read_rds(here("nogit/umccrise/rds/x_2023-01-20.rds")) |> - select(sbj2, libids, analysis_date = date, gds_indir) - -all <- all |> - left_join(sbj2_to_libids, by = "sbj2") |> - select(sbj, sbj2, libids, analysis_date, gds_indir, - qc_status_hmf, - sex_hmf, - purity_hmf, - ploidy_hmf, - msi_fracIndels_pcgr = fracIndels, - msi_predicted_class_pcgr = predicted_class, - msi_hmf, - msi_mb_hmf, - hrd_chord, - hrd_hrdetect, - tmb_pcgr, - tmb_n_pcgr, - tmb_hmf, - tml_hmf, - contamination_hmf, - deleted_genes_hmf, - wgd_hmf, - everything() + select( + date_analysed_aest, + SubjectID, + LibraryID_tumor, + LibraryID_normal, + type, + objp, + portal_run_id ) -sigs <- sigs |> - left_join(sbj2_to_libids, by = "sbj2") |> - select(sbj, sbj2, libids, analysis_date, gds_indir, everything()) +lims <- lims_raw |> + dplyr::filter(LibraryID %in% c(o$LibraryID_tumor)) |> + dplyr::select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> + dplyr::distinct() + + +o2 <- o |> + dplyr::left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> + dplyr::mutate( + url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), + sbj_url = glue("{.data$SubjectID}"), + url = glue("{.data$url}"), + portal_run_id = glue("dr.{portal_run_id}") + ) |> + dplyr::rename(portal_url = url) -# limsrow <- read_csv(here("nogit/data_portal/limsrow_2023-01-19.csv"), col_types = cols(.default = "c")) +dt_view <- function(x, scroll_y = 1000, ...) { + x |> + dplyr::mutate(across(where(is.character), as.factor)) |> + DT::datatable( + filter = list(position = "top", clear = FALSE, plain = TRUE), + class = "cell-border display compact", + rownames = TRUE, + extensions = c("Scroller", "Buttons", "KeyTable", "FixedColumns"), + options = list( + scroller = TRUE, scrollY = scroll_y, scrollX = TRUE, + autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "copy"), dom = "Blfrtip", + fixedColumns = TRUE + ), + escape = FALSE, + ... + ) +} -readr::write_tsv(all, here("nogit/umccrise/res_2023-01-25.tsv.gz")) -readr::write_tsv(sigs, here("nogit/umccrise/sigs_2023-01-20.tsv.gz")) +qcsum <- o2 |> + filter(type == "UmQcSumFile") |> + unnest_wider(objp) +hrd_chord <- o2 |> + filter(type == "UmChordTsvFile") |> + unnest_wider(objp) |> + select(portal_run_id, + # chord_p_hrd = p_hrd, + chord_hr_status = hr_status, + chord_hrd_type = hrd_type, + chord_p_BRCA1 = p_BRCA1, + chord_p_BRCA2 = p_BRCA2 + ) +# don't need hrdetect details +# hrd_hrdetect <- o2 |> +# filter(type == "UmHrdetectTsvFile") |> +# unnest_wider(objp) |> +# select(portal_run_id, hrdetect_prob = Probability) +sigs_2015 <- o2 |> + filter(type == "UmSigsSnvFile2015") |> + unnest_wider(objp) |> + select(-c(type)) +sigs_2020 <- o2 |> + filter(type == "UmSigsSnvFile2020") |> + unnest_wider(objp) |> + select(-c(type)) ``` +## umccrise Results + +```{r final_tab} +cols_select <- c( + "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", + "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", + "hrd_chord", "hrd_hrdetect", + "chord_hr_status", "chord_hrd_type", "chord_p_BRCA1", "chord_p_BRCA2", + "qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf", + "msi_mb_hmf", "contamination_hmf", + "deleted_genes_hmf", "tmb_hmf", "tml_hmf", "wgd_hmf", "hypermutated", + "bpi_enabled", "portal_run_id", "portal_url" +) +d <- qcsum |> + dplyr::left_join(hrd_chord, by = "portal_run_id") |> + dplyr::select(dplyr::all_of(cols_select), dplyr::everything(), -c("type")) +dt_view(d, caption = "umccrise Results Summary") +``` + +### HRD Results + +```{r hrd_plot, fig.width=15, fig.height = 10} +p <- d |> + dplyr::mutate( + sbj = glue("{SubjectID}_{LibraryID_tumor}"), + date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") + ) |> + dplyr::select( + date, + sbj, + chord = hrd_chord, hrdetect = hrd_hrdetect, + ) |> + tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") +p1 <- p |> + ggplot(aes(x = date, y = probability, label = sbj)) + + ggplot2::geom_point(aes(colour = method)) + + ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) + + ggplot2::theme_bw() + + ggplot2::ggtitle("CHORD vs. HRDetect per SubjectID") + +plotly::ggplotly(p1) +``` + + +## Metadata Summary {.tabset .tabset-pills} + +### ProjectOwner + +```{r ProjectOwner} +count(d, ProjectOwner) |> dt_view(scroll_y = 400) +``` + +### ProjectName + +```{r ProjectName} +count(d, ProjectName) |> dt_view(scroll_y = 400) +``` + +### Type + +```{r Type} +count(d, Type) |> dt_view(scroll_y = 400) +``` + +### Workflow + +```{r Workflow} +count(d, Workflow) |> dt_view(scroll_y = 400) +``` + + + --- -## Session Info +
+Session Info -```{r session_info, echo=FALSE, eval=FALSE} +```{r session_info, echo=FALSE} pkgs_of_interest <- c("base", "dracarys", "tidyverse", "tidyselect", "ggplot2", "dplyr", "tidyr", "readr") # need dracarys for session info si <- dracarys::session_info_kable(pkgs_of_interest) si$si_pkg si$si_pl ``` + +
diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R index c317ab4..8c40a6d 100644 --- a/inst/scripts/umccrise_run.R +++ b/inst/scripts/umccrise_run.R @@ -1,45 +1,46 @@ require(dracarys) require(here) +require(glue) require(dplyr) require(readr) -# SQL -# select * from data_portal.data_portal_gdsfile where regexp_like(path, 'cancer_report_tables/.*qc_summary.tsv.gz') order by time_created desc; +# read last 1000 umccrise runs from portal +# 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded +date1 <- "2023-09-04" +pmeta_raw_rds <- here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_raw.rds")) +# pmeta_raw <- dracarys::portal_meta_read(rows = 1000, params = "&type_name=umccrise") +# saveRDS(pmeta_raw, file = pmeta_raw_rds) +pmeta <- readr::read_rds(pmeta_raw_rds) |> + dracarys::meta_umccrise(status = "Succeeded") +lims_raw_rds <- here(glue("nogit/umccrise/rds/lims/{date1}_lims_raw.rds")) +# lims_raw <- dracarys::glims_read() +# saveRDS(lims_raw, file = lims_raw_rds) +lims_raw <- readr::read_rds(lims_raw_rds) +lims <- lims_raw |> + filter(Type == "WGS") |> + filter(LibraryID %in% c(pmeta$LibraryID_normal, pmeta$LibraryID_tumor)) +table(pmeta$LibraryID_tumor %in% lims$LibraryID) +table(pmeta$LibraryID_normal %in% lims$LibraryID) -d <- here("nogit/umccrise/sql/7c348d27-adcb-427f-a8e8-549bcc0c8490_2023-01-18.csv") |> - read_csv(col_names = TRUE) +# The final results sit under gds_outdir_umccrise/__/ +# We need to get the SampleID_tumor for runs before 2023-04-07. We can do that +# by using the LibraryID_tumor to match up with the glims. +missing_tumor_sampleid <- pmeta |> + filter(end < "2023-04-07") |> + pull(LibraryID_tumor) +table(missing_tumor_sampleid %in% lims$LibraryID) +libid2sampid <- lims |> + filter(LibraryID %in% missing_tumor_sampleid) |> + select(LibraryID_tumor = LibraryID, SampleID_tumor = SampleID) -x <- d |> - mutate( - sbj = sub("/analysis_data/(SBJ.*?)/.*", "\\1", path), - dir = dirname(path), - gds_indir = glue("gds://{volume_name}{dir}/"), - libids = dirname(dirname(dirname(path))) |> basename() - ) |> - group_by(sbj) |> - # TODO: check if results already exist for same sbj - mutate( - n_samp = n(), - sbj2 = if_else(n_samp > 1, glue("{sbj}_{dplyr::row_number()}"), sbj) - ) |> - ungroup() |> - arrange(sbj2) |> - mutate( - outdir = here(glue("nogit/umccrise/2023-01-18/{sbj2}")), - local_indir = file.path(outdir, "dracarys_gds_sync") - ) |> - select(sbj, sbj2, libids, gds_indir, outdir, local_indir, date = time_created) +d <- pmeta |> + left_join(libid2sampid, by = "LibraryID_tumor") |> + mutate(SampleID_tumor = if_else(is.na(SampleID_tumor.x), SampleID_tumor.y, SampleID_tumor.x)) |> + select(-c(SampleID_tumor.x, SampleID_tumor.y)) |> + relocate(SampleID_tumor, .before = SampleID_normal) |> + mutate(gds_outdir_umccrise = glue("{.data$gds_outdir_umccrise}/{.data$SubjectID}__{.data$SampleID_tumor}")) +d -write_rds(x, here("nogit/umccrise/rds/x_2023-01-20.rds")) - -token <- Sys.getenv("ICA_ACCESS_TOKEN_PROD") -dryrun <- F -for (i in 1:276) { - print(i) - # print(x$gds_indir[i]) - # print(x$local_indir[i]) - umccr_tidy(in_dir = x$gds_indir[i], out_dir = x$outdir[i], prefix = x$sbj2[i], dryrun = dryrun, token = token, pattern = "um__qcsum") - # umccr_tidy(in_dir = x$gds_indir[i], out_dir = x$outdir[i], prefix = x$sbj2[i], dryrun = dryrun, token = token) - # umccr_tidy(in_dir = x$local_indir[i], out_dir = x$outdir[i], prefix = x$sbj2[i], dryrun = FALSE, token = token) -} +# final portal meta for umccrise runs +saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) diff --git a/man/UmChordTsvFile.Rd b/man/UmChordTsvFile.Rd index 7270398..358e6db 100644 --- a/man/UmChordTsvFile.Rd +++ b/man/UmChordTsvFile.Rd @@ -12,7 +12,7 @@ Contains methods for reading and displaying contents of the x <- "/path/to/chord.tsv.gz" d <- UmChordTsvFile$new(x) d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") } } \section{Super class}{ @@ -55,7 +55,13 @@ A tibble. \subsection{Method \code{write()}}{ Writes a tidy version of the \code{chord.tsv.gz} file output from umccrise. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmChordTsvFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{UmChordTsvFile$write( + d, + out_dir = NULL, + prefix, + out_format = "tsv", + drid = NULL +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -67,8 +73,9 @@ Writes a tidy version of the \code{chord.tsv.gz} file output from umccrise. \item{\code{prefix}}{Prefix of output file(s).} -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} +\item{\code{out_format}}{Format of output file(s).} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} } \if{html}{\out{}} } diff --git a/man/UmHrdetectTsvFile.Rd b/man/UmHrdetectTsvFile.Rd index 9ca51ce..e2f7777 100644 --- a/man/UmHrdetectTsvFile.Rd +++ b/man/UmHrdetectTsvFile.Rd @@ -12,7 +12,7 @@ Contains methods for reading and displaying contents of the x <- "/path/to/hrdetect.tsv.gz" d <- UmHrdetectTsvFile$new(x) d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") } } \section{Super class}{ @@ -55,7 +55,7 @@ A tibble. \subsection{Method \code{write()}}{ Writes a tidy version of the \code{hrdetect.tsv.gz} file output from umccrise. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmHrdetectTsvFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{UmHrdetectTsvFile$write(d, out_dir, prefix, out_format = "tsv", drid = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -67,8 +67,9 @@ Writes a tidy version of the \code{hrdetect.tsv.gz} file output from umccrise. \item{\code{prefix}}{Prefix of output file(s).} -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} +\item{\code{out_format}}{Format of output file(s).} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} } \if{html}{\out{}} } diff --git a/man/UmQcSumFile.Rd b/man/UmQcSumFile.Rd index 5f81779..eaf8b83 100644 --- a/man/UmQcSumFile.Rd +++ b/man/UmQcSumFile.Rd @@ -12,7 +12,7 @@ Contains methods for reading and displaying contents of the x <- "/path/to/snv_2015.tsv.gz" d <- UmQcSumFile$new(x) d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") } } \section{Super class}{ @@ -56,7 +56,7 @@ A tibble. Writes a tidy version of the \code{qc_summary.tsv.gz} QC summary file output from umccrise. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{UmQcSumFile$write(d, out_dir, prefix, out_format = "tsv")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{UmQcSumFile$write(d, out_dir, prefix, out_format = "tsv", drid = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -68,8 +68,9 @@ from umccrise. \item{\code{prefix}}{Prefix of output file(s).} -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} +\item{\code{out_format}}{Format of output file(s).} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} } \if{html}{\out{}} } diff --git a/man/UmSigsSnvFile.Rd b/man/UmSigsSnvFile.Rd index 7a21553..78480fd 100644 --- a/man/UmSigsSnvFile.Rd +++ b/man/UmSigsSnvFile.Rd @@ -12,7 +12,7 @@ Contains methods for reading and displaying contents of the x <- "/path/to/snv_2015.tsv.gz" d <- UmSigsSnvFile$new(x) d_parsed <- d$read() # or read(d) -d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "both") +d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv") } } \section{Super class}{ @@ -67,8 +67,9 @@ Writes a tidy version of the \code{snv_20XX.tsv.gz} signature file output from u \item{\code{prefix}}{Prefix of output file(s).} -\item{\code{out_format}}{Format of output file(s) (one of 'tsv' (def.), -'parquet', 'both').} +\item{\code{out_format}}{Format of output file(s).} + +\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} } \if{html}{\out{}} } diff --git a/man/glims_read.Rd b/man/glims_read.Rd new file mode 100644 index 0000000..502df9a --- /dev/null +++ b/man/glims_read.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{glims_read} +\alias{glims_read} +\title{Read Google LIMS} +\usage{ +glims_read() +} +\value{ +Tibble with all columns and rows from the Google LIMS spreadsheet. +} +\description{ +Reads UMCCR's Google LIMS spreadsheet. +}