From 88100ebb018f84f6449f7d750e74eb38990d7d66 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 6 Sep 2023 17:03:33 +1000 Subject: [PATCH 01/10] sig breakdown --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 89 +++++++++++++++++---- 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 4689ad5..98091c2 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -64,7 +64,7 @@ knitr::opts_chunk$set( ``` ```{r data_setup, eval=FALSE} -options(width = 150) +# options(width = 150) token <- dracarys::ica_token_validate(Sys.getenv("ICA_ACCESS_TOKEN_PRO")) pmeta <- here("nogit/umccrise/rds/portal_meta/2023-09-04_pmeta_final.rds") |> readr::read_rds() @@ -194,17 +194,19 @@ hrd_chord <- o2 |> sigs_2015 <- o2 |> filter(type == "UmSigsSnvFile2015") |> unnest_wider(objp) |> - select(-c(type)) + select(-c(type)) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) sigs_2020 <- o2 |> filter(type == "UmSigsSnvFile2020") |> unnest_wider(objp) |> - select(-c(type)) + select(-c(type)) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) ``` ## umccrise Results ```{r final_tab} -cols_select <- c( +cols_select1 <- c( "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", "hrd_chord", "hrd_hrdetect", @@ -214,16 +216,36 @@ cols_select <- c( "deleted_genes_hmf", "tmb_hmf", "tml_hmf", "wgd_hmf", "hypermutated", "bpi_enabled", "portal_run_id", "portal_url" ) -d <- qcsum |> +cols_select2 <- c( + "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", + "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", + "Sig_group", "Rank", "Signature", "Contribution", "RelFreq", + "portal_run_id", "portal_url" +) +d1 <- qcsum |> dplyr::left_join(hrd_chord, by = "portal_run_id") |> - dplyr::select(dplyr::all_of(cols_select), dplyr::everything(), -c("type")) -dt_view(d, caption = "umccrise Results Summary") + dplyr::select(dplyr::all_of(cols_select1), dplyr::everything(), -c("type")) +dt_view(d1, caption = "umccrise Results Summary") + +d2 <- dplyr::bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> + dplyr::select(dplyr::all_of(cols_select2), dplyr::everything()) +dt_view(d2, caption = "Signature contributions (2015 and 2020)") + +d2 |> + group_by(date_analysed_aest, SubjectID, LibraryID_tumor, Sig_group) |> + mutate( + Signature_all = paste(glue("{Signature} ({RelFreq})"), collapse = ", ") + ) |> + ungroup() |> + select(date_analysed_aest:Sig_group, Signature_all) |> + distinct() |> + arrange(desc(date_analysed_aest), SubjectID) ``` ### HRD Results ```{r hrd_plot, fig.width=15, fig.height = 10} -p <- d |> +d1p <- d1 |> dplyr::mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") @@ -234,8 +256,8 @@ p <- d |> chord = hrd_chord, hrdetect = hrd_hrdetect, ) |> tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") -p1 <- p |> - ggplot(aes(x = date, y = probability, label = sbj)) + +p1 <- d1p |> + ggplot2::ggplot(aes(x = date, y = probability, label = sbj)) + ggplot2::geom_point(aes(colour = method)) + ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) + ggplot2::theme_bw() + @@ -244,31 +266,70 @@ p1 <- p |> plotly::ggplotly(p1) ``` +### Signature 
Results + +**TODO** + +```{r fig.width = 15, fig.height=50, eval=FALSE} +sig_order2015 <- paste0("Sig", 1:30) +sig_order2020 <- paste0( + "SBS", + c( + 1:6, + paste0(7, c("a", "b", "c", "d")), + 8:9, + paste0(10, c("a", "b", "c", "d")), + 11:16, + paste0(17, c("a", "b")), + 18:60, + 84:94 + ) +) + +d2p <- d2 |> + # dplyr::filter(Rank %in% c(1:5)) |> + dplyr::mutate( + sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")), + date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") + ) |> + dplyr::select( + date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq + ) |> + dplyr::mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) +p2 <- + d2p |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::facet_wrap(~Sig_group, ncol = 1) + +plotly::ggplotly(p2) +``` + ## Metadata Summary {.tabset .tabset-pills} ### ProjectOwner ```{r ProjectOwner} -count(d, ProjectOwner) |> dt_view(scroll_y = 400) +count(d1, ProjectOwner) |> dt_view(scroll_y = 400) ``` ### ProjectName ```{r ProjectName} -count(d, ProjectName) |> dt_view(scroll_y = 400) +count(d1, ProjectName) |> dt_view(scroll_y = 400) ``` ### Type ```{r Type} -count(d, Type) |> dt_view(scroll_y = 400) +count(d1, Type) |> dt_view(scroll_y = 400) ``` ### Workflow ```{r Workflow} -count(d, Workflow) |> dt_view(scroll_y = 400) +count(d1, Workflow) |> dt_view(scroll_y = 400) ``` From c351abddeac4821219681b2241278a5b87ea7fa2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Sep 2023 11:18:55 +1000 Subject: [PATCH 02/10] sigs: keep top 2 ranked --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 63 +++++++++++++-------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 98091c2..581cab1 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -47,10 +47,10 @@ knitr::opts_chunk$set( ```{r load_pkgs} { - require(dplyr) + require(dplyr) # import all dplyr funcs require(readr, include.only = c("read_rds")) require(purrr, include.only = c("map")) - require(tidyr, include.only = c("unnest", "unnest_wider")) + require(tidyr, include.only = c("unnest")) require(dracarys) require(glue, include.only = "glue") require(here, include.only = "here") @@ -88,14 +88,14 @@ saveRDS(gds_map, here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) parse_files <- function(gds_map, row_slice, rds_out) { start_time <- Sys.time() dat1 <- gds_map |> - dplyr::slice(row_slice) |> - dplyr::rowwise() |> - dplyr::mutate( + slice(row_slice) |> + rowwise() |> + mutate( gen = list(dracarys::dr_func_eval(.data$type)), obj = list(.data$gen$new(.data$presigned_url)), objp = list(.data$obj$read()) ) |> - dplyr::ungroup() + ungroup() end_time <- Sys.time() total_time <- end_time - start_time print(total_time) @@ -117,7 +117,7 @@ lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> readr::read_rds() dat1 <- fs::dir_ls(here("nogit/umccrise/rds/results")) |> purrr::map(readr::read_rds) |> - dplyr::bind_rows() + bind_rows() o <- dat1 |> mutate( @@ -139,24 +139,24 @@ o <- dat1 |> ) lims <- lims_raw |> - dplyr::filter(LibraryID %in% c(o$LibraryID_tumor)) |> - dplyr::select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> - dplyr::distinct() + filter(LibraryID %in% c(o$LibraryID_tumor)) |> + select(SubjectID, LibraryID, 
ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> + distinct() o2 <- o |> - dplyr::left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> - dplyr::mutate( + left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> + mutate( url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), sbj_url = glue("{.data$SubjectID}"), url = glue("{.data$url}"), portal_run_id = glue("dr.{portal_run_id}") ) |> - dplyr::rename(portal_url = url) + rename(portal_url = url) dt_view <- function(x, scroll_y = 1000, ...) { x |> - dplyr::mutate(across(where(is.character), as.factor)) |> + mutate(across(where(is.character), as.factor)) |> DT::datatable( filter = list(position = "top", clear = FALSE, plain = TRUE), class = "cell-border display compact", @@ -223,12 +223,27 @@ cols_select2 <- c( "portal_run_id", "portal_url" ) d1 <- qcsum |> - dplyr::left_join(hrd_chord, by = "portal_run_id") |> - dplyr::select(dplyr::all_of(cols_select1), dplyr::everything(), -c("type")) + left_join(hrd_chord, by = "portal_run_id") |> + select(all_of(cols_select1), everything(), -c("type")) dt_view(d1, caption = "umccrise Results Summary") -d2 <- dplyr::bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> - dplyr::select(dplyr::all_of(cols_select2), dplyr::everything()) +d2 <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> + select(all_of(cols_select2), everything()) + +d2_filt <- d2 |> + filter( + Sig_group == "s2015" + ) |> + group_by(portal_run_id) |> + mutate(tot_sig_vars = sum(Contribution)) |> + arrange(Rank) |> + slice_head(n = 2) |> + mutate(rn = row_number()) |> + ungroup() |> + mutate( + sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") + ) + dt_view(d2, caption = "Signature contributions (2015 and 2020)") d2 |> @@ -246,11 +261,11 @@ d2 |> ```{r hrd_plot, fig.width=15, fig.height = 10} d1p <- d1 |> - dplyr::mutate( + mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") ) |> - dplyr::select( + select( date, sbj, chord = hrd_chord, hrdetect = hrd_hrdetect, @@ -287,15 +302,15 @@ sig_order2020 <- paste0( ) d2p <- d2 |> - # dplyr::filter(Rank %in% c(1:5)) |> - dplyr::mutate( + # filter(Rank %in% c(1:5)) |> + mutate( sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") ) |> - dplyr::select( + select( date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq ) |> - dplyr::mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) + mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) p2 <- d2p |> ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) + From f013e05f9667c2905ccfc61f9eff502e0e6a04c3 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Sep 2023 15:27:39 +1000 Subject: [PATCH 03/10] cleanup --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 56 ++++++++------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 581cab1..a718c54 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -175,10 +175,10 @@ dt_view <- function(x, scroll_y = 1000, ...) 
{ qcsum <- o2 |> filter(type == "UmQcSumFile") |> - unnest_wider(objp) + tidyr::unnest_wider(objp) hrd_chord <- o2 |> filter(type == "UmChordTsvFile") |> - unnest_wider(objp) |> + tidyr::unnest_wider(objp) |> select(portal_run_id, # chord_p_hrd = p_hrd, chord_hr_status = hr_status, @@ -193,12 +193,12 @@ hrd_chord <- o2 |> # select(portal_run_id, hrdetect_prob = Probability) sigs_2015 <- o2 |> filter(type == "UmSigsSnvFile2015") |> - unnest_wider(objp) |> + tidyr::unnest_wider(objp) |> select(-c(type)) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) sigs_2020 <- o2 |> filter(type == "UmSigsSnvFile2020") |> - unnest_wider(objp) |> + tidyr::unnest_wider(objp) |> select(-c(type)) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) ``` @@ -216,21 +216,12 @@ cols_select1 <- c( "deleted_genes_hmf", "tmb_hmf", "tml_hmf", "wgd_hmf", "hypermutated", "bpi_enabled", "portal_run_id", "portal_url" ) -cols_select2 <- c( - "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", - "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", - "Sig_group", "Rank", "Signature", "Contribution", "RelFreq", - "portal_run_id", "portal_url" -) -d1 <- qcsum |> - left_join(hrd_chord, by = "portal_run_id") |> - select(all_of(cols_select1), everything(), -c("type")) -dt_view(d1, caption = "umccrise Results Summary") - -d2 <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> - select(all_of(cols_select2), everything()) +# signatures +dsig <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> + select(portal_run_id, Sig_group, Rank, Signature, Contribution, RelFreq) -d2_filt <- d2 |> +# keep top two ranked sigs from 2015 +dsig_filt <- dsig |> filter( Sig_group == "s2015" ) |> @@ -238,29 +229,27 @@ d2_filt <- d2 |> mutate(tot_sig_vars = sum(Contribution)) |> arrange(Rank) |> slice_head(n = 2) |> - mutate(rn = row_number()) |> + # some sigs have same Rank so use explicit sig_rank + mutate(sig_rank = row_number()) |> ungroup() |> mutate( sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") - ) - -dt_view(d2, caption = "Signature contributions (2015 and 2020)") - -d2 |> - group_by(date_analysed_aest, SubjectID, LibraryID_tumor, Sig_group) |> - mutate( - Signature_all = paste(glue("{Signature} ({RelFreq})"), collapse = ", ") ) |> - ungroup() |> - select(date_analysed_aest:Sig_group, Signature_all) |> - distinct() |> - arrange(desc(date_analysed_aest), SubjectID) + select(portal_run_id, sig_rank, sig_summary) |> + tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> + mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> + select(portal_run_id, sig_top2) + +dall <- qcsum |> + left_join(hrd_chord, by = "portal_run_id") |> + select(all_of(cols_select1), everything(), -c("type")) |> + left_join(dsig_filt, by = "portal_run_id") ``` ### HRD Results ```{r hrd_plot, fig.width=15, fig.height = 10} -d1p <- d1 |> +p1 <- dall |> mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") @@ -270,8 +259,7 @@ d1p <- d1 |> sbj, chord = hrd_chord, hrdetect = hrd_hrdetect, ) |> - tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") -p1 <- d1p |> + tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") |> ggplot2::ggplot(aes(x = date, y = probability, label = sbj)) + ggplot2::geom_point(aes(colour = method)) 
+ ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) + From f0081fd7a991551a21cd37c24e1fb974c3237cd2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Sep 2023 17:01:36 +1000 Subject: [PATCH 04/10] join sigs with excel sheet --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index a718c54..4cec7c0 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -246,6 +246,17 @@ dall <- qcsum |> left_join(dsig_filt, by = "portal_run_id") ``` +```{r join_excel_layla} +excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |> + readxl::read_xlsx(sheet = "All") +excel_all |> + select("...1", portal_run_id) |> + left_join(dall |> select(portal_run_id, sig_top2)) |> + rename(N = "...1") |> + readr::write_csv("sigs_top2_2023-09-08.csv") +``` + + ### HRD Results ```{r hrd_plot, fig.width=15, fig.height = 10} From 13a0dd909e8192f3b9379d4d10fe42784e66c6bb Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 01:41:38 +1000 Subject: [PATCH 05/10] add aws s3 funcs --- .gitignore | 1 + R/s3.R | 67 +++++++++++++++++++++++++++++++++++++ inst/scripts/umccrise_run.R | 14 ++++++++ 3 files changed, 82 insertions(+) create mode 100644 R/s3.R diff --git a/.gitignore b/.gitignore index 88c5a3d..e2fec9e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] .Rproj.user +.Rhistory /nogit /docs diff --git a/R/s3.R b/R/s3.R new file mode 100644 index 0000000..e40ce38 --- /dev/null +++ b/R/s3.R @@ -0,0 +1,67 @@ +#' s3_files_list_filter_relevant("s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables/", presign = TRUE) +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE) { + assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) + pattern <- pattern %||% ".*" # keep all recognisable files by default + b <- sub("s3://(.*?)/.*", "\\1", s3dir) + p <- sub("s3://(.*?)/(.*)", "\\2", s3dir) + cmd <- glue( + "aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ", + "--max-items {max_items} --page-size {page_size}" + ) + l <- system(cmd, intern = TRUE) + j <- jsonlite::fromJSON(l) + assertthat::assert_that("Contents" %in% names(j)) + d <- j[["Contents"]] |> + tibble::as_tibble() |> + dplyr::mutate( + path = glue("s3://{b}/{.data$Key}"), + date1 = .data$LastModified, + size = fs::as_fs_bytes(.data$Size) + ) |> + dplyr::rowwise() |> + dplyr::mutate( + bname = basename(.data$path), + type = purrr::map_chr(.data$bname, match_regex) + ) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(path, date1, size, type) + + if (presign) { + d <- d |> + dplyr::rowwise() |> + dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path)) |> + dplyr::ungroup() + } + d +} + +s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { + p <- system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE) + p +} + +# search for files on S3 +s3_search <- function(search, rows) { + au_tz <- "Australia/Melbourne" + utc_tz <- "UTC" + base_url <- "https://api.portal.prod.umccr.org/iam/s3" + url1 <- utils::URLencode(glue::glue("{base_url}?rowsPerPage={rows}&search={search}")) + awscurl_cmd <- glue::glue( + "awscurl 
'{url1}' ", + "--header 'Accept: application/json'" + ) + message(glue::glue("Running {awscurl_cmd}")) + j <- system(awscurl_cmd, intern = TRUE) + date_fmt <- "%Y-%m-%dT%H:%M:%S" + d <- j |> + jsonlite::fromJSON() |> + purrr::pluck("results") |> + tibble::as_tibble() + d |> + dplyr::mutate( + date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), + date_aest = lubridate::with_tz(.data$date1, tz = au_tz) + ) |> + dplyr::select(path = key, bucket, size, date_aest, id, unique_hash) +} diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R index 8c40a6d..f76ba72 100644 --- a/inst/scripts/umccrise_run.R +++ b/inst/scripts/umccrise_run.R @@ -3,7 +3,9 @@ require(here) require(glue) require(dplyr) require(readr) +require(paws) +#---- GDS ----# # read last 1000 umccrise runs from portal # 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded date1 <- "2023-09-04" @@ -44,3 +46,15 @@ d # final portal meta for umccrise runs saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) + +#---- S3 ----# +pat <- "qc_summary.tsv.gz" +rows <- 1000 +d <- s3_search(search = pat, rows = rows) + +d |> + mutate( + dir1 = dirname(path), + dir1 = dirname(dir1) + ) |> + select(dir1, path) From ee6bd469507166199c969883786ec193e62d06a8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 10:15:02 +1000 Subject: [PATCH 06/10] doc s3_files_list_filter_relevant --- NAMESPACE | 1 + R/s3.R | 22 ++++++++++++++--- man/s3_files_list_filter_relevant.Rd | 37 ++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 man/s3_files_list_filter_relevant.Rd diff --git a/NAMESPACE b/NAMESPACE index 2291398..1355ba0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,6 +62,7 @@ export(multiqc_tidy_json) export(portal_meta_read) export(rdf2tab) export(read) +export(s3_files_list_filter_relevant) export(session_info_kable) export(time_metrics_process) export(tso_rmd) diff --git a/R/s3.R b/R/s3.R index e40ce38..9b92244 100644 --- a/R/s3.R +++ b/R/s3.R @@ -1,4 +1,20 @@ -#' s3_files_list_filter_relevant("s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables/", presign = TRUE) +#' List Relevant Files In S3 Directory +#' +#' Lists relevant files in a S3 directory. +#' +#' @param s3dir GDS directory. +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param page_size The size of each page to get in the AWS service call (def: 1000). +#' @param max_items The total number of items to return in the command’s output (def: 1000). +#' @param presign Include presigned URLs (def: FALSE). +#' +#' @return A tibble with path, date, file size, file type, and presigned URL if requested. 
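+#' Note that presigning shells out to \code{aws s3 presign} once per object, so
+#' listing a large directory with \code{presign = TRUE} can take roughly a
+#' couple of seconds per file.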
+#' @examples +#' \dontrun{ +#' s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables" +#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +#' } +#' @export s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE) { assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) pattern <- pattern %||% ".*" # keep all recognisable files by default @@ -25,7 +41,7 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(path, date1, size, type) + dplyr::select("path", "date1", "size", "type") if (presign) { d <- d |> @@ -63,5 +79,5 @@ s3_search <- function(search, rows) { date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), date_aest = lubridate::with_tz(.data$date1, tz = au_tz) ) |> - dplyr::select(path = key, bucket, size, date_aest, id, unique_hash) + dplyr::select(path = "key", "bucket", "size", "date_aest", "id", "unique_hash") } diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd new file mode 100644 index 0000000..f8af42e --- /dev/null +++ b/man/s3_files_list_filter_relevant.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_files_list_filter_relevant} +\alias{s3_files_list_filter_relevant} +\title{List Relevant Files In S3 Directory} +\usage{ +s3_files_list_filter_relevant( + s3dir, + pattern = NULL, + page_size = 1000, + max_items = 1000, + presign = FALSE +) +} +\arguments{ +\item{s3dir}{GDS directory.} + +\item{pattern}{Pattern to further filter the returned file type tibble.} + +\item{page_size}{The size of each page to get in the AWS service call (def: 1000).} + +\item{max_items}{The total number of items to return in the command’s output (def: 1000).} + +\item{presign}{Include presigned URLs (def: FALSE).} +} +\value{ +A tibble with path, date, file size, file type, and presigned URL if requested. +} +\description{ +Lists relevant files in a S3 directory. +} +\examples{ +\dontrun{ +s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables" +s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +} +} From 0e1a53fb23fef63ee0507000c4314c6f1c814ce9 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 21:26:57 +1000 Subject: [PATCH 07/10] doc s3_search func --- NAMESPACE | 1 + R/s3.R | 35 +++++++++++++++++++++------- man/s3_files_list_filter_relevant.Rd | 4 ++-- man/s3_search.Rd | 26 +++++++++++++++++++++ 4 files changed, 55 insertions(+), 11 deletions(-) create mode 100644 man/s3_search.Rd diff --git a/NAMESPACE b/NAMESPACE index 1355ba0..688287c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -63,6 +63,7 @@ export(portal_meta_read) export(rdf2tab) export(read) export(s3_files_list_filter_relevant) +export(s3_search) export(session_info_kable) export(time_metrics_process) export(tso_rmd) diff --git a/R/s3.R b/R/s3.R index 9b92244..eef476c 100644 --- a/R/s3.R +++ b/R/s3.R @@ -1,6 +1,6 @@ -#' List Relevant Files In S3 Directory +#' List Relevant Files In AWS S3 Directory #' -#' Lists relevant files in a S3 directory. +#' Lists relevant files in an AWS S3 directory. #' #' @param s3dir GDS directory. 
#' @param pattern Pattern to further filter the returned file type tibble. @@ -57,17 +57,32 @@ s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { p } -# search for files on S3 -s3_search <- function(search, rows) { +#' Search AWS S3 Objects +#' +#' Searches for the given pattern in the UMCCR `umccr-primary-data-prod` AWS S3 +#' bucket. +#' +#' @param pat Pattern to search for (e.g. 'multiqc_data.json'). +#' @param rows Max number of rows to return. +#' +#' @return Tibble with S3 path, object size, date modified, id, unique hash. +#' +#' @examples +#' \dontrun{ +#' pat <- "qc_summary.tsv.gz" +#' s3_search(pat, 10) +#' } +#' @export +s3_search <- function(pat, rows) { au_tz <- "Australia/Melbourne" utc_tz <- "UTC" base_url <- "https://api.portal.prod.umccr.org/iam/s3" - url1 <- utils::URLencode(glue::glue("{base_url}?rowsPerPage={rows}&search={search}")) - awscurl_cmd <- glue::glue( + url1 <- utils::URLencode(glue("{base_url}?rowsPerPage={rows}&search={pat}")) + awscurl_cmd <- glue( "awscurl '{url1}' ", "--header 'Accept: application/json'" ) - message(glue::glue("Running {awscurl_cmd}")) + message(glue("Running {awscurl_cmd}")) j <- system(awscurl_cmd, intern = TRUE) date_fmt <- "%Y-%m-%dT%H:%M:%S" d <- j |> @@ -77,7 +92,9 @@ s3_search <- function(search, rows) { d |> dplyr::mutate( date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), - date_aest = lubridate::with_tz(.data$date1, tz = au_tz) + date_aest = lubridate::with_tz(.data$date1, tz = au_tz), + path = glue("s3://{bucket}/{key}"), + size = fs::as_fs_bytes(.data$size) ) |> - dplyr::select(path = "key", "bucket", "size", "date_aest", "id", "unique_hash") + dplyr::select("path", "size", "date_aest", "id", "unique_hash") } diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index f8af42e..0578bce 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/s3.R \name{s3_files_list_filter_relevant} \alias{s3_files_list_filter_relevant} -\title{List Relevant Files In S3 Directory} +\title{List Relevant Files In AWS S3 Directory} \usage{ s3_files_list_filter_relevant( s3dir, @@ -27,7 +27,7 @@ s3_files_list_filter_relevant( A tibble with path, date, file size, file type, and presigned URL if requested. } \description{ -Lists relevant files in a S3 directory. +Lists relevant files in an AWS S3 directory. } \examples{ \dontrun{ diff --git a/man/s3_search.Rd b/man/s3_search.Rd new file mode 100644 index 0000000..c0d9f64 --- /dev/null +++ b/man/s3_search.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_search} +\alias{s3_search} +\title{Search AWS S3 Objects} +\usage{ +s3_search(pat, rows) +} +\arguments{ +\item{pat}{Pattern to search for (e.g. 'multiqc_data.json').} + +\item{rows}{Max number of rows to return.} +} +\value{ +Tibble with S3 path, object size, date modified, id, unique hash. +} +\description{ +Searches for the given pattern in the UMCCR \code{umccr-primary-data-prod} AWS S3 +bucket. 
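Under the hood it shells out to \code{awscurl} against the UMCCR data portal
API, so working AWS credentials are assumed.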
+} +\examples{ +\dontrun{ +pat <- "qc_summary.tsv.gz" +s3_search(pat, 10) +} +} From 77d59f3dc696ff1afae3086df14df2069de278a0 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 23:56:37 +1000 Subject: [PATCH 08/10] generate s3 presigned urls --- R/s3.R | 9 ++-- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 51 +++++++++++++++++++-- inst/scripts/umccrise_run.R | 26 ++++++++--- man/s3_files_list_filter_relevant.Rd | 5 +- 4 files changed, 77 insertions(+), 14 deletions(-) diff --git a/R/s3.R b/R/s3.R index eef476c..c798a70 100644 --- a/R/s3.R +++ b/R/s3.R @@ -7,6 +7,7 @@ #' @param page_size The size of each page to get in the AWS service call (def: 1000). #' @param max_items The total number of items to return in the command’s output (def: 1000). #' @param presign Include presigned URLs (def: FALSE). +#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). #' #' @return A tibble with path, date, file size, file type, and presigned URL if requested. #' @examples @@ -15,7 +16,7 @@ #' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE) { +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) { assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) pattern <- pattern %||% ".*" # keep all recognisable files by default b <- sub("s3://(.*?)/.*", "\\1", s3dir) @@ -31,7 +32,7 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 tibble::as_tibble() |> dplyr::mutate( path = glue("s3://{b}/{.data$Key}"), - date1 = .data$LastModified, + date_utc = .data$LastModified, size = fs::as_fs_bytes(.data$Size) ) |> dplyr::rowwise() |> @@ -41,12 +42,12 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("path", "date1", "size", "type") + dplyr::select("path", "date_utc", "size", "type") if (presign) { d <- d |> dplyr::rowwise() |> - dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path)) |> + dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |> dplyr::ungroup() } d diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 4cec7c0..788f196 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -64,7 +64,48 @@ knitr::opts_chunk$set( ``` ```{r data_setup, eval=FALSE} -# options(width = 150) +#---- S3 ----# +s3 <- here::here(glue::glue("nogit/umccrise/rds/portal_meta/2023-09-12_pmeta_s3.rds")) |> + readr::read_rds() +s3_get_presigned1 <- function(x, row_slice) { + start_time <- Sys.time() + s3_map <- x |> + slice(row_slice) |> + rowwise() |> + mutate( + s3_contents = list(s3_files_list_filter_relevant( + s3dir = .data$dir1, presign = TRUE + )) + ) |> + ungroup() |> + tidyr::unnest("s3_contents") |> + select( + "SubjectID", "LibraryID_tumor", "SampleID_tumor", + "date_utc", "type", "size", "path", "presigned_url" + ) + end_time <- Sys.time() + total_time <- end_time - start_time + print(total_time) + s3_map +} +# 2 seconds per row +s3_map1 <- s3_get_presigned1(s3, 1:100) +s3_map2 <- s3_get_presigned1(s3, 101:200) +s3_map3 <- s3_get_presigned1(s3, 201:300) +s3_map4 <- s3_get_presigned1(s3, 
301:400) +s3_map5 <- s3_get_presigned1(s3, 401:449) + +saveRDS(s3_map1, here("nogit/umccrise/rds/s3/map1_2023-09-12.rds")) +saveRDS(s3_map2, here("nogit/umccrise/rds/s3/map2_2023-09-12.rds")) +saveRDS(s3_map3, here("nogit/umccrise/rds/s3/map3_2023-09-12.rds")) +saveRDS(s3_map4, here("nogit/umccrise/rds/s3/map4_2023-09-12.rds")) +saveRDS(s3_map5, here("nogit/umccrise/rds/s3/map5_2023-09-12.rds")) +s3_map <- fs::dir_ls(here("nogit/umccrise/rds/s3"), regexp = "map.*rds") |> + purrr::map(readr::read_rds) |> + bind_rows() +saveRDS(s3_map, here("nogit/umccrise/rds/s3_map_2023-09-12.rds")) + +#---- GDS ----# token <- dracarys::ica_token_validate(Sys.getenv("ICA_ACCESS_TOKEN_PRO")) pmeta <- here("nogit/umccrise/rds/portal_meta/2023-09-04_pmeta_final.rds") |> readr::read_rds() @@ -84,10 +125,12 @@ gds_map <- pmeta |> filter(type != "MultiqcFile") saveRDS(gds_map, here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) +``` -parse_files <- function(gds_map, row_slice, rds_out) { +```{r data_parse, eval=FALSE} +parse_files <- function(x, row_slice, rds_out) { start_time <- Sys.time() - dat1 <- gds_map |> + dat1 <- x |> slice(row_slice) |> rowwise() |> mutate( @@ -102,6 +145,8 @@ parse_files <- function(gds_map, row_slice, rds_out) { readr::write_rds(x = dat1, file = rds_out) } +rds_pa + gds_map <- readr::read_rds(here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) rds_path_out <- here("nogit/umccrise/rds/results") x0 <- parse_files(gds_map, 1:10, file.path(rds_path_out, "x0.rds")) diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R index f76ba72..6602d87 100644 --- a/inst/scripts/umccrise_run.R +++ b/inst/scripts/umccrise_run.R @@ -3,7 +3,6 @@ require(here) require(glue) require(dplyr) require(readr) -require(paws) #---- GDS ----# # read last 1000 umccrise runs from portal @@ -45,16 +44,31 @@ d <- pmeta |> d # final portal meta for umccrise runs +# columns: +# "id", "wfr_name", "wfr_id", "version", "end_status", "start", "end", "portal_run_id", +# "SubjectID", "LibraryID_tumor", "LibraryID_normal", "SampleID_tumor", "SampleID_normal", +# "gds_outdir_umccrise", "gds_indir_dragen_somatic", "gds_indir_dragen_germline", "gds_infile_genomes_tar" saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) #---- S3 ----# pat <- "qc_summary.tsv.gz" rows <- 1000 -d <- s3_search(search = pat, rows = rows) +d_s3_raw <- dracarys::s3_search(pat = pat, rows = rows) -d |> +d_s3 <- d_s3_raw |> + arrange(desc(date_aest)) |> mutate( - dir1 = dirname(path), - dir1 = dirname(dir1) + bname = basename(path), + dir1 = dirname(path), # path/to/dirA/cancer_report_tables + dir2 = basename(dirname(dir1)), # dirA + sbj_samp_lib = sub(".*__(.*)", "\\1", dir2), + SubjectID = sub("(SBJ[0-9]{5})_.*", "\\1", sbj_samp_lib), + SampleID_tumor = sub("SBJ.*?_(.*?)_.*", "\\1", sbj_samp_lib), + LibraryID_tumor = sub("SBJ.*?_.*?_(.*)", "\\1", sbj_samp_lib), + rerun = grepl("rerun", .data$LibraryID_tumor) ) |> - select(dir1, path) + select(dir1, SubjectID, LibraryID_tumor, SampleID_tumor, date = date_aest, rerun) + +date2 <- "2023-09-12" +saveRDS(d_s3, file = here(glue("nogit/umccrise/rds/portal_meta/{date2}_pmeta_s3.rds"))) +# now we have S3 paths and metadata, so all we need is to generate presigned URLs and read the data diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 0578bce..1194eea 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -9,7 +9,8 @@ s3_files_list_filter_relevant( pattern = NULL, page_size = 
1000, max_items = 1000, - presign = FALSE + presign = FALSE, + expiry_sec = 43200 ) } \arguments{ @@ -22,6 +23,8 @@ s3_files_list_filter_relevant( \item{max_items}{The total number of items to return in the command’s output (def: 1000).} \item{presign}{Include presigned URLs (def: FALSE).} + +\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} } \value{ A tibble with path, date, file size, file type, and presigned URL if requested. From e33d275c18636ab855d4161fe0d7f56d15d78b77 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 13 Sep 2023 01:01:32 +1000 Subject: [PATCH 09/10] ingest s3 files --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 55 ++++++++++++++------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 788f196..4905c4a 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -145,10 +145,18 @@ parse_files <- function(x, row_slice, rds_out) { readr::write_rds(x = dat1, file = rds_out) } -rds_pa +rds_path_out <- here::here("nogit/umccrise/rds/results") +#---- S3 ----# +s3_map <- readr::read_rds(here("nogit/umccrise/rds/s3_map_2023-09-12.rds")) +s0 <- parse_files(s3_map, 1:10, file.path(rds_path_out, "s0.rds")) +s1 <- parse_files(s3_map, 1:500, file.path(rds_path_out, "s1.rds")) +s2 <- parse_files(s3_map, 501:1000, file.path(rds_path_out, "s2.rds")) +s3 <- parse_files(s3_map, 1001:1500, file.path(rds_path_out, "s3.rds")) +s4 <- parse_files(s3_map, 1501:2000, file.path(rds_path_out, "s4.rds")) +s5 <- parse_files(s3_map, 2001:2245, file.path(rds_path_out, "s5.rds")) +#---- GDS ----# gds_map <- readr::read_rds(here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) -rds_path_out <- here("nogit/umccrise/rds/results") x0 <- parse_files(gds_map, 1:10, file.path(rds_path_out, "x0.rds")) x1 <- parse_files(gds_map, 1:500, file.path(rds_path_out, "x1.rds")) x2 <- parse_files(gds_map, 501:1000, file.path(rds_path_out, "x2.rds")) @@ -160,11 +168,25 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds")) ```{r data_load} lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> readr::read_rds() -dat1 <- fs::dir_ls(here("nogit/umccrise/rds/results")) |> +dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |> + purrr::map(readr::read_rds) |> + bind_rows() +dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |> purrr::map(readr::read_rds) |> bind_rows() -o <- dat1 |> +dat_s3_res <- dat_s3 |> + mutate( + type = case_when( + grepl("snv_2015.tsv.gz", path) ~ "UmSigsSnvFile2015", + grepl("snv_2020.tsv.gz", path) ~ "UmSigsSnvFile2020", + .default = .data$type + ), + date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"), + date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne") + ) |> + select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp) +dat_gds_res <- dat_gds |> mutate( type = case_when( grepl("snv_2015.tsv.gz", bname) ~ "UmSigsSnvFile2015", @@ -173,24 +195,23 @@ o <- dat1 |> ), date_analysed_aest = as.character(.data$end), ) |> - select( - date_analysed_aest, - SubjectID, - LibraryID_tumor, - LibraryID_normal, - type, - objp, - portal_run_id - ) + select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id) -lims <- lims_raw |> - filter(LibraryID %in% 
c(o$LibraryID_tumor)) |> +lims_s3 <- lims_raw |> + filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |> + select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> + distinct() +lims_gds <- lims_raw |> + filter(LibraryID %in% c(dat_gds_res$LibraryID_tumor)) |> select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> distinct() +o1 <- dat_s3_res |> + left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) +#################### UP TO HERE^ ########################################### -o2 <- o |> - left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> +o2 <- dat_gds_res |> + left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> mutate( url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), sbj_url = glue("{.data$SubjectID}"), From 045c9cb51c0a9b5778c838707b6f7ec2161a1111 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 13 Sep 2023 17:12:44 +1000 Subject: [PATCH 10/10] update umccrise multi reporter with s3 results --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 92 ++++++++++++++------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 4905c4a..21b7bb2 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -60,6 +60,7 @@ knitr::opts_chunk$set( require(ggplot2, include.only = c("ggplot", "aes")) require(lubridate, include.only = c("as_datetime")) require(plotly, include.only = c("ggplotly")) + require(openssl, include.only = c("sha256")) } ``` @@ -168,9 +169,21 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds")) ```{r data_load} lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> readr::read_rds() -dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |> +dat_s3_raw <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |> purrr::map(readr::read_rds) |> bind_rows() +# create sha256 for umccrise directory to distinguish between runs +# keep first 8 digits and append to umccrise date folder. 
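+# e.g. an um_dir with date folder 2021-07-26 hashing to "a1b2c3d4..." (hash
+# value illustrative) yields portal_run_id "fake.20210726a1b2c3d4", keeping
+# S3 runs distinct and shaped like the dr.<portal_run_id> ids used for GDS.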
+dat_s3 <- dat_s3_raw |> + mutate( + um_dir = sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path), + date_dir = basename(dirname(dirname(um_dir))), + date_dir = gsub("-", "", date_dir), + hash256 = openssl::sha256(um_dir), + hash256 = substr(hash256, 1, 8), + portal_run_id = glue("fake.{date_dir}{hash256}") + ) |> + select(-c(um_dir, date_dir, hash256, SampleID_tumor)) dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |> purrr::map(readr::read_rds) |> bind_rows() @@ -183,9 +196,10 @@ dat_s3_res <- dat_s3 |> .default = .data$type ), date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"), - date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne") + date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne"), + date_analysed_aest = as.character(.data$date_analysed_aest) ) |> - select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp) + select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id) dat_gds_res <- dat_gds |> mutate( type = case_when( @@ -195,7 +209,7 @@ dat_gds_res <- dat_gds |> ), date_analysed_aest = as.character(.data$end), ) |> - select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id) + select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id) lims_s3 <- lims_raw |> filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |> @@ -207,9 +221,13 @@ lims_gds <- lims_raw |> distinct() o1 <- dat_s3_res |> - left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) -#################### UP TO HERE^ ########################################### - + left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> + mutate( + url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), + sbj_url = glue("{.data$SubjectID}"), + url = glue("{.data$url}") + ) |> + rename(portal_url = url) o2 <- dat_gds_res |> left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> mutate( @@ -220,7 +238,11 @@ o2 <- dat_gds_res |> ) |> rename(portal_url = url) +d <- list(s3 = o1, gds = o2) |> + bind_rows(.id = "s3_or_gds") + dt_view <- function(x, scroll_y = 1000, ...) { + options(DT.TOJSON_ARGS = list(na = "string")) x |> mutate(across(where(is.character), as.factor)) |> DT::datatable( @@ -239,10 +261,10 @@ dt_view <- function(x, scroll_y = 1000, ...) 
{ ) } -qcsum <- o2 |> +qcsum <- d |> filter(type == "UmQcSumFile") |> tidyr::unnest_wider(objp) -hrd_chord <- o2 |> +hrd_chord <- d |> filter(type == "UmChordTsvFile") |> tidyr::unnest_wider(objp) |> select(portal_run_id, @@ -257,12 +279,12 @@ hrd_chord <- o2 |> # filter(type == "UmHrdetectTsvFile") |> # unnest_wider(objp) |> # select(portal_run_id, hrdetect_prob = Probability) -sigs_2015 <- o2 |> +sigs_2015 <- d |> filter(type == "UmSigsSnvFile2015") |> tidyr::unnest_wider(objp) |> select(-c(type)) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) -sigs_2020 <- o2 |> +sigs_2020 <- d |> filter(type == "UmSigsSnvFile2020") |> tidyr::unnest_wider(objp) |> select(-c(type)) |> @@ -274,7 +296,7 @@ sigs_2020 <- o2 |> ```{r final_tab} cols_select1 <- c( "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", - "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", + "ProjectOwner", "ProjectName", "Type", "Workflow", "hrd_chord", "hrd_hrdetect", "chord_hr_status", "chord_hrd_type", "chord_p_BRCA1", "chord_p_BRCA2", "qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf", @@ -309,10 +331,13 @@ dsig_filt <- dsig |> dall <- qcsum |> left_join(hrd_chord, by = "portal_run_id") |> select(all_of(cols_select1), everything(), -c("type")) |> - left_join(dsig_filt, by = "portal_run_id") + left_join(dsig_filt, by = "portal_run_id") |> + relocate(sig_top2, .before = "hrd_chord") |> + relocate(s3_or_gds, .after = "SubjectID") +dt_view(dall) ``` -```{r join_excel_layla} +```{r join_excel_layla, eval=FALSE} excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |> readxl::read_xlsx(sheet = "All") excel_all |> @@ -325,7 +350,7 @@ excel_all |> ### HRD Results -```{r hrd_plot, fig.width=15, fig.height = 10} +```{r hrd_plot, fig.width=15, fig.height = 15} p1 <- dall |> mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), @@ -348,9 +373,7 @@ plotly::ggplotly(p1) ### Signature Results -**TODO** - -```{r fig.width = 15, fig.height=50, eval=FALSE} +```{r fig.width = 15, fig.height=65, eval=TRUE} sig_order2015 <- paste0("Sig", 1:30) sig_order2020 <- paste0( "SBS", @@ -366,8 +389,12 @@ sig_order2020 <- paste0( ) ) -d2p <- d2 |> - # filter(Rank %in% c(1:5)) |> +p2_prep <- dsig |> + filter( + Sig_group == "s2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed_aest, SubjectID, LibraryID_tumor), by = "portal_run_id") |> mutate( sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") @@ -376,13 +403,14 @@ d2p <- d2 |> date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq ) |> mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) -p2 <- - d2p |> - ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) + +p2 <- p2_prep |> + filter(!grepl("ALLOCATE", sbj)) |> # get rid of ALLOCATE subject + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + ggplot2::geom_bar(position = "fill", stat = "identity") + - ggplot2::facet_wrap(~Sig_group, ncol = 1) + ggplot2::theme_bw(base_size = 7) +# ggplot2::facet_wrap(~Sig_group, ncol = 1) -plotly::ggplotly(p2) +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) ``` @@ -391,25 +419,31 @@ plotly::ggplotly(p2) ### ProjectOwner ```{r ProjectOwner} -count(d1, ProjectOwner) |> dt_view(scroll_y = 400) +count(dall, ProjectOwner) |> dt_view(scroll_y = 400) ``` ### ProjectName ```{r ProjectName} -count(d1, 
ProjectName) |> dt_view(scroll_y = 400) +count(dall, ProjectName) |> dt_view(scroll_y = 400) ``` ### Type ```{r Type} -count(d1, Type) |> dt_view(scroll_y = 400) +count(dall, Type) |> dt_view(scroll_y = 400) ``` ### Workflow ```{r Workflow} -count(d1, Workflow) |> dt_view(scroll_y = 400) +count(dall, Workflow) |> dt_view(scroll_y = 400) +``` + +### S3orGDS + +```{r s3orgds} +count(dall, s3_or_gds) |> dt_view(scroll_y = 400) ```
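
For reference, patches 05-10 add up to the following end-to-end S3 ingestion flow; the sketch below strings the pieces together the same way `parse_files()` in multi.Rmd does. The search pattern, row limit, and one-hour expiry are illustrative, and working AWS credentials plus `awscurl` on the PATH are assumed.

```r
require(dplyr)
require(dracarys)

# 1. find recent umccrise QC summaries via the portal API (shells out to awscurl)
hits <- dracarys::s3_search(pat = "qc_summary.tsv.gz", rows = 10)

# 2. list and presign all recognisable files in one run's cancer_report_tables dir
s3dir <- dirname(hits$path[1]) # e.g. s3://.../umccrised/<sbj>/cancer_report_tables
urls <- dracarys::s3_files_list_filter_relevant(
  s3dir = s3dir, presign = TRUE, expiry_sec = 3600
)

# 3. parse each file with its matching dracarys parser, as parse_files() does
parsed <- urls |>
  rowwise() |>
  mutate(
    gen = list(dracarys::dr_func_eval(.data$type)),
    obj = list(.data$gen$new(.data$presigned_url)),
    objp = list(.data$obj$read())
  ) |>
  ungroup()
```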