From 045c9cb51c0a9b5778c838707b6f7ec2161a1111 Mon Sep 17 00:00:00 2001
From: pdiakumis
Date: Wed, 13 Sep 2023 17:12:44 +1000
Subject: [PATCH] update umccrise multi reporter with s3 results

---
 inst/rmd/umccr_workflows/umccrise/multi.Rmd | 92 ++++++++++++++-------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd
index 4905c4a..21b7bb2 100644
--- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd
+++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd
@@ -60,6 +60,7 @@ knitr::opts_chunk$set(
   require(ggplot2, include.only = c("ggplot", "aes"))
   require(lubridate, include.only = c("as_datetime"))
   require(plotly, include.only = c("ggplotly"))
+  require(openssl, include.only = c("sha256"))
 }
 ```
 
@@ -168,9 +169,21 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds"))
 
 ```{r data_load}
 lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |>
   readr::read_rds()
-dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
+dat_s3_raw <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
   purrr::map(readr::read_rds) |>
   bind_rows()
+# create a sha256 hash of the umccrise directory to distinguish between runs;
+# keep the first 8 characters and append them to the umccrise date folder.
+dat_s3 <- dat_s3_raw |>
+  mutate(
+    um_dir = sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path),
+    date_dir = basename(dirname(dirname(um_dir))),
+    date_dir = gsub("-", "", date_dir),
+    hash256 = openssl::sha256(um_dir),
+    hash256 = substr(hash256, 1, 8),
+    portal_run_id = glue("fake.{date_dir}{hash256}")
+  ) |>
+  select(-c(um_dir, date_dir, hash256, SampleID_tumor))
 dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |>
   purrr::map(readr::read_rds) |>
   bind_rows()
@@ -183,9 +196,10 @@ dat_s3_res <- dat_s3 |>
       .default = .data$type
     ),
     date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"),
-    date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne")
+    date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne"),
+    date_analysed_aest = as.character(.data$date_analysed_aest)
   ) |>
-  select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp)
+  select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
 dat_gds_res <- dat_gds |>
   mutate(
     type = case_when(
@@ -195,7 +209,7 @@ dat_gds_res <- dat_gds |>
     ),
     date_analysed_aest = as.character(.data$end),
   ) |>
-  select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id)
+  select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
 
 lims_s3 <- lims_raw |>
   filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |>
@@ -207,9 +221,13 @@ lims_gds <- lims_raw |>
   distinct()
 
 o1 <- dat_s3_res |>
-  left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID"))
-#################### UP TO HERE^ ###########################################
-
+  left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
+  mutate(
+    url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"),
+    sbj_url = glue("<a href={.data$url}>{.data$SubjectID}</a>"),
+    url = glue("<a href={.data$url}>{.data$url}</a>")
+  ) |>
+  rename(portal_url = url)
 o2 <- dat_gds_res |>
   left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
   mutate(
@@ -220,7 +238,11 @@ o2 <- dat_gds_res |>
   ) |>
   rename(portal_url = url)
 
+d <- list(s3 = o1, gds = o2) |>
+  bind_rows(.id = "s3_or_gds")
+
 dt_view <- function(x, scroll_y = 1000, ...) {
+  options(DT.TOJSON_ARGS = list(na = "string"))
   x |>
     mutate(across(where(is.character), as.factor)) |>
     DT::datatable(
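Reviewer note, outside the patch: the `data_load` hunk above derives a synthetic `portal_run_id` for S3 results (which, unlike GDS results, carry no real portal run ID) by hashing the umccrise run directory. A minimal standalone sketch of that derivation follows; the S3 path is invented for illustration, and only the openssl and glue packages are assumed.

```r
# Made-up path following the umccrise S3 layout assumed by the regex above.
path <- "s3://umccr-primary-data-prod/analysis_data/SBJ00001/umccrise/2023-09-01/L9900001__L9900002/SBJ00001__MDX990001/cancer_report_tables/hrd/SBJ00001-chord.tsv.gz"
# Run directory: everything between the bucket and cancer_report_tables/.
um_dir <- sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path)
# The date folder is the third-last component of the run directory:
# "2023-09-01" -> "20230901".
date_dir <- gsub("-", "", basename(dirname(dirname(um_dir))))
# First 8 hex characters of the sha256 hash; stable for a given run directory.
hash8 <- substr(openssl::sha256(um_dir), 1, 8)
glue::glue("fake.{date_dir}{hash8}") # "fake.20230901" followed by the 8 hash characters
```

The hash keeps the fake ID unique even when two runs share a date folder, while the date prefix keeps the IDs roughly sortable like real portal run IDs.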
@@ -239,10 +261,10 @@ dt_view <- function(x, scroll_y = 1000, ...) {
   )
 }
 
-qcsum <- o2 |>
+qcsum <- d |>
   filter(type == "UmQcSumFile") |>
   tidyr::unnest_wider(objp)
 
-hrd_chord <- o2 |>
+hrd_chord <- d |>
   filter(type == "UmChordTsvFile") |>
   tidyr::unnest_wider(objp) |>
   select(portal_run_id,
@@ -257,12 +279,12 @@
 #   filter(type == "UmHrdetectTsvFile") |>
 #   unnest_wider(objp) |>
 #   select(portal_run_id, hrdetect_prob = Probability)
-sigs_2015 <- o2 |>
+sigs_2015 <- d |>
   filter(type == "UmSigsSnvFile2015") |>
   tidyr::unnest_wider(objp) |>
   select(-c(type)) |>
   tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
-sigs_2020 <- o2 |>
+sigs_2020 <- d |>
   filter(type == "UmSigsSnvFile2020") |>
   tidyr::unnest_wider(objp) |>
   select(-c(type)) |>
@@ -274,7 +296,7 @@
 ```{r final_tab}
 cols_select1 <- c(
   "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID",
-  "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal",
+  "ProjectOwner", "ProjectName", "Type", "Workflow",
   "hrd_chord", "hrd_hrdetect", "chord_hr_status", "chord_hrd_type",
   "chord_p_BRCA1", "chord_p_BRCA2",
   "qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf",
@@ -309,10 +331,13 @@ dsig_filt <- dsig |>
 dall <- qcsum |>
   left_join(hrd_chord, by = "portal_run_id") |>
   select(all_of(cols_select1), everything(), -c("type")) |>
-  left_join(dsig_filt, by = "portal_run_id")
+  left_join(dsig_filt, by = "portal_run_id") |>
+  relocate(sig_top2, .before = "hrd_chord") |>
+  relocate(s3_or_gds, .after = "SubjectID")
+dt_view(dall)
 ```
 
-```{r join_excel_layla}
+```{r join_excel_layla, eval=FALSE}
 excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |>
   readxl::read_xlsx(sheet = "All")
 excel_all |>
@@ -325,7 +350,7 @@ excel_all |>
 
 ### HRD Results
 
-```{r hrd_plot, fig.width=15, fig.height = 10}
+```{r hrd_plot, fig.width=15, fig.height = 15}
 p1 <- dall |>
   mutate(
     sbj = glue("{SubjectID}_{LibraryID_tumor}"),
@@ -348,9 +373,7 @@ plotly::ggplotly(p1)
 
 ### Signature Results
 
-**TODO**
-
-```{r fig.width = 15, fig.height=50, eval=FALSE}
+```{r fig.width = 15, fig.height=65, eval=TRUE}
 sig_order2015 <- paste0("Sig", 1:30)
 sig_order2020 <- paste0(
   "SBS",
@@ -366,8 +389,12 @@ sig_order2020 <- paste0(
   )
 )
 
-d2p <- d2 |>
-  # filter(Rank %in% c(1:5)) |>
+p2_prep <- dsig |>
+  filter(
+    Sig_group == "s2015",
+    Rank %in% c(1:3)
+  ) |>
+  left_join(dall |> select(portal_run_id, date_analysed_aest, SubjectID, LibraryID_tumor), by = "portal_run_id") |>
   mutate(
     sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")),
     date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S")
@@ -376,13 +403,14 @@
     date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq
   ) |>
   mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020)))
-p2 <-
-  d2p |>
-  ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) +
+p2 <- p2_prep |>
+  filter(!grepl("ALLOCATE", sbj)) |> # get rid of ALLOCATE subject
+  ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) +
   ggplot2::geom_bar(position = "fill", stat = "identity") +
-  ggplot2::facet_wrap(~Sig_group, ncol = 1)
+  ggplot2::theme_bw(base_size = 7)
+# ggplot2::facet_wrap(~Sig_group, ncol = 1)
 
-plotly::ggplotly(p2)
+plotly::ggplotly(p2, tooltip = c("x", "text", "fill"))
 ```
 
 
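Reviewer note, outside the patch: in the reworked signature plot above, `position = "fill"` rescales each subject's stacked bar to proportions, and the `text = sbj` aesthetic exists purely so `plotly::ggplotly()` can show the subject in the tooltip. A self-contained sketch with toy data (invented subjects and contributions), assuming only ggplot2 and plotly:

```r
library(ggplot2)
library(plotly)

# Toy stand-in for the per-subject signature contributions.
toy <- data.frame(
  sbj = rep(c("SBJ00001_L9900001", "SBJ00002_L9900003"), each = 2),
  Signature = c("Sig1", "Sig5", "Sig1", "Sig3"),
  Contribution = c(120, 30, 80, 160)
)
# ggplot2 warns that `text` is an unknown aesthetic; ggplotly() picks it up.
p <- ggplot(toy, aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) +
  geom_bar(position = "fill", stat = "identity") + # rescale each bar to sum to 1
  theme_bw(base_size = 7)
ggplotly(p, tooltip = c("x", "text", "fill")) # proportion, subject, signature
```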
= c("x", "text", "fill")) ``` @@ -391,25 +419,31 @@ plotly::ggplotly(p2) ### ProjectOwner ```{r ProjectOwner} -count(d1, ProjectOwner) |> dt_view(scroll_y = 400) +count(dall, ProjectOwner) |> dt_view(scroll_y = 400) ``` ### ProjectName ```{r ProjectName} -count(d1, ProjectName) |> dt_view(scroll_y = 400) +count(dall, ProjectName) |> dt_view(scroll_y = 400) ``` ### Type ```{r Type} -count(d1, Type) |> dt_view(scroll_y = 400) +count(dall, Type) |> dt_view(scroll_y = 400) ``` ### Workflow ```{r Workflow} -count(d1, Workflow) |> dt_view(scroll_y = 400) +count(dall, Workflow) |> dt_view(scroll_y = 400) +``` + +### S3orGDS + +```{r s3orgds} +count(dall, s3_or_gds) |> dt_view(scroll_y = 400) ```