update umccrise multi reporter with s3 results

umccr · Sep 13, 2023 · 045c9cb · 045c9cb
1 parent e33d275
commit 045c9cb
Showing 1 changed file with 63 additions and 29 deletions.
diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd
@@ -60,6 +60,7 @@ knitr::opts_chunk$set(
   require(ggplot2, include.only = c("ggplot", "aes"))
   require(lubridate, include.only = c("as_datetime"))
   require(plotly, include.only = c("ggplotly"))
+  require(openssl, include.only = c("sha256"))
 }
 ```
 
@@ -168,9 +169,21 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds"))
 ```{r data_load}
 lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |>
   readr::read_rds()
-dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
+dat_s3_raw <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
   purrr::map(readr::read_rds) |>
   bind_rows()
+# Create a sha256 hash of each umccrise directory path to distinguish between runs.
+# Keep the first 8 hex characters and append them to the umccrise date folder name.
+dat_s3 <- dat_s3_raw |>
+  mutate(
+    um_dir = sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path),
+    date_dir = basename(dirname(dirname(um_dir))),
+    date_dir = gsub("-", "", date_dir),
+    hash256 = openssl::sha256(um_dir),
+    hash256 = substr(hash256, 1, 8),
+    portal_run_id = glue("fake.{date_dir}{hash256}")
+  ) |>
+  select(-c(um_dir, date_dir, hash256, SampleID_tumor))
 dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |>
   purrr::map(readr::read_rds) |>
   bind_rows()
@@ -183,9 +196,10 @@ dat_s3_res <- dat_s3 |>
       .default = .data$type
     ),
     date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"),
-    date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne")
+    date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne"),
+    date_analysed_aest = as.character(.data$date_analysed_aest)
   ) |>
-  select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp)
+  select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
 dat_gds_res <- dat_gds |>
   mutate(
     type = case_when(
@@ -195,7 +209,7 @@ dat_gds_res <- dat_gds |>
     ),
     date_analysed_aest = as.character(.data$end),
   ) |>
-  select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id)
+  select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
 
 lims_s3 <- lims_raw |>
   filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |>
@@ -207,9 +221,13 @@ lims_gds <- lims_raw |>
   distinct()
 
 o1 <- dat_s3_res |>
-  left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID"))
-#################### UP TO HERE^ ###########################################
-
+  left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
+  mutate(
+    url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"),
+    sbj_url = glue("<a href={url}>{.data$SubjectID}</a>"),
+    url = glue("<a href={url}>{.data$url}</a>")
+  ) |>
+  rename(portal_url = url)
 o2 <- dat_gds_res |>
   left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
   mutate(
@@ -220,7 +238,11 @@ o2 <- dat_gds_res |>
   ) |>
   rename(portal_url = url)
 
+d <- list(s3 = o1, gds = o2) |>
+  bind_rows(.id = "s3_or_gds")
+
 dt_view <- function(x, scroll_y = 1000, ...) {
+  options(DT.TOJSON_ARGS = list(na = "string"))
   x |>
     mutate(across(where(is.character), as.factor)) |>
     DT::datatable(
@@ -239,10 +261,10 @@ dt_view <- function(x, scroll_y = 1000, ...) {
     )
 }
 
-qcsum <- o2 |>
+qcsum <- d |>
   filter(type == "UmQcSumFile") |>
   tidyr::unnest_wider(objp)
-hrd_chord <- o2 |>
+hrd_chord <- d |>
   filter(type == "UmChordTsvFile") |>
   tidyr::unnest_wider(objp) |>
   select(portal_run_id,
@@ -257,12 +279,12 @@ hrd_chord <- o2 |>
 #   filter(type == "UmHrdetectTsvFile") |>
 #   unnest_wider(objp) |>
 #   select(portal_run_id, hrdetect_prob = Probability)
-sigs_2015 <- o2 |>
+sigs_2015 <- d |>
   filter(type == "UmSigsSnvFile2015") |>
   tidyr::unnest_wider(objp) |>
   select(-c(type)) |>
   tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
-sigs_2020 <- o2 |>
+sigs_2020 <- d |>
   filter(type == "UmSigsSnvFile2020") |>
   tidyr::unnest_wider(objp) |>
   select(-c(type)) |>
@@ -274,7 +296,7 @@ sigs_2020 <- o2 |>
 ```{r final_tab}
 cols_select1 <- c(
   "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID",
-  "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal",
+  "ProjectOwner", "ProjectName", "Type", "Workflow",
   "hrd_chord", "hrd_hrdetect",
   "chord_hr_status", "chord_hrd_type", "chord_p_BRCA1", "chord_p_BRCA2",
   "qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf",
@@ -309,10 +331,13 @@ dsig_filt <- dsig |>
 dall <- qcsum |>
   left_join(hrd_chord, by = "portal_run_id") |>
   select(all_of(cols_select1), everything(), -c("type")) |>
-  left_join(dsig_filt, by = "portal_run_id")
+  left_join(dsig_filt, by = "portal_run_id") |>
+  relocate(sig_top2, .before = "hrd_chord") |>
+  relocate(s3_or_gds, .after = "SubjectID")
+dt_view(dall)
 ```
 
-```{r join_excel_layla}
+```{r join_excel_layla, eval=FALSE}
 excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |>
   readxl::read_xlsx(sheet = "All")
 excel_all |>
@@ -325,7 +350,7 @@ excel_all |>
 
 ### HRD Results
 
-```{r hrd_plot, fig.width=15, fig.height = 10}
+```{r hrd_plot, fig.width=15, fig.height = 15}
 p1 <- dall |>
   mutate(
     sbj = glue("{SubjectID}_{LibraryID_tumor}"),
@@ -348,9 +373,7 @@ plotly::ggplotly(p1)
 
 ### Signature Results
 
-**TODO**
-
-```{r fig.width = 15, fig.height=50, eval=FALSE}
+```{r fig.width = 15, fig.height=65, eval=TRUE}
 sig_order2015 <- paste0("Sig", 1:30)
 sig_order2020 <- paste0(
   "SBS",
@@ -366,8 +389,12 @@ sig_order2020 <- paste0(
   )
 )
 
-d2p <- d2 |>
-  # filter(Rank %in% c(1:5)) |>
+p2_prep <- dsig |>
+  filter(
+    Sig_group == "s2015",
+    Rank %in% c(1:3)
+  ) |>
+  left_join(dall |> select(portal_run_id, date_analysed_aest, SubjectID, LibraryID_tumor), by = "portal_run_id") |>
   mutate(
     sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")),
     date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S")
@@ -376,13 +403,14 @@ d2p <- d2 |>
     date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq
   ) |>
   mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020)))
-p2 <-
-  d2p |>
-  ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) +
+p2 <- p2_prep |>
+  filter(!grepl("ALLOCATE", sbj)) |> # get rid of ALLOCATE subject
+  ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) +
   ggplot2::geom_bar(position = "fill", stat = "identity") +
-  ggplot2::facet_wrap(~Sig_group, ncol = 1)
+  ggplot2::theme_bw(base_size = 7)
+# ggplot2::facet_wrap(~Sig_group, ncol = 1)
 
-plotly::ggplotly(p2)
+plotly::ggplotly(p2, tooltip = c("x", "text", "fill"))
 ```
 
 
@@ -391,25 +419,31 @@ plotly::ggplotly(p2)
 ### ProjectOwner
 
 ```{r ProjectOwner}
-count(d1, ProjectOwner) |> dt_view(scroll_y = 400)
+count(dall, ProjectOwner) |> dt_view(scroll_y = 400)
 ```
 
 ### ProjectName
 
 ```{r ProjectName}
-count(d1, ProjectName) |> dt_view(scroll_y = 400)
+count(dall, ProjectName) |> dt_view(scroll_y = 400)
 ```
 
 ### Type
 
 ```{r Type}
-count(d1, Type) |> dt_view(scroll_y = 400)
+count(dall, Type) |> dt_view(scroll_y = 400)
 ```
 
 ### Workflow
 
 ```{r Workflow}
-count(d1, Workflow) |> dt_view(scroll_y = 400)
+count(dall, Workflow) |> dt_view(scroll_y = 400)
+```
+
+### S3orGDS
+
+```{r s3orgds}
+count(dall, s3_or_gds) |> dt_view(scroll_y = 400)
 ```
 
 </div>