From 88100ebb018f84f6449f7d750e74eb38990d7d66 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 6 Sep 2023 17:03:33 +1000 Subject: [PATCH 01/10] sig breakdown --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 89 +++++++++++++++++---- 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 4689ad5..98091c2 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -64,7 +64,7 @@ knitr::opts_chunk$set( ``` ```{r data_setup, eval=FALSE} -options(width = 150) +# options(width = 150) token <- dracarys::ica_token_validate(Sys.getenv("ICA_ACCESS_TOKEN_PRO")) pmeta <- here("nogit/umccrise/rds/portal_meta/2023-09-04_pmeta_final.rds") |> readr::read_rds() @@ -194,17 +194,19 @@ hrd_chord <- o2 |> sigs_2015 <- o2 |> filter(type == "UmSigsSnvFile2015") |> unnest_wider(objp) |> - select(-c(type)) + select(-c(type)) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) sigs_2020 <- o2 |> filter(type == "UmSigsSnvFile2020") |> unnest_wider(objp) |> - select(-c(type)) + select(-c(type)) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) ``` ## umccrise Results ```{r final_tab} -cols_select <- c( +cols_select1 <- c( "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", "hrd_chord", "hrd_hrdetect", @@ -214,16 +216,36 @@ cols_select <- c( "deleted_genes_hmf", "tmb_hmf", "tml_hmf", "wgd_hmf", "hypermutated", "bpi_enabled", "portal_run_id", "portal_url" ) -d <- qcsum |> +cols_select2 <- c( + "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", + "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", + "Sig_group", "Rank", "Signature", "Contribution", "RelFreq", + "portal_run_id", "portal_url" +) +d1 <- qcsum |> dplyr::left_join(hrd_chord, by = "portal_run_id") |> - dplyr::select(dplyr::all_of(cols_select), dplyr::everything(), -c("type")) -dt_view(d, caption = "umccrise Results Summary") + dplyr::select(dplyr::all_of(cols_select1), dplyr::everything(), -c("type")) +dt_view(d1, caption = "umccrise Results Summary") + +d2 <- dplyr::bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> + dplyr::select(dplyr::all_of(cols_select2), dplyr::everything()) +dt_view(d2, caption = "Signature contributions (2015 and 2020)") + +d2 |> + group_by(date_analysed_aest, SubjectID, LibraryID_tumor, Sig_group) |> + mutate( + Signature_all = paste(glue("{Signature} ({RelFreq})"), collapse = ", ") + ) |> + ungroup() |> + select(date_analysed_aest:Sig_group, Signature_all) |> + distinct() |> + arrange(desc(date_analysed_aest), SubjectID) ``` ### HRD Results ```{r hrd_plot, fig.width=15, fig.height = 10} -p <- d |> +d1p <- d1 |> dplyr::mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") @@ -234,8 +256,8 @@ p <- d |> chord = hrd_chord, hrdetect = hrd_hrdetect, ) |> tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") -p1 <- p |> - ggplot(aes(x = date, y = probability, label = sbj)) + +p1 <- d1p |> + ggplot2::ggplot(aes(x = date, y = probability, label = sbj)) + ggplot2::geom_point(aes(colour = method)) + ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) + ggplot2::theme_bw() + @@ -244,31 +266,70 @@ p1 <- p |> plotly::ggplotly(p1) ``` +### Signature 
Results + +**TODO** + +```{r fig.width = 15, fig.height=50, eval=FALSE} +sig_order2015 <- paste0("Sig", 1:30) +sig_order2020 <- paste0( + "SBS", + c( + 1:6, + paste0(7, c("a", "b", "c", "d")), + 8:9, + paste0(10, c("a", "b", "c", "d")), + 11:16, + paste0(17, c("a", "b")), + 18:60, + 84:94 + ) +) + +d2p <- d2 |> + # dplyr::filter(Rank %in% c(1:5)) |> + dplyr::mutate( + sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")), + date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") + ) |> + dplyr::select( + date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq + ) |> + dplyr::mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) +p2 <- + d2p |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::facet_wrap(~Sig_group, ncol = 1) + +plotly::ggplotly(p2) +``` + ## Metadata Summary {.tabset .tabset-pills} ### ProjectOwner ```{r ProjectOwner} -count(d, ProjectOwner) |> dt_view(scroll_y = 400) +count(d1, ProjectOwner) |> dt_view(scroll_y = 400) ``` ### ProjectName ```{r ProjectName} -count(d, ProjectName) |> dt_view(scroll_y = 400) +count(d1, ProjectName) |> dt_view(scroll_y = 400) ``` ### Type ```{r Type} -count(d, Type) |> dt_view(scroll_y = 400) +count(d1, Type) |> dt_view(scroll_y = 400) ``` ### Workflow ```{r Workflow} -count(d, Workflow) |> dt_view(scroll_y = 400) +count(d1, Workflow) |> dt_view(scroll_y = 400) ``` From c351abddeac4821219681b2241278a5b87ea7fa2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Sep 2023 11:18:55 +1000 Subject: [PATCH 02/10] sigs: keep top 2 ranked --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 63 +++++++++++++-------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 98091c2..581cab1 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -47,10 +47,10 @@ knitr::opts_chunk$set( ```{r load_pkgs} { - require(dplyr) + require(dplyr) # import all dplyr funcs require(readr, include.only = c("read_rds")) require(purrr, include.only = c("map")) - require(tidyr, include.only = c("unnest", "unnest_wider")) + require(tidyr, include.only = c("unnest")) require(dracarys) require(glue, include.only = "glue") require(here, include.only = "here") @@ -88,14 +88,14 @@ saveRDS(gds_map, here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) parse_files <- function(gds_map, row_slice, rds_out) { start_time <- Sys.time() dat1 <- gds_map |> - dplyr::slice(row_slice) |> - dplyr::rowwise() |> - dplyr::mutate( + slice(row_slice) |> + rowwise() |> + mutate( gen = list(dracarys::dr_func_eval(.data$type)), obj = list(.data$gen$new(.data$presigned_url)), objp = list(.data$obj$read()) ) |> - dplyr::ungroup() + ungroup() end_time <- Sys.time() total_time <- end_time - start_time print(total_time) @@ -117,7 +117,7 @@ lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> readr::read_rds() dat1 <- fs::dir_ls(here("nogit/umccrise/rds/results")) |> purrr::map(readr::read_rds) |> - dplyr::bind_rows() + bind_rows() o <- dat1 |> mutate( @@ -139,24 +139,24 @@ o <- dat1 |> ) lims <- lims_raw |> - dplyr::filter(LibraryID %in% c(o$LibraryID_tumor)) |> - dplyr::select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> - dplyr::distinct() + filter(LibraryID %in% c(o$LibraryID_tumor)) |> + select(SubjectID, LibraryID, 
ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> + distinct() o2 <- o |> - dplyr::left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> - dplyr::mutate( + left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> + mutate( url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), sbj_url = glue("{.data$SubjectID}"), url = glue("{.data$url}"), portal_run_id = glue("dr.{portal_run_id}") ) |> - dplyr::rename(portal_url = url) + rename(portal_url = url) dt_view <- function(x, scroll_y = 1000, ...) { x |> - dplyr::mutate(across(where(is.character), as.factor)) |> + mutate(across(where(is.character), as.factor)) |> DT::datatable( filter = list(position = "top", clear = FALSE, plain = TRUE), class = "cell-border display compact", @@ -223,12 +223,27 @@ cols_select2 <- c( "portal_run_id", "portal_url" ) d1 <- qcsum |> - dplyr::left_join(hrd_chord, by = "portal_run_id") |> - dplyr::select(dplyr::all_of(cols_select1), dplyr::everything(), -c("type")) + left_join(hrd_chord, by = "portal_run_id") |> + select(all_of(cols_select1), everything(), -c("type")) dt_view(d1, caption = "umccrise Results Summary") -d2 <- dplyr::bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> - dplyr::select(dplyr::all_of(cols_select2), dplyr::everything()) +d2 <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> + select(all_of(cols_select2), everything()) + +d2_filt <- d2 |> + filter( + Sig_group == "s2015" + ) |> + group_by(portal_run_id) |> + mutate(tot_sig_vars = sum(Contribution)) |> + arrange(Rank) |> + slice_head(n = 2) |> + mutate(rn = row_number()) |> + ungroup() |> + mutate( + sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") + ) + dt_view(d2, caption = "Signature contributions (2015 and 2020)") d2 |> @@ -246,11 +261,11 @@ d2 |> ```{r hrd_plot, fig.width=15, fig.height = 10} d1p <- d1 |> - dplyr::mutate( + mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") ) |> - dplyr::select( + select( date, sbj, chord = hrd_chord, hrdetect = hrd_hrdetect, @@ -287,15 +302,15 @@ sig_order2020 <- paste0( ) d2p <- d2 |> - # dplyr::filter(Rank %in% c(1:5)) |> - dplyr::mutate( + # filter(Rank %in% c(1:5)) |> + mutate( sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") ) |> - dplyr::select( + select( date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq ) |> - dplyr::mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) + mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) p2 <- d2p |> ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) + From f013e05f9667c2905ccfc61f9eff502e0e6a04c3 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Sep 2023 15:27:39 +1000 Subject: [PATCH 03/10] cleanup --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 56 ++++++++------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 581cab1..a718c54 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -175,10 +175,10 @@ dt_view <- function(x, scroll_y = 1000, ...) 
{ qcsum <- o2 |> filter(type == "UmQcSumFile") |> - unnest_wider(objp) + tidyr::unnest_wider(objp) hrd_chord <- o2 |> filter(type == "UmChordTsvFile") |> - unnest_wider(objp) |> + tidyr::unnest_wider(objp) |> select(portal_run_id, # chord_p_hrd = p_hrd, chord_hr_status = hr_status, @@ -193,12 +193,12 @@ hrd_chord <- o2 |> # select(portal_run_id, hrdetect_prob = Probability) sigs_2015 <- o2 |> filter(type == "UmSigsSnvFile2015") |> - unnest_wider(objp) |> + tidyr::unnest_wider(objp) |> select(-c(type)) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) sigs_2020 <- o2 |> filter(type == "UmSigsSnvFile2020") |> - unnest_wider(objp) |> + tidyr::unnest_wider(objp) |> select(-c(type)) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) ``` @@ -216,21 +216,12 @@ cols_select1 <- c( "deleted_genes_hmf", "tmb_hmf", "tml_hmf", "wgd_hmf", "hypermutated", "bpi_enabled", "portal_run_id", "portal_url" ) -cols_select2 <- c( - "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", - "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", - "Sig_group", "Rank", "Signature", "Contribution", "RelFreq", - "portal_run_id", "portal_url" -) -d1 <- qcsum |> - left_join(hrd_chord, by = "portal_run_id") |> - select(all_of(cols_select1), everything(), -c("type")) -dt_view(d1, caption = "umccrise Results Summary") - -d2 <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> - select(all_of(cols_select2), everything()) +# signatures +dsig <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |> + select(portal_run_id, Sig_group, Rank, Signature, Contribution, RelFreq) -d2_filt <- d2 |> +# keep top two ranked sigs from 2015 +dsig_filt <- dsig |> filter( Sig_group == "s2015" ) |> @@ -238,29 +229,27 @@ d2_filt <- d2 |> mutate(tot_sig_vars = sum(Contribution)) |> arrange(Rank) |> slice_head(n = 2) |> - mutate(rn = row_number()) |> + # some sigs have same Rank so use explicit sig_rank + mutate(sig_rank = row_number()) |> ungroup() |> mutate( sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") - ) - -dt_view(d2, caption = "Signature contributions (2015 and 2020)") - -d2 |> - group_by(date_analysed_aest, SubjectID, LibraryID_tumor, Sig_group) |> - mutate( - Signature_all = paste(glue("{Signature} ({RelFreq})"), collapse = ", ") ) |> - ungroup() |> - select(date_analysed_aest:Sig_group, Signature_all) |> - distinct() |> - arrange(desc(date_analysed_aest), SubjectID) + select(portal_run_id, sig_rank, sig_summary) |> + tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> + mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> + select(portal_run_id, sig_top2) + +dall <- qcsum |> + left_join(hrd_chord, by = "portal_run_id") |> + select(all_of(cols_select1), everything(), -c("type")) |> + left_join(dsig_filt, by = "portal_run_id") ``` ### HRD Results ```{r hrd_plot, fig.width=15, fig.height = 10} -d1p <- d1 |> +p1 <- dall |> mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") @@ -270,8 +259,7 @@ d1p <- d1 |> sbj, chord = hrd_chord, hrdetect = hrd_hrdetect, ) |> - tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") -p1 <- d1p |> + tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") |> ggplot2::ggplot(aes(x = date, y = probability, label = sbj)) + ggplot2::geom_point(aes(colour = method)) 
+ ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) + From f0081fd7a991551a21cd37c24e1fb974c3237cd2 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 8 Sep 2023 17:01:36 +1000 Subject: [PATCH 04/10] join sigs with excel sheet --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index a718c54..4cec7c0 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -246,6 +246,17 @@ dall <- qcsum |> left_join(dsig_filt, by = "portal_run_id") ``` +```{r join_excel_layla} +excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |> + readxl::read_xlsx(sheet = "All") +excel_all |> + select("...1", portal_run_id) |> + left_join(dall |> select(portal_run_id, sig_top2)) |> + rename(N = "...1") |> + readr::write_csv("sigs_top2_2023-09-08.csv") +``` + + ### HRD Results ```{r hrd_plot, fig.width=15, fig.height = 10} From 13a0dd909e8192f3b9379d4d10fe42784e66c6bb Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 01:41:38 +1000 Subject: [PATCH 05/10] add aws s3 funcs --- .gitignore | 1 + R/s3.R | 67 +++++++++++++++++++++++++++++++++++++ inst/scripts/umccrise_run.R | 14 ++++++++ 3 files changed, 82 insertions(+) create mode 100644 R/s3.R diff --git a/.gitignore b/.gitignore index 88c5a3d..e2fec9e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] .Rproj.user +.Rhistory /nogit /docs diff --git a/R/s3.R b/R/s3.R new file mode 100644 index 0000000..e40ce38 --- /dev/null +++ b/R/s3.R @@ -0,0 +1,67 @@ +#' s3_files_list_filter_relevant("s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables/", presign = TRUE) +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE) { + assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) + pattern <- pattern %||% ".*" # keep all recognisable files by default + b <- sub("s3://(.*?)/.*", "\\1", s3dir) + p <- sub("s3://(.*?)/(.*)", "\\2", s3dir) + cmd <- glue( + "aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ", + "--max-items {max_items} --page-size {page_size}" + ) + l <- system(cmd, intern = TRUE) + j <- jsonlite::fromJSON(l) + assertthat::assert_that("Contents" %in% names(j)) + d <- j[["Contents"]] |> + tibble::as_tibble() |> + dplyr::mutate( + path = glue("s3://{b}/{.data$Key}"), + date1 = .data$LastModified, + size = fs::as_fs_bytes(.data$Size) + ) |> + dplyr::rowwise() |> + dplyr::mutate( + bname = basename(.data$path), + type = purrr::map_chr(.data$bname, match_regex) + ) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> + dplyr::select(path, date1, size, type) + + if (presign) { + d <- d |> + dplyr::rowwise() |> + dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path)) |> + dplyr::ungroup() + } + d +} + +s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { + p <- system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE) + p +} + +# search for files on S3 +s3_search <- function(search, rows) { + au_tz <- "Australia/Melbourne" + utc_tz <- "UTC" + base_url <- "https://api.portal.prod.umccr.org/iam/s3" + url1 <- utils::URLencode(glue::glue("{base_url}?rowsPerPage={rows}&search={search}")) + awscurl_cmd <- glue::glue( + "awscurl 
'{url1}' ", + "--header 'Accept: application/json'" + ) + message(glue::glue("Running {awscurl_cmd}")) + j <- system(awscurl_cmd, intern = TRUE) + date_fmt <- "%Y-%m-%dT%H:%M:%S" + d <- j |> + jsonlite::fromJSON() |> + purrr::pluck("results") |> + tibble::as_tibble() + d |> + dplyr::mutate( + date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), + date_aest = lubridate::with_tz(.data$date1, tz = au_tz) + ) |> + dplyr::select(path = key, bucket, size, date_aest, id, unique_hash) +} diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R index 8c40a6d..f76ba72 100644 --- a/inst/scripts/umccrise_run.R +++ b/inst/scripts/umccrise_run.R @@ -3,7 +3,9 @@ require(here) require(glue) require(dplyr) require(readr) +require(paws) +#---- GDS ----# # read last 1000 umccrise runs from portal # 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded date1 <- "2023-09-04" @@ -44,3 +46,15 @@ d # final portal meta for umccrise runs saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) + +#---- S3 ----# +pat <- "qc_summary.tsv.gz" +rows <- 1000 +d <- s3_search(search = pat, rows = rows) + +d |> + mutate( + dir1 = dirname(path), + dir1 = dirname(dir1) + ) |> + select(dir1, path) From ee6bd469507166199c969883786ec193e62d06a8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 10:15:02 +1000 Subject: [PATCH 06/10] doc s3_files_list_filter_relevant --- NAMESPACE | 1 + R/s3.R | 22 ++++++++++++++--- man/s3_files_list_filter_relevant.Rd | 37 ++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 man/s3_files_list_filter_relevant.Rd diff --git a/NAMESPACE b/NAMESPACE index 2291398..1355ba0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,6 +62,7 @@ export(multiqc_tidy_json) export(portal_meta_read) export(rdf2tab) export(read) +export(s3_files_list_filter_relevant) export(session_info_kable) export(time_metrics_process) export(tso_rmd) diff --git a/R/s3.R b/R/s3.R index e40ce38..9b92244 100644 --- a/R/s3.R +++ b/R/s3.R @@ -1,4 +1,20 @@ -#' s3_files_list_filter_relevant("s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables/", presign = TRUE) +#' List Relevant Files In S3 Directory +#' +#' Lists relevant files in a S3 directory. +#' +#' @param s3dir GDS directory. +#' @param pattern Pattern to further filter the returned file type tibble. +#' @param page_size The size of each page to get in the AWS service call (def: 1000). +#' @param max_items The total number of items to return in the command’s output (def: 1000). +#' @param presign Include presigned URLs (def: FALSE). +#' +#' @return A tibble with path, date, file size, file type, and presigned URL if requested. 
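+#' Note that presigning shells out to \code{aws s3 presign} once per object, so
+#' listing a large directory with \code{presign = TRUE} can take roughly a
+#' couple of seconds per file.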
+#' @examples +#' \dontrun{ +#' s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables" +#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +#' } +#' @export s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE) { assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) pattern <- pattern %||% ".*" # keep all recognisable files by default @@ -25,7 +41,7 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select(path, date1, size, type) + dplyr::select("path", "date1", "size", "type") if (presign) { d <- d |> @@ -63,5 +79,5 @@ s3_search <- function(search, rows) { date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), date_aest = lubridate::with_tz(.data$date1, tz = au_tz) ) |> - dplyr::select(path = key, bucket, size, date_aest, id, unique_hash) + dplyr::select(path = "key", "bucket", "size", "date_aest", "id", "unique_hash") } diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd new file mode 100644 index 0000000..f8af42e --- /dev/null +++ b/man/s3_files_list_filter_relevant.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_files_list_filter_relevant} +\alias{s3_files_list_filter_relevant} +\title{List Relevant Files In S3 Directory} +\usage{ +s3_files_list_filter_relevant( + s3dir, + pattern = NULL, + page_size = 1000, + max_items = 1000, + presign = FALSE +) +} +\arguments{ +\item{s3dir}{GDS directory.} + +\item{pattern}{Pattern to further filter the returned file type tibble.} + +\item{page_size}{The size of each page to get in the AWS service call (def: 1000).} + +\item{max_items}{The total number of items to return in the command’s output (def: 1000).} + +\item{presign}{Include presigned URLs (def: FALSE).} +} +\value{ +A tibble with path, date, file size, file type, and presigned URL if requested. +} +\description{ +Lists relevant files in a S3 directory. +} +\examples{ +\dontrun{ +s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables" +s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) +} +} From 0e1a53fb23fef63ee0507000c4314c6f1c814ce9 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 21:26:57 +1000 Subject: [PATCH 07/10] doc s3_search func --- NAMESPACE | 1 + R/s3.R | 35 +++++++++++++++++++++------- man/s3_files_list_filter_relevant.Rd | 4 ++-- man/s3_search.Rd | 26 +++++++++++++++++++++ 4 files changed, 55 insertions(+), 11 deletions(-) create mode 100644 man/s3_search.Rd diff --git a/NAMESPACE b/NAMESPACE index 1355ba0..688287c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -63,6 +63,7 @@ export(portal_meta_read) export(rdf2tab) export(read) export(s3_files_list_filter_relevant) +export(s3_search) export(session_info_kable) export(time_metrics_process) export(tso_rmd) diff --git a/R/s3.R b/R/s3.R index 9b92244..eef476c 100644 --- a/R/s3.R +++ b/R/s3.R @@ -1,6 +1,6 @@ -#' List Relevant Files In S3 Directory +#' List Relevant Files In AWS S3 Directory #' -#' Lists relevant files in a S3 directory. +#' Lists relevant files in an AWS S3 directory. #' #' @param s3dir GDS directory. 
#' @param pattern Pattern to further filter the returned file type tibble. @@ -57,17 +57,32 @@ s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) { p } -# search for files on S3 -s3_search <- function(search, rows) { +#' Search AWS S3 Objects +#' +#' Searches for the given pattern in the UMCCR `umccr-primary-data-prod` AWS S3 +#' bucket. +#' +#' @param pat Pattern to search for (e.g. 'multiqc_data.json'). +#' @param rows Max number of rows to return. +#' +#' @return Tibble with S3 path, object size, date modified, id, unique hash. +#' +#' @examples +#' \dontrun{ +#' pat <- "qc_summary.tsv.gz" +#' s3_search(pat, 10) +#' } +#' @export +s3_search <- function(pat, rows) { au_tz <- "Australia/Melbourne" utc_tz <- "UTC" base_url <- "https://api.portal.prod.umccr.org/iam/s3" - url1 <- utils::URLencode(glue::glue("{base_url}?rowsPerPage={rows}&search={search}")) - awscurl_cmd <- glue::glue( + url1 <- utils::URLencode(glue("{base_url}?rowsPerPage={rows}&search={pat}")) + awscurl_cmd <- glue( "awscurl '{url1}' ", "--header 'Accept: application/json'" ) - message(glue::glue("Running {awscurl_cmd}")) + message(glue("Running {awscurl_cmd}")) j <- system(awscurl_cmd, intern = TRUE) date_fmt <- "%Y-%m-%dT%H:%M:%S" d <- j |> @@ -77,7 +92,9 @@ s3_search <- function(search, rows) { d |> dplyr::mutate( date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt), - date_aest = lubridate::with_tz(.data$date1, tz = au_tz) + date_aest = lubridate::with_tz(.data$date1, tz = au_tz), + path = glue("s3://{bucket}/{key}"), + size = fs::as_fs_bytes(.data$size) ) |> - dplyr::select(path = "key", "bucket", "size", "date_aest", "id", "unique_hash") + dplyr::select("path", "size", "date_aest", "id", "unique_hash") } diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index f8af42e..0578bce 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/s3.R \name{s3_files_list_filter_relevant} \alias{s3_files_list_filter_relevant} -\title{List Relevant Files In S3 Directory} +\title{List Relevant Files In AWS S3 Directory} \usage{ s3_files_list_filter_relevant( s3dir, @@ -27,7 +27,7 @@ s3_files_list_filter_relevant( A tibble with path, date, file size, file type, and presigned URL if requested. } \description{ -Lists relevant files in a S3 directory. +Lists relevant files in an AWS S3 directory. } \examples{ \dontrun{ diff --git a/man/s3_search.Rd b/man/s3_search.Rd new file mode 100644 index 0000000..c0d9f64 --- /dev/null +++ b/man/s3_search.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/s3.R +\name{s3_search} +\alias{s3_search} +\title{Search AWS S3 Objects} +\usage{ +s3_search(pat, rows) +} +\arguments{ +\item{pat}{Pattern to search for (e.g. 'multiqc_data.json').} + +\item{rows}{Max number of rows to return.} +} +\value{ +Tibble with S3 path, object size, date modified, id, unique hash. +} +\description{ +Searches for the given pattern in the UMCCR \code{umccr-primary-data-prod} AWS S3 +bucket. 
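Under the hood it shells out to \code{awscurl} against the UMCCR data portal
API, so working AWS credentials are assumed.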
+} +\examples{ +\dontrun{ +pat <- "qc_summary.tsv.gz" +s3_search(pat, 10) +} +} From 77d59f3dc696ff1afae3086df14df2069de278a0 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 12 Sep 2023 23:56:37 +1000 Subject: [PATCH 08/10] generate s3 presigned urls --- R/s3.R | 9 ++-- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 51 +++++++++++++++++++-- inst/scripts/umccrise_run.R | 26 ++++++++--- man/s3_files_list_filter_relevant.Rd | 5 +- 4 files changed, 77 insertions(+), 14 deletions(-) diff --git a/R/s3.R b/R/s3.R index eef476c..c798a70 100644 --- a/R/s3.R +++ b/R/s3.R @@ -7,6 +7,7 @@ #' @param page_size The size of each page to get in the AWS service call (def: 1000). #' @param max_items The total number of items to return in the command’s output (def: 1000). #' @param presign Include presigned URLs (def: FALSE). +#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)). #' #' @return A tibble with path, date, file size, file type, and presigned URL if requested. #' @examples @@ -15,7 +16,7 @@ #' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE) #' } #' @export -s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE) { +s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) { assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign)) pattern <- pattern %||% ".*" # keep all recognisable files by default b <- sub("s3://(.*?)/.*", "\\1", s3dir) @@ -31,7 +32,7 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 tibble::as_tibble() |> dplyr::mutate( path = glue("s3://{b}/{.data$Key}"), - date1 = .data$LastModified, + date_utc = .data$LastModified, size = fs::as_fs_bytes(.data$Size) ) |> dplyr::rowwise() |> @@ -41,12 +42,12 @@ s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 100 ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> - dplyr::select("path", "date1", "size", "type") + dplyr::select("path", "date_utc", "size", "type") if (presign) { d <- d |> dplyr::rowwise() |> - dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path)) |> + dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |> dplyr::ungroup() } d diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 4cec7c0..788f196 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -64,7 +64,48 @@ knitr::opts_chunk$set( ``` ```{r data_setup, eval=FALSE} -# options(width = 150) +#---- S3 ----# +s3 <- here::here(glue::glue("nogit/umccrise/rds/portal_meta/2023-09-12_pmeta_s3.rds")) |> + readr::read_rds() +s3_get_presigned1 <- function(x, row_slice) { + start_time <- Sys.time() + s3_map <- x |> + slice(row_slice) |> + rowwise() |> + mutate( + s3_contents = list(s3_files_list_filter_relevant( + s3dir = .data$dir1, presign = TRUE + )) + ) |> + ungroup() |> + tidyr::unnest("s3_contents") |> + select( + "SubjectID", "LibraryID_tumor", "SampleID_tumor", + "date_utc", "type", "size", "path", "presigned_url" + ) + end_time <- Sys.time() + total_time <- end_time - start_time + print(total_time) + s3_map +} +# 2 seconds per row +s3_map1 <- s3_get_presigned1(s3, 1:100) +s3_map2 <- s3_get_presigned1(s3, 101:200) +s3_map3 <- s3_get_presigned1(s3, 201:300) +s3_map4 <- s3_get_presigned1(s3, 
301:400) +s3_map5 <- s3_get_presigned1(s3, 401:449) + +saveRDS(s3_map1, here("nogit/umccrise/rds/s3/map1_2023-09-12.rds")) +saveRDS(s3_map2, here("nogit/umccrise/rds/s3/map2_2023-09-12.rds")) +saveRDS(s3_map3, here("nogit/umccrise/rds/s3/map3_2023-09-12.rds")) +saveRDS(s3_map4, here("nogit/umccrise/rds/s3/map4_2023-09-12.rds")) +saveRDS(s3_map5, here("nogit/umccrise/rds/s3/map5_2023-09-12.rds")) +s3_map <- fs::dir_ls(here("nogit/umccrise/rds/s3"), regexp = "map.*rds") |> + purrr::map(readr::read_rds) |> + bind_rows() +saveRDS(s3_map, here("nogit/umccrise/rds/s3_map_2023-09-12.rds")) + +#---- GDS ----# token <- dracarys::ica_token_validate(Sys.getenv("ICA_ACCESS_TOKEN_PRO")) pmeta <- here("nogit/umccrise/rds/portal_meta/2023-09-04_pmeta_final.rds") |> readr::read_rds() @@ -84,10 +125,12 @@ gds_map <- pmeta |> filter(type != "MultiqcFile") saveRDS(gds_map, here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) +``` -parse_files <- function(gds_map, row_slice, rds_out) { +```{r data_parse, eval=FALSE} +parse_files <- function(x, row_slice, rds_out) { start_time <- Sys.time() - dat1 <- gds_map |> + dat1 <- x |> slice(row_slice) |> rowwise() |> mutate( @@ -102,6 +145,8 @@ parse_files <- function(gds_map, row_slice, rds_out) { readr::write_rds(x = dat1, file = rds_out) } +rds_pa + gds_map <- readr::read_rds(here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) rds_path_out <- here("nogit/umccrise/rds/results") x0 <- parse_files(gds_map, 1:10, file.path(rds_path_out, "x0.rds")) diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R index f76ba72..6602d87 100644 --- a/inst/scripts/umccrise_run.R +++ b/inst/scripts/umccrise_run.R @@ -3,7 +3,6 @@ require(here) require(glue) require(dplyr) require(readr) -require(paws) #---- GDS ----# # read last 1000 umccrise runs from portal @@ -45,16 +44,31 @@ d <- pmeta |> d # final portal meta for umccrise runs +# columns: +# "id", "wfr_name", "wfr_id", "version", "end_status", "start", "end", "portal_run_id", +# "SubjectID", "LibraryID_tumor", "LibraryID_normal", "SampleID_tumor", "SampleID_normal", +# "gds_outdir_umccrise", "gds_indir_dragen_somatic", "gds_indir_dragen_germline", "gds_infile_genomes_tar" saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds"))) #---- S3 ----# pat <- "qc_summary.tsv.gz" rows <- 1000 -d <- s3_search(search = pat, rows = rows) +d_s3_raw <- dracarys::s3_search(pat = pat, rows = rows) -d |> +d_s3 <- d_s3_raw |> + arrange(desc(date_aest)) |> mutate( - dir1 = dirname(path), - dir1 = dirname(dir1) + bname = basename(path), + dir1 = dirname(path), # path/to/dirA/cancer_report_tables + dir2 = basename(dirname(dir1)), # dirA + sbj_samp_lib = sub(".*__(.*)", "\\1", dir2), + SubjectID = sub("(SBJ[0-9]{5})_.*", "\\1", sbj_samp_lib), + SampleID_tumor = sub("SBJ.*?_(.*?)_.*", "\\1", sbj_samp_lib), + LibraryID_tumor = sub("SBJ.*?_.*?_(.*)", "\\1", sbj_samp_lib), + rerun = grepl("rerun", .data$LibraryID_tumor) ) |> - select(dir1, path) + select(dir1, SubjectID, LibraryID_tumor, SampleID_tumor, date = date_aest, rerun) + +date2 <- "2023-09-12" +saveRDS(d_s3, file = here(glue("nogit/umccrise/rds/portal_meta/{date2}_pmeta_s3.rds"))) +# now we have S3 paths and metadata, so all we need is to generate presigned URLs and read the data diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd index 0578bce..1194eea 100644 --- a/man/s3_files_list_filter_relevant.Rd +++ b/man/s3_files_list_filter_relevant.Rd @@ -9,7 +9,8 @@ s3_files_list_filter_relevant( pattern = NULL, page_size = 
1000, max_items = 1000, - presign = FALSE + presign = FALSE, + expiry_sec = 43200 ) } \arguments{ @@ -22,6 +23,8 @@ s3_files_list_filter_relevant( \item{max_items}{The total number of items to return in the command’s output (def: 1000).} \item{presign}{Include presigned URLs (def: FALSE).} + +\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).} } \value{ A tibble with path, date, file size, file type, and presigned URL if requested. From e33d275c18636ab855d4161fe0d7f56d15d78b77 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 13 Sep 2023 01:01:32 +1000 Subject: [PATCH 09/10] ingest s3 files --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 55 ++++++++++++++------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 788f196..4905c4a 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -145,10 +145,18 @@ parse_files <- function(x, row_slice, rds_out) { readr::write_rds(x = dat1, file = rds_out) } -rds_pa +rds_path_out <- here::here("nogit/umccrise/rds/results") +#---- S3 ----# +s3_map <- readr::read_rds(here("nogit/umccrise/rds/s3_map_2023-09-12.rds")) +s0 <- parse_files(s3_map, 1:10, file.path(rds_path_out, "s0.rds")) +s1 <- parse_files(s3_map, 1:500, file.path(rds_path_out, "s1.rds")) +s2 <- parse_files(s3_map, 501:1000, file.path(rds_path_out, "s2.rds")) +s3 <- parse_files(s3_map, 1001:1500, file.path(rds_path_out, "s3.rds")) +s4 <- parse_files(s3_map, 1501:2000, file.path(rds_path_out, "s4.rds")) +s5 <- parse_files(s3_map, 2001:2245, file.path(rds_path_out, "s5.rds")) +#---- GDS ----# gds_map <- readr::read_rds(here("nogit/umccrise/rds/gds_map_2023-09-05.rds")) -rds_path_out <- here("nogit/umccrise/rds/results") x0 <- parse_files(gds_map, 1:10, file.path(rds_path_out, "x0.rds")) x1 <- parse_files(gds_map, 1:500, file.path(rds_path_out, "x1.rds")) x2 <- parse_files(gds_map, 501:1000, file.path(rds_path_out, "x2.rds")) @@ -160,11 +168,25 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds")) ```{r data_load} lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> readr::read_rds() -dat1 <- fs::dir_ls(here("nogit/umccrise/rds/results")) |> +dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |> + purrr::map(readr::read_rds) |> + bind_rows() +dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |> purrr::map(readr::read_rds) |> bind_rows() -o <- dat1 |> +dat_s3_res <- dat_s3 |> + mutate( + type = case_when( + grepl("snv_2015.tsv.gz", path) ~ "UmSigsSnvFile2015", + grepl("snv_2020.tsv.gz", path) ~ "UmSigsSnvFile2020", + .default = .data$type + ), + date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"), + date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne") + ) |> + select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp) +dat_gds_res <- dat_gds |> mutate( type = case_when( grepl("snv_2015.tsv.gz", bname) ~ "UmSigsSnvFile2015", @@ -173,24 +195,23 @@ o <- dat1 |> ), date_analysed_aest = as.character(.data$end), ) |> - select( - date_analysed_aest, - SubjectID, - LibraryID_tumor, - LibraryID_normal, - type, - objp, - portal_run_id - ) + select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id) -lims <- lims_raw |> - filter(LibraryID %in% 
c(o$LibraryID_tumor)) |> +lims_s3 <- lims_raw |> + filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |> + select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> + distinct() +lims_gds <- lims_raw |> + filter(LibraryID %in% c(dat_gds_res$LibraryID_tumor)) |> select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |> distinct() +o1 <- dat_s3_res |> + left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) +#################### UP TO HERE^ ########################################### -o2 <- o |> - left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> +o2 <- dat_gds_res |> + left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> mutate( url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), sbj_url = glue("{.data$SubjectID}"), From 045c9cb51c0a9b5778c838707b6f7ec2161a1111 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 13 Sep 2023 17:12:44 +1000 Subject: [PATCH 10/10] update umccrise multi reporter with s3 results --- inst/rmd/umccr_workflows/umccrise/multi.Rmd | 92 ++++++++++++++------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd index 4905c4a..21b7bb2 100644 --- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd +++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd @@ -60,6 +60,7 @@ knitr::opts_chunk$set( require(ggplot2, include.only = c("ggplot", "aes")) require(lubridate, include.only = c("as_datetime")) require(plotly, include.only = c("ggplotly")) + require(openssl, include.only = c("sha256")) } ``` @@ -168,9 +169,21 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds")) ```{r data_load} lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |> readr::read_rds() -dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |> +dat_s3_raw <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |> purrr::map(readr::read_rds) |> bind_rows() +# create sha256 for umccrise directory to distinguish between runs +# keep first 8 digits and append to umccrise date folder. 
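+# e.g. an um_dir with date folder 2021-07-26 hashing to "a1b2c3d4..." (hash
+# value illustrative) yields portal_run_id "fake.20210726a1b2c3d4", keeping
+# S3 runs distinct and shaped like the dr.<portal_run_id> ids used for GDS.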
+dat_s3 <- dat_s3_raw |> + mutate( + um_dir = sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path), + date_dir = basename(dirname(dirname(um_dir))), + date_dir = gsub("-", "", date_dir), + hash256 = openssl::sha256(um_dir), + hash256 = substr(hash256, 1, 8), + portal_run_id = glue("fake.{date_dir}{hash256}") + ) |> + select(-c(um_dir, date_dir, hash256, SampleID_tumor)) dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |> purrr::map(readr::read_rds) |> bind_rows() @@ -183,9 +196,10 @@ dat_s3_res <- dat_s3 |> .default = .data$type ), date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"), - date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne") + date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne"), + date_analysed_aest = as.character(.data$date_analysed_aest) ) |> - select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp) + select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id) dat_gds_res <- dat_gds |> mutate( type = case_when( @@ -195,7 +209,7 @@ dat_gds_res <- dat_gds |> ), date_analysed_aest = as.character(.data$end), ) |> - select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id) + select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id) lims_s3 <- lims_raw |> filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |> @@ -207,9 +221,13 @@ lims_gds <- lims_raw |> distinct() o1 <- dat_s3_res |> - left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) -#################### UP TO HERE^ ########################################### - + left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> + mutate( + url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"), + sbj_url = glue("{.data$SubjectID}"), + url = glue("{.data$url}") + ) |> + rename(portal_url = url) o2 <- dat_gds_res |> left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |> mutate( @@ -220,7 +238,11 @@ o2 <- dat_gds_res |> ) |> rename(portal_url = url) +d <- list(s3 = o1, gds = o2) |> + bind_rows(.id = "s3_or_gds") + dt_view <- function(x, scroll_y = 1000, ...) { + options(DT.TOJSON_ARGS = list(na = "string")) x |> mutate(across(where(is.character), as.factor)) |> DT::datatable( @@ -239,10 +261,10 @@ dt_view <- function(x, scroll_y = 1000, ...) 
{ ) } -qcsum <- o2 |> +qcsum <- d |> filter(type == "UmQcSumFile") |> tidyr::unnest_wider(objp) -hrd_chord <- o2 |> +hrd_chord <- d |> filter(type == "UmChordTsvFile") |> tidyr::unnest_wider(objp) |> select(portal_run_id, @@ -257,12 +279,12 @@ hrd_chord <- o2 |> # filter(type == "UmHrdetectTsvFile") |> # unnest_wider(objp) |> # select(portal_run_id, hrdetect_prob = Probability) -sigs_2015 <- o2 |> +sigs_2015 <- d |> filter(type == "UmSigsSnvFile2015") |> tidyr::unnest_wider(objp) |> select(-c(type)) |> tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) -sigs_2020 <- o2 |> +sigs_2020 <- d |> filter(type == "UmSigsSnvFile2020") |> tidyr::unnest_wider(objp) |> select(-c(type)) |> @@ -274,7 +296,7 @@ sigs_2020 <- o2 |> ```{r final_tab} cols_select1 <- c( "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID", - "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal", + "ProjectOwner", "ProjectName", "Type", "Workflow", "hrd_chord", "hrd_hrdetect", "chord_hr_status", "chord_hrd_type", "chord_p_BRCA1", "chord_p_BRCA2", "qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf", @@ -309,10 +331,13 @@ dsig_filt <- dsig |> dall <- qcsum |> left_join(hrd_chord, by = "portal_run_id") |> select(all_of(cols_select1), everything(), -c("type")) |> - left_join(dsig_filt, by = "portal_run_id") + left_join(dsig_filt, by = "portal_run_id") |> + relocate(sig_top2, .before = "hrd_chord") |> + relocate(s3_or_gds, .after = "SubjectID") +dt_view(dall) ``` -```{r join_excel_layla} +```{r join_excel_layla, eval=FALSE} excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |> readxl::read_xlsx(sheet = "All") excel_all |> @@ -325,7 +350,7 @@ excel_all |> ### HRD Results -```{r hrd_plot, fig.width=15, fig.height = 10} +```{r hrd_plot, fig.width=15, fig.height = 15} p1 <- dall |> mutate( sbj = glue("{SubjectID}_{LibraryID_tumor}"), @@ -348,9 +373,7 @@ plotly::ggplotly(p1) ### Signature Results -**TODO** - -```{r fig.width = 15, fig.height=50, eval=FALSE} +```{r fig.width = 15, fig.height=65, eval=TRUE} sig_order2015 <- paste0("Sig", 1:30) sig_order2020 <- paste0( "SBS", @@ -366,8 +389,12 @@ sig_order2020 <- paste0( ) ) -d2p <- d2 |> - # filter(Rank %in% c(1:5)) |> +p2_prep <- dsig |> + filter( + Sig_group == "s2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed_aest, SubjectID, LibraryID_tumor), by = "portal_run_id") |> mutate( sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")), date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S") @@ -376,13 +403,14 @@ d2p <- d2 |> date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq ) |> mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020))) -p2 <- - d2p |> - ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) + +p2 <- p2_prep |> + filter(!grepl("ALLOCATE", sbj)) |> # get rid of ALLOCATE subject + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + ggplot2::geom_bar(position = "fill", stat = "identity") + - ggplot2::facet_wrap(~Sig_group, ncol = 1) + ggplot2::theme_bw(base_size = 7) +# ggplot2::facet_wrap(~Sig_group, ncol = 1) -plotly::ggplotly(p2) +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) ``` @@ -391,25 +419,31 @@ plotly::ggplotly(p2) ### ProjectOwner ```{r ProjectOwner} -count(d1, ProjectOwner) |> dt_view(scroll_y = 400) +count(dall, ProjectOwner) |> dt_view(scroll_y = 400) ``` ### ProjectName ```{r ProjectName} -count(d1, 
ProjectName) |> dt_view(scroll_y = 400) +count(dall, ProjectName) |> dt_view(scroll_y = 400) ``` ### Type ```{r Type} -count(d1, Type) |> dt_view(scroll_y = 400) +count(dall, Type) |> dt_view(scroll_y = 400) ``` ### Workflow ```{r Workflow} -count(d1, Workflow) |> dt_view(scroll_y = 400) +count(dall, Workflow) |> dt_view(scroll_y = 400) +``` + +### S3orGDS + +```{r s3orgds} +count(dall, s3_or_gds) |> dt_view(scroll_y = 400) ```
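
For reference, patches 05-10 add up to the following end-to-end S3 ingestion flow; the sketch below strings the pieces together the same way `parse_files()` in multi.Rmd does. The search pattern, row limit, and one-hour expiry are illustrative, and working AWS credentials plus `awscurl` on the PATH are assumed.

```r
require(dplyr)
require(dracarys)

# 1. find recent umccrise QC summaries via the portal API (shells out to awscurl)
hits <- dracarys::s3_search(pat = "qc_summary.tsv.gz", rows = 10)

# 2. list and presign all recognisable files in one run's cancer_report_tables dir
s3dir <- dirname(hits$path[1]) # e.g. s3://.../umccrised/<sbj>/cancer_report_tables
urls <- dracarys::s3_files_list_filter_relevant(
  s3dir = s3dir, presign = TRUE, expiry_sec = 3600
)

# 3. parse each file with its matching dracarys parser, as parse_files() does
parsed <- urls |>
  rowwise() |>
  mutate(
    gen = list(dracarys::dr_func_eval(.data$type)),
    obj = list(.data$gen$new(.data$presigned_url)),
    objp = list(.data$obj$read())
  ) |>
  ungroup()
```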