Skip to content

Commit

Permalink
update umccrise multi reporter with s3 results
Browse files Browse the repository at this point in the history
  • Loading branch information
pdiakumis committed Sep 13, 2023
1 parent e33d275 commit 045c9cb
Showing 1 changed file with 63 additions and 29 deletions.
92 changes: 63 additions & 29 deletions inst/rmd/umccr_workflows/umccrise/multi.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ knitr::opts_chunk$set(
require(ggplot2, include.only = c("ggplot", "aes"))
require(lubridate, include.only = c("as_datetime"))
require(plotly, include.only = c("ggplotly"))
require(openssl, include.only = c("sha256"))
}
```

Expand Down Expand Up @@ -168,9 +169,21 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds"))
```{r data_load}
lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |>
readr::read_rds()
dat_s3 <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
dat_s3_raw <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
purrr::map(readr::read_rds) |>
bind_rows()
# Build a synthetic portal_run_id for every S3 result row so that separate
# umccrise runs can be told apart.
# The id has the form "fake.<date_dir><hash>", where <date_dir> is the date
# folder name with dashes removed and <hash> is the first 8 characters of the
# SHA-256 hash of the umccrise directory path.
dat_s3 <- dat_s3_raw |>
mutate(
# umccrise directory: the part of `path` between the bucket prefix and
# "/cancer_report_tables/"
um_dir = sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path),
# name of the directory two levels above um_dir, used as the date folder
date_dir = basename(dirname(dirname(um_dir))),
# strip the dashes from the date folder name
date_dir = gsub("-", "", date_dir),
# hash the umccrise directory path and keep only its first 8 characters
hash256 = openssl::sha256(um_dir),
hash256 = substr(hash256, 1, 8),
portal_run_id = glue("fake.{date_dir}{hash256}")
) |>
# drop the intermediate helper columns, and SampleID_tumor along with them
select(-c(um_dir, date_dir, hash256, SampleID_tumor))
# Load the GDS results: read every file under nogit/umccrise/rds/results whose
# path matches the regex "x[1-5]{1}.rds" and stack them row-wise into a single
# data frame.
dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |>
purrr::map(readr::read_rds) |>
bind_rows()
Expand All @@ -183,9 +196,10 @@ dat_s3_res <- dat_s3 |>
.default = .data$type
),
date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"),
date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne")
date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne"),
date_analysed_aest = as.character(.data$date_analysed_aest)
) |>
select(date_analysed_aest, SubjectID, LibraryID_tumor, SampleID_tumor, type, objp)
select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
dat_gds_res <- dat_gds |>
mutate(
type = case_when(
Expand All @@ -195,7 +209,7 @@ dat_gds_res <- dat_gds |>
),
date_analysed_aest = as.character(.data$end),
) |>
select(date_analysed_aest, SubjectID, LibraryID_tumor, LibraryID_normal, type, objp, portal_run_id)
select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
lims_s3 <- lims_raw |>
filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |>
Expand All @@ -207,9 +221,13 @@ lims_gds <- lims_raw |>
distinct()
o1 <- dat_s3_res |>
left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID"))
#################### UP TO HERE^ ###########################################
left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
mutate(
url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"),
sbj_url = glue("<a href={url}>{.data$SubjectID}</a>"),
url = glue("<a href={url}>{.data$url}</a>")
) |>
rename(portal_url = url)
o2 <- dat_gds_res |>
left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
mutate(
Expand All @@ -220,7 +238,11 @@ o2 <- dat_gds_res |>
) |>
rename(portal_url = url)
# Stack the S3 (o1) and GDS (o2) tables into one data frame. The list names
# ("s3" / "gds") are written to a new `s3_or_gds` column, so each row keeps a
# record of which source it came from.
d <- list(s3 = o1, gds = o2) |>
bind_rows(.id = "s3_or_gds")
dt_view <- function(x, scroll_y = 1000, ...) {
options(DT.TOJSON_ARGS = list(na = "string"))
x |>
mutate(across(where(is.character), as.factor)) |>
DT::datatable(
Expand All @@ -239,10 +261,10 @@ dt_view <- function(x, scroll_y = 1000, ...) {
)
}
qcsum <- o2 |>
qcsum <- d |>
filter(type == "UmQcSumFile") |>
tidyr::unnest_wider(objp)
hrd_chord <- o2 |>
hrd_chord <- d |>
filter(type == "UmChordTsvFile") |>
tidyr::unnest_wider(objp) |>
select(portal_run_id,
Expand All @@ -257,12 +279,12 @@ hrd_chord <- o2 |>
# filter(type == "UmHrdetectTsvFile") |>
# unnest_wider(objp) |>
# select(portal_run_id, hrdetect_prob = Probability)
sigs_2015 <- o2 |>
sigs_2015 <- d |>
filter(type == "UmSigsSnvFile2015") |>
tidyr::unnest_wider(objp) |>
select(-c(type)) |>
tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
sigs_2020 <- o2 |>
sigs_2020 <- d |>
filter(type == "UmSigsSnvFile2020") |>
tidyr::unnest_wider(objp) |>
select(-c(type)) |>
Expand All @@ -274,7 +296,7 @@ sigs_2020 <- o2 |>
```{r final_tab}
cols_select1 <- c(
"date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID",
"ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal",
"ProjectOwner", "ProjectName", "Type", "Workflow",
"hrd_chord", "hrd_hrdetect",
"chord_hr_status", "chord_hrd_type", "chord_p_BRCA1", "chord_p_BRCA2",
"qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf",
Expand Down Expand Up @@ -309,10 +331,13 @@ dsig_filt <- dsig |>
dall <- qcsum |>
left_join(hrd_chord, by = "portal_run_id") |>
select(all_of(cols_select1), everything(), -c("type")) |>
left_join(dsig_filt, by = "portal_run_id")
left_join(dsig_filt, by = "portal_run_id") |>
relocate(sig_top2, .before = "hrd_chord") |>
relocate(s3_or_gds, .after = "SubjectID")
dt_view(dall)
```

```{r join_excel_layla}
```{r join_excel_layla, eval=FALSE}
excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |>
readxl::read_xlsx(sheet = "All")
excel_all |>
Expand All @@ -325,7 +350,7 @@ excel_all |>

### HRD Results

```{r hrd_plot, fig.width=15, fig.height = 10}
```{r hrd_plot, fig.width=15, fig.height = 15}
p1 <- dall |>
mutate(
sbj = glue("{SubjectID}_{LibraryID_tumor}"),
Expand All @@ -348,9 +373,7 @@ plotly::ggplotly(p1)

### Signature Results

**TODO**

```{r fig.width = 15, fig.height=50, eval=FALSE}
```{r fig.width = 15, fig.height=65, eval=TRUE}
sig_order2015 <- paste0("Sig", 1:30)
sig_order2020 <- paste0(
"SBS",
Expand All @@ -366,8 +389,12 @@ sig_order2020 <- paste0(
)
)
d2p <- d2 |>
# filter(Rank %in% c(1:5)) |>
p2_prep <- dsig |>
filter(
Sig_group == "s2015",
Rank %in% c(1:3)
) |>
left_join(dall |> select(portal_run_id, date_analysed_aest, SubjectID, LibraryID_tumor), by = "portal_run_id") |>
mutate(
sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")),
date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S")
Expand All @@ -376,13 +403,14 @@ d2p <- d2 |>
date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq
) |>
mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020)))
p2 <-
d2p |>
ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, label = sbj)) +
p2 <- p2_prep |>
filter(!grepl("ALLOCATE", sbj)) |> # get rid of ALLOCATE subject
ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) +
ggplot2::geom_bar(position = "fill", stat = "identity") +
ggplot2::facet_wrap(~Sig_group, ncol = 1)
ggplot2::theme_bw(base_size = 7)
# ggplot2::facet_wrap(~Sig_group, ncol = 1)
plotly::ggplotly(p2)
plotly::ggplotly(p2, tooltip = c("x", "text", "fill"))
```


Expand All @@ -391,25 +419,31 @@ plotly::ggplotly(p2)
### ProjectOwner

```{r ProjectOwner}
count(d1, ProjectOwner) |> dt_view(scroll_y = 400)
count(dall, ProjectOwner) |> dt_view(scroll_y = 400)
```

### ProjectName

```{r ProjectName}
count(d1, ProjectName) |> dt_view(scroll_y = 400)
count(dall, ProjectName) |> dt_view(scroll_y = 400)
```

### Type

```{r Type}
count(d1, Type) |> dt_view(scroll_y = 400)
count(dall, Type) |> dt_view(scroll_y = 400)
```

### Workflow

```{r Workflow}
count(d1, Workflow) |> dt_view(scroll_y = 400)
count(dall, Workflow) |> dt_view(scroll_y = 400)
```

### S3orGDS

```{r s3orgds}
# Tabulate how many rows of `dall` fall under each s3_or_gds value.
count(dall, s3_or_gds) |> dt_view(scroll_y = 400)
```

</div>
Expand Down

0 comments on commit 045c9cb

Please sign in to comment.