Skip to content

Commit

Permalink
Merge pull request #181 from umccr/issues-180/cn
Browse files Browse the repository at this point in the history
Handle zero CN gains/losses
  • Loading branch information
pdiakumis authored Dec 3, 2024
2 parents 9d0c699 + b866bb2 commit ba69aac
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 10 deletions.
28 changes: 18 additions & 10 deletions inst/rmd/rnasum.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -3198,24 +3198,28 @@ Table summarising the **mRNA expression** values in cancer and patient samples f

##### Percentiles

```{r cn_expr_data_table_gains_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
```{r cn_expr_data_table_gains_check, eval = runPurpleChunk}
##### Generate expression summary table for per-gene expression values CN values and mutation status info (colours)
##### Keep only genes within CN gains
cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]] |>
dplyr::filter(.data$CN >= cn_top) |>
dplyr::select("Gene", "CN")
genes_gains <- as.character(cn_genes[cn_genes %in% cn_data[["Gene"]]])
runPurpleGainsChunk <- TRUE
##### Deal with no genes
if (length(genes_gains) == 0) {
genes_gains <- NULL
genes_gains_no <- 0
runPurpleGainsChunk <- FALSE
} else if (length(genes_gains) > params$top_genes) {
genes_gains_no <- params$top_genes
} else {
genes_gains_no <- length(genes_gains)
}
```


```{r cn_expr_data_table_gains_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk & runPurpleGainsChunk}
##### Get expression data
dat1 <- ref_dataset.list[[dataset]][["data_to_report"]]
Expand Down Expand Up @@ -3292,7 +3296,7 @@ The [RED]{style="color:#ff0000"} colour range indicate relatively **high express

##### Z-scores

```{r cn_expr_data_table_gains, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk}
```{r cn_expr_data_table_gains, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk & runPurpleGainsChunk}
##### Generate expression summary table for per-gene expression values CN values and mutation status info (colours)
if (runPcgrChunk && runPurpleChunk) {
cn_expr_genes.expr.gains.z <- RNAsum::exprTable(
Expand Down Expand Up @@ -3373,24 +3377,28 @@ Table summarising the **mRNA expression** values in cancer and patient samples f

##### Percentiles

```{r cn_expr_data_table_losses_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
```{r cn_expr_data_table_losses_check, eval = runPurpleChunk}
##### Generate expression summary table for per-gene expression values CN values and mutation status info (colours)
##### Keep only genes within CN losses
cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]] |>
dplyr::filter(.data$CN <= cn_bottom) |>
dplyr::select("Gene", "CN")
genes_losses <- as.character(cn_genes[cn_genes %in% cn_data[["Gene"]]])
runPurpleLossesChunk <- TRUE
##### Deal with no genes or when more than 10 genes are of interest
if (length(genes_losses) == 0) {
genes_losses <- NULL
genes_losses_no <- 0
runPurpleLossesChunk <- FALSE
} else if (length(genes_losses) > params$top_genes) {
genes_losses_no <- params$top_genes
} else {
genes_losses_no <- length(genes_losses)
}
```

```{r cn_expr_data_table_losses_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk & runPurpleLossesChunk}
##### Deal with no genes or when more than 10 genes are of interest
if (genes_gains_no + genes_losses_no > params$top_genes) {
limit_genes <- TRUE
} else {
Expand Down Expand Up @@ -3473,7 +3481,7 @@ The [RED]{style="color:#ff0000"} colour range indicate relatively **high express

##### Z-scores

```{r cn_expr_data_table_losses, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
```{r cn_expr_data_table_losses, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk & runPurpleLossesChunk}
##### Generate expression summary table for per-gene expression values CN values and mutation status info (colours)
if (runPcgrChunk && runPurpleChunk) {
cn_expr_genes.expr.losses.z <- RNAsum::exprTable(
Expand Down Expand Up @@ -3555,7 +3563,7 @@ The [RED]{style="color:#ff0000"} colour range indicate relatively **high express

#### Gains {.tabset}

```{r cdf_plot_cn_expr_gains, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk, results="asis"}
```{r cdf_plot_cn_expr_gains, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk & runPurpleGainsChunk, results="asis"}
##### Generate empirical cumulative distribution function (ECDF) plot illustrating mRNA expression level for the genes of interest in the context of the overall mRNA expression distribution
output_cdf <- list()
output_counts <- list()
Expand Down Expand Up @@ -3642,7 +3650,7 @@ rm(list = ls(pattern = "^output*"))

#### Losses {.tabset}

```{r cdf_plot_cn_expr_losses, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk, results="asis"}
```{r cdf_plot_cn_expr_losses, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk & runPurpleLossesChunk, results="asis"}
##### Generate empirical cumulative distribution function (ECDF) plot illustrating mRNA expression level for the genes of interest in the context of the overall mRNA expression distribution
output_cdf <- list()
output_counts <- list()
Expand Down Expand Up @@ -4607,7 +4615,7 @@ cat(drugsTable_legend)

`r if ( params$drugs ) { c("#### Gains") }`

```{r drugs_predictive_cn_altered_genes_gains, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
```{r drugs_predictive_cn_altered_genes_gains, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk & runPurpleGainsChunk}
##### Generate table with drugs targeting CN altered genes
genes <- cn_expr_genes.expr.gains.perc[[2]]$SYMBOL
Expand Down Expand Up @@ -4643,7 +4651,7 @@ cat(drugsTable_legend)

`r if ( params$drugs ) { c("#### Losses") }`

```{r drugs_predictive_cn_altered_genes_losses, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
```{r drugs_predictive_cn_altered_genes_losses, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk & runPurpleLossesChunk}
##### Generate table with drugs targeting CN altered genes
genes <- cn_expr_genes.expr.losses.perc[[2]]$SYMBOL
Expand Down
133 changes: 133 additions & 0 deletions inst/scripts/icav2_download_and_run.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# helpers for downloading S3 data for local testing
require(dracarys, include.only = "s3_list_files_dir")
require(dplyr)
require(rportal, include.only = "orca_query_url")
require(glue, include.only = "glue")
require(here, include.only = "here")
require(tibble, include.only = "tibble")
require(tidyr, include.only = "pivot_longer")

# make sure you have logged into AWS
c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION") |>
rportal::envvar_defined() |>
stopifnot()
token_orca <- rportal::orca_jwt() |> rportal::jwt_validate()
portalRunId <- "20241201d2c4e79d" # failed run

# query orca workflow manager for payload with rnasum s3 inputs
get_rnasum_inputs <- function(prid, token_orca) {
wfrid <- rportal::orca_prid2wfrid(prid = portalRunId, token = token)
states <- rportal::orca_wfrid2state(wfrid, token) # view states for gived wfrid
pldid <- states |>
filter(.data$status == "FAILED") |>
pull("payload")
url <- glue("https://workflow.prod.umccr.org/api/v1/payload/{pldid}")
pld <- rportal::orca_query_url(url, token)
s3inputs <- pld[["data"]][["inputs"]] |>
purrr::map(\(x) x |> stringr::str_replace("/$", ""))
return(s3inputs)
}

rnasum_inputs <- get_rnasum_inputs(portalRunId, token)
# patterns of files to fish out
rnasum_file_regex <- tibble::tribble(
~regex, ~fun,
"quant\\.genes\\.sf$", "DragenWtsQuantGenesSfFile",
"quant\\.sf$", "DragenWtsQuantSfFile",
"fusion_candidates\\.final$", "DragenWtsFusionsFinalFile",
"fusions\\.pdf$", "ArribaFusionsPdfFile",
"fusions\\.tsv$", "ArribaFusionsTsvFile",
"somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "PcgrTiersTsvFile",
"purple\\.cnv\\.gene\\.tsv$", "PurpleCnvGeneTsvFile",
"manta\\.tsv$", "MantaTsvFile",
"mapping_metrics\\.csv$", "MapMetricsFile"
)
outdir <- here::here("nogit/patient_data")

# melt s3_indir columns to get a single column with paths to S3 directories
# of interest, and fish out files of interest from each of them, then download
meta_rnasum <- rnasum_inputs |>
tibble::as_tibble_row() |>
tidyr::pivot_longer(contains("Uri"), names_to = "folder_type", values_to = "s3_indir") |>
mutate(folder_type = sub("Uri$", "", .data$folder_type)) |>
select(subjectId, wtsTumorLibraryId, folder_type, s3_indir) |>
rowwise() |>
mutate(
down = list(dracarys::dr_s3_download(s3dir = s3_indir, outdir = outdir, max_objects = 1000, regexes = rnasum_file_regex, dryrun = F))
) |>
ungroup()

# saveRDS(meta_rnasum, here(glue("nogit/patient_data/down_{date1}.rds")))
# meta_rnasum <- here::here(glue("nogit/patient_data/down_{date1}.rds")) |> readr::read_rds()
rnasum_params_set <- function(arriba_pdf, arriba_tsv, dataset, dragen_fusions, dragen_mapping_metrics, sv_tsv,
pcgr_tiers_tsv, purple_gene_tsv, report_dir, salmon,
sample_name, subject_id) {
params <- list(
arriba_pdf = arriba_pdf,
arriba_tsv = arriba_tsv,
batch_rm = TRUE,
cn_gain = 95,
cn_loss = 5,
dataset = dataset,
dataset_name_incl = FALSE,
dragen_fusions = dragen_fusions,
dragen_mapping_metrics = dragen_mapping_metrics,
drugs = FALSE,
filter = TRUE,
immunogram = FALSE,
log = TRUE,
sv_tsv = sv_tsv,
norm = "TMM",
pcgr_splice_vars = TRUE,
pcgr_tier = 4,
pcgr_tiers_tsv = pcgr_tiers_tsv,
project = "-",
purple_gene_tsv = purple_gene_tsv,
report_dir = report_dir,
salmon = salmon,
sample_name = sample_name,
sample_source = "-",
save_tables = TRUE,
scaling = "gene-wise",
subject_id = subject_id,
top_genes = 5,
transform = "CPM",
umccrise = NULL
)
params
}

# now unmelt to have one row per run
d_runs <- meta_rnasum |>
tidyr::unnest(down) |>
dplyr::select(subjectId, libraryId = "wtsTumorLibraryId", type, localpath) |>
tidyr::pivot_wider(names_from = type, values_from = localpath)

# slice to whichever run you want from d
d_runs |>
# dplyr::slice(1) |>
dplyr::rowwise() |>
dplyr::mutate(
rnasum_dataset = "PANCAN",
params = list(
rnasum_params_set(
arriba_pdf = ArribaFusionsPdfFile,
arriba_tsv = ArribaFusionsTsvFile,
dataset = rnasum_dataset,
dragen_fusions = DragenWtsFusionsFinalFile,
dragen_mapping_metrics = MapMetricsFile,
sv_tsv = MantaTsvFile,
pcgr_tiers_tsv = PcgrTiersTsvFile,
purple_gene_tsv = PurpleCnvGeneTsvFile,
report_dir = here::here(glue::glue("nogit/patient_data/reports/{subjectId}_{libraryId}_{rnasum_dataset}")),
salmon = DragenWtsQuantGenesSfFile,
sample_name = glue::glue("{subjectId}_{libraryId}"),
subject_id = subjectId
)
),
rnasum_rmd = RNAsum::rnasum_rmd(
out_file = here::here(glue::glue("nogit/patient_data/reports_html/{subjectId}_{libraryId}_{rnasum_dataset}.html")),
quiet = FALSE, pars = params
)
) |>
dplyr::ungroup()

0 comments on commit ba69aac

Please sign in to comment.