alignqc: include topup/rerun info (fixes #128)

umccr · Sep 13, 2024 · 620d795 · 620d795
1 parent 4680506
commit 620d795
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 20 deletions.
diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R
@@ -28,22 +28,35 @@ query_limsrow_libids <- function(libids) {
 }
 
 # first read in the workflows table, extract metadata, then join with lims
-start_date <- "2024-08-03"
+start_date <- "2024-09-09"
 p_raw <- query_workflow_alignqc(start_date)
 
 wgs <- p_raw |>
   rportal::meta_wgs_alignment_qc(status = "Succeeded")
 wts <- p_raw |>
   rportal::meta_wts_alignment_qc(status = "Succeeded")
 p <- bind_rows(wgs, wts)
-lims <- query_limsrow_libids(p$LibraryID)
+lims_raw <- query_limsrow_libids(p$LibraryID)
+
+lims <- lims_raw |> # split the "_" suffix off library_id, then de-duplicate
+  tidyr::separate_wider_delim( # NOTE(review): an ID with more than one "_" errors unless too_many is set - confirm
+    library_id, # column to split; the first piece is written back as library_id
+    delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start"
+  ) |> # too_few = "align_start": IDs without "_" get NA in topup_or_rerun
+  select( # topup_or_rerun is not listed here, so it is dropped
+    subject_id, library_id, sample_id, sample_name,
+    external_subject_id, external_sample_id,
+    project_name, project_owner, phenotype, type,
+    source, assay, quality, workflow
+  ) |>
+  distinct() # keep one row per unique combination of the selected columns
 
 d <- p |>
   left_join(lims, by = c("SubjectID" = "subject_id", "LibraryID" = "library_id")) |>
   select(
     "SubjectID", "LibraryID", "SampleID", "lane", "phenotype", "type", "source",
-    "assay", "external_subject_id", "project_name", "project_owner",
-    "start", "end", "portal_run_id", "gds_outdir_dragen"
+    "assay", "workflow", "external_subject_id", "project_name", "project_owner",
+    "start", "end", "portal_run_id", "gds_outdir_dragen", "fq1", "fq2"
   ) |>
   mutate(rownum = row_number())
 

diff --git a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd b/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd
@@ -9,7 +9,7 @@ output:
     highlight: kate
 params:
   title: "UMCCR Alignment QC Summary Report"
-  meta: !r here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-08-03_wgts.rds")
+  meta: !r here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-09-09_wgts.rds")
 description: "UMCCR Alignment QC Summary Report"
 title: "`r params$title`"
 ---
@@ -33,14 +33,15 @@ knitr::opts_chunk$set(
 ```{r load_pkgs}
 {
   require(dplyr)
-  require(dracarys, include.only = "session_info_tbls")
+  require(dracarys, include.only = "session_info_kable")
   require(DT, include.only = "datatable")
   require(forcats, include.only = "fct_rev")
   require(glue, include.only = "glue")
   require(here, include.only = "here")
   require(plotly, include.only = "ggplotly")
   require(purrr, include.only = "map")
   require(readr, include.only = "read_rds")
+  require(stringr, include.only = "str_extract")
   require(tibble, include.only = "tibble")
   require(knitr, include.only = "kable")
   require(ggplot2)
@@ -50,7 +51,9 @@ knitr::opts_chunk$set(
 
 ```{r data_setup}
 ggplot2::theme_set(ggplot2::theme_bw())
-meta <- params[["meta"]] |> readr::read_rds()
+meta <- params[["meta"]] |> # params$meta is the path to the metadata .rds file
+  readr::read_rds() |> # NOTE(review): regex below captures only topup/rerun/rerun2; e.g. "topup2" would be cut to "topup" - confirm
+  mutate(topup_or_rerun = stringr::str_extract(fq1, "topup|rerun(2)?")) # NA when fq1 has no such tag
 stopifnot(all(dir.exists(meta$outdir)))
 options(scipen = 999) # disable scientific notation
 options(width = 150)
@@ -87,7 +90,8 @@ dat <- meta |>
   rowwise() |>
   mutate(
     fpaths = list(filepaths(indir = .data$outdir, sampleid = .data$SampleID)),
-    umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")
+    umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}"),
+    umccrid = if_else(is.na(.data$topup_or_rerun), .data$umccrid, glue("{umccrid}_{.data$topup_or_rerun}"))
   ) |>
   select("umccrid", "phenotype", "type", "source", "fpaths") |>
   tidyr::unnest(fpaths) |>
@@ -177,7 +181,11 @@ meta |>
     SubjectID = get_sbj_url(.data$SubjectID),
     durationMin = round(end - start)
   ) |>
-  select(SubjectID, type, LibraryID, lane, durationMin, everything(), -c("indir", "outdir", "cmd")) |>
+  select(
+    SubjectID, type, LibraryID, lane, durationMin, topup_or_rerun,
+    everything(),
+    -c("rownum", "indir", "outdir", "cmd", "fq1", "fq2")
+  ) |>
   dt_view(escape = FALSE) |>
   DT::formatStyle(
     "type",
@@ -193,7 +201,7 @@ meta |>
 
 ```{r mm, eval=eval$MappingMetricsFile}
 d_map <- dr_unnest("MappingMetricsFile") |>
-  arrange(type, desc(umccrid)) |>
+  arrange(desc(umccrid), type) |>
   select(
     umccrid, phenotype, type,
     source,
@@ -273,7 +281,7 @@ d_pl_metrics <- d_pl |>
   )
 # cov_genome_pct_* metrics are in the Hist data, so filter out here
 d_cvg <- dr_unnest("WgsCoverageMetricsFile") |>
-  arrange(type, desc(umccrid)) |>
+  arrange(desc(umccrid)) |>
   left_join(d_pl_metrics, by = c("umccrid", "phenotype", "type", "source")) |>
   select(
     umccrid, phenotype, type, source,
@@ -316,7 +324,7 @@ d_cvg |>
 
 ```{r trim, eval=eval$TrimmerMetricsFile}
 d_tr <- dr_unnest("TrimmerMetricsFile") |>
-  arrange(type, desc(umccrid)) |>
+  arrange(desc(umccrid)) |>
   select(
     umccrid, phenotype, type, source,
     reads_tot = reads_tot_input_dragen,
@@ -449,9 +457,11 @@ plotly::ggplotly(f1_plot)
 
 ### Positional Base Content ('Per-Position Sequence Content')
 
+Skip
+
 - TODO: create heatmap instead
 
-```{r fqc_pbc, fig.height=42}
+```{r fqc_pbc, eval=FALSE, fig.height=42}
 f1 <- dr_unnest("FastqcMetricsFile_positional_base_content")
 f1 |>
   filter(base != "N") |>
@@ -472,7 +482,9 @@ f1 |>
 
 ### Positional Base Mean Quality ('Per-Position Mean Quality Scores')
 
-```{r fqc_bmq, fig.height=80}
+Skip
+
+```{r fqc_bmq, eval=FALSE, fig.height=80}
 f1 <- dr_unnest("FastqcMetricsFile_positional_base_mean_quality")
 ggplot() +
   geom_rect(
@@ -496,7 +508,9 @@ ggplot() +
 
 ### Positional Quality ('Per-Position Quality Score Ranges')
 
-```{r fqc_pq, eval=T, fig.width=13}
+Skip
+
+```{r fqc_pq, eval=FALSE, fig.width=13}
 # TODO: use boxplot instead of point
 f1 <- dr_unnest("FastqcMetricsFile_positional_quality")
 quants <- c(25, 50, 75)
@@ -539,7 +553,9 @@ plotly::ggplotly(read_len_plot)
 
 ### Sequence Positions ('Adapter Content')
 
-```{r seq_pos, fig.height=42}
+Skip
+
+```{r seq_pos, eval=FALSE, fig.height=42}
 f1 <- dr_unnest("FastqcMetricsFile_sequence_positions")
 f1 |>
   ggplot(aes(x = bp, y = value, colour = seq)) +
@@ -555,7 +571,9 @@ f1 |>
 
 ## Coverage {.tabset .tabset-pills}
 
-```{r contig_cvg, eval=eval$WgsContigMeanCovFile, results='asis', fig.height=5}
+Skip
+
+```{r contig_cvg, eval=FALSE, results='asis', fig.height=5}
 d1 <- dr_unnest("WgsContigMeanCovFile") |>
   arrange(desc(umccrid)) # bare column name: a quoted "umccrid" would sort by a constant string (no-op)
 for (type1 in sort(unique(d1$type), decreasing = FALSE)) {
@@ -606,7 +624,9 @@ plotly::ggplotly(flp)
 
 - Only for WGS.
 
-```{r pe, eval=eval$PloidyEstimationMetricsFile, fig.height=5}
+Skip
+
+```{r pe, eval=FALSE, fig.height=5}
 chrom_levels <- c(1:22, "x", "y")
 d_pl_plot_data <- d_pl |>
   select(
@@ -631,7 +651,9 @@ plotly::ggplotly(d_pl_plot)
 
 ## Hist
 
-```{r cvgm, eval=eval$WgsCoverageMetricsFile, fig.height=8, fig.width=12}
+Skip
+
+```{r cvgm, eval=FALSE, fig.height=8, fig.width=12}
 d_hist <- dr_unnest("WgsHistFile")
 d_hist1 <- d_hist |>
   ggplot(aes(x = start, y = pct, colour = umccrid)) +
@@ -660,7 +682,9 @@ plotly::subplot(d_hist1, d_hist2, shareY = TRUE, titleY = TRUE, titleX = TRUE, n
 
 ## FineHist
 
-```{r finehist, eval=eval$WgsFineHistFile, fig.height=10, fig.width=12}
+Skip
+
+```{r finehist, eval=FALSE, fig.height=10, fig.width=12}
 d_fhist <- dr_unnest("WgsFineHistFile")
 d_fhist |>
   dracarys::WgsFineHistFile$public_methods$plot(c(0, 150)) +