diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R index 9a5f75f..836e7f2 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R @@ -28,7 +28,7 @@ query_limsrow_libids <- function(libids) { } # first read in the workflows table, extract metadata, then join with lims -start_date <- "2024-08-03" +start_date <- "2024-09-09" p_raw <- query_workflow_alignqc(start_date) wgs <- p_raw |> @@ -36,14 +36,27 @@ wgs <- p_raw |> wts <- p_raw |> rportal::meta_wts_alignment_qc(status = "Succeeded") p <- bind_rows(wgs, wts) -lims <- query_limsrow_libids(p$LibraryID) +lims_raw <- query_limsrow_libids(p$LibraryID) + +lims <- lims_raw |> + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" + ) |> + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, external_sample_id, + project_name, project_owner, phenotype, type, + source, assay, quality, workflow + ) |> + distinct() d <- p |> left_join(lims, by = c("SubjectID" = "subject_id", "LibraryID" = "library_id")) |> select( "SubjectID", "LibraryID", "SampleID", "lane", "phenotype", "type", "source", - "assay", "external_subject_id", "project_name", "project_owner", - "start", "end", "portal_run_id", "gds_outdir_dragen" + "assay", "workflow", "external_subject_id", "project_name", "project_owner", + "start", "end", "portal_run_id", "gds_outdir_dragen", "fq1", "fq2" ) |> mutate(rownum = row_number()) diff --git a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd b/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd index acadff5..f38aa67 100644 --- a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd +++ b/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd @@ -9,7 +9,7 @@ output: highlight: kate params: title: "UMCCR Alignment QC Summary Report" - meta: !r here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-08-03_wgts.rds") + meta: !r here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-09-09_wgts.rds") description: "UMCCR Alignment QC Summary Report" title: "`r params$title`" --- @@ -33,7 +33,7 @@ knitr::opts_chunk$set( ```{r load_pkgs} { require(dplyr) - require(dracarys, include.only = "session_info_tbls") + require(dracarys, include.only = "session_info_kable") require(DT, include.only = "datatable") require(forcats, include.only = "fct_rev") require(glue, include.only = "glue") @@ -41,6 +41,7 @@ knitr::opts_chunk$set( require(plotly, include.only = "ggplotly") require(purrr, include.only = "map") require(readr, include.only = "read_rds") + require(stringr, include.only = "str_extract") require(tibble, include.only = "tibble") require(knitr, include.only = "kable") require(ggplot2) @@ -50,7 +51,9 @@ knitr::opts_chunk$set( ```{r data_setup} ggplot2::theme_set(ggplot2::theme_bw()) -meta <- params[["meta"]] |> readr::read_rds() +meta <- params[["meta"]] |> + readr::read_rds() |> + mutate(topup_or_rerun = stringr::str_extract(fq1, "topup|rerun(2)?")) stopifnot(all(dir.exists(meta$outdir))) options(scipen = 999) # disable scientific notation options(width = 150) @@ -87,7 +90,8 @@ dat <- meta |> rowwise() |> mutate( fpaths = list(filepaths(indir = .data$outdir, sampleid = .data$SampleID)), - umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}") + umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}"), + umccrid = if_else(is.na(.data$topup_or_rerun), .data$umccrid, glue("{umccrid}_{.data$topup_or_rerun}")) ) |> select("umccrid", "phenotype", "type", "source", "fpaths") |> tidyr::unnest(fpaths) |> @@ -177,7 +181,11 @@ meta |> SubjectID = get_sbj_url(.data$SubjectID), durationMin = round(end - start) ) |> - select(SubjectID, type, LibraryID, lane, durationMin, everything(), -c("indir", "outdir", "cmd")) |> + select( + SubjectID, type, LibraryID, lane, durationMin, topup_or_rerun, + everything(), + -c("rownum", "indir", "outdir", "cmd", "fq1", "fq2") + ) |> dt_view(escape = FALSE) |> DT::formatStyle( "type", @@ -193,7 +201,7 @@ meta |> ```{r mm, eval=eval$MappingMetricsFile} d_map <- dr_unnest("MappingMetricsFile") |> - arrange(type, desc(umccrid)) |> + arrange(desc(umccrid), type) |> select( umccrid, phenotype, type, source, @@ -273,7 +281,7 @@ d_pl_metrics <- d_pl |> ) # cov_genome_pct_* metrics are in the Hist data, so filter out here d_cvg <- dr_unnest("WgsCoverageMetricsFile") |> - arrange(type, desc(umccrid)) |> + arrange(desc(umccrid)) |> left_join(d_pl_metrics, by = c("umccrid", "phenotype", "type", "source")) |> select( umccrid, phenotype, type, source, @@ -316,7 +324,7 @@ d_cvg |> ```{r trim, eval=eval$TrimmerMetricsFile} d_tr <- dr_unnest("TrimmerMetricsFile") |> - arrange(type, desc(umccrid)) |> + arrange(desc(umccrid)) |> select( umccrid, phenotype, type, source, reads_tot = reads_tot_input_dragen, @@ -449,9 +457,11 @@ plotly::ggplotly(f1_plot) ### Positional Base Content ('Per-Position Sequence Content') +Skip + - TODO: create heatmap instead -```{r fqc_pbc, fig.height=42} +```{r fqc_pbc, eval=F, fig.height=42} f1 <- dr_unnest("FastqcMetricsFile_positional_base_content") f1 |> filter(base != "N") |> @@ -472,7 +482,9 @@ f1 |> ### Positional Base Mean Quality ('Per-Position Mean Quality Scores') -```{r fqc_bmq, fig.height=80} +Skip + +```{r fqc_bmq, eval=F, fig.height=80} f1 <- dr_unnest("FastqcMetricsFile_positional_base_mean_quality") ggplot() + geom_rect( @@ -496,7 +508,9 @@ ggplot() + ### Positional Quality ('Per-Position Quality Score Ranges') -```{r fqc_pq, eval=T, fig.width=13} +Skip + +```{r fqc_pq, eval=FALSE, fig.width=13} # TODO: use boxplot instead of point f1 <- dr_unnest("FastqcMetricsFile_positional_quality") quants <- c(25, 50, 75) @@ -539,7 +553,9 @@ plotly::ggplotly(read_len_plot) ### Sequence Positions ('Adapter Content') -```{r seq_pos, fig.height=42} +Skip + +```{r seq_pos, eval=F, fig.height=42} f1 <- dr_unnest("FastqcMetricsFile_sequence_positions") f1 |> ggplot(aes(x = bp, y = value, colour = seq)) + @@ -555,7 +571,9 @@ f1 |> ## Coverage {.tabset .tabset-pills} -```{r contig_cvg, eval=eval$WgsContigMeanCovFile, results='asis', fig.height=5} +Skip + +```{r contig_cvg, eval=FALSE, results='asis', fig.height=5} d1 <- dr_unnest("WgsContigMeanCovFile") |> arrange(desc("umccrid")) for (type1 in sort(unique(d1$type), decreasing = FALSE)) { @@ -606,7 +624,9 @@ plotly::ggplotly(flp) - Only for WGS. -```{r pe, eval=eval$PloidyEstimationMetricsFile, fig.height=5} +Skip + +```{r pe, eval=F, fig.height=5} chrom_levels <- c(1:22, "x", "y") d_pl_plot_data <- d_pl |> select( @@ -631,7 +651,9 @@ plotly::ggplotly(d_pl_plot) ## Hist -```{r cvgm, eval=eval$WgsCoverageMetricsFile, fig.height=8, fig.width=12} +Skip + +```{r cvgm, eval=F, fig.height=8, fig.width=12} d_hist <- dr_unnest("WgsHistFile") d_hist1 <- d_hist |> ggplot(aes(x = start, y = pct, colour = umccrid)) + @@ -660,7 +682,9 @@ plotly::subplot(d_hist1, d_hist2, shareY = TRUE, titleY = TRUE, titleX = TRUE, n ## FineHist -```{r finehist, eval=eval$WgsFineHistFile, fig.height=10, fig.width=12} +Skip + +```{r finehist, eval=FALSE, fig.height=10, fig.width=12} d_fhist <- dr_unnest("WgsFineHistFile") d_fhist |> dracarys::WgsFineHistFile$public_methods$plot(c(0, 150)) +