From 9c32347f478220f2ae220c23f03e43f2197a5eb9 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 21:30:15 +1100 Subject: [PATCH] dragen: add dirty name checker --- R/dragen.R | 89 ++++++++++++++++++++++------- R/dragen_fastqc.R | 18 +++--- R/tso_dragen.R | 2 +- R/utils.R | 15 +++++ man/dragen_cnv_metrics_read.Rd | 6 ++ man/dragen_coverage_metrics_read.Rd | 5 ++ man/dragen_gc_metrics_read.Rd | 6 ++ man/dragen_mapping_metrics_read.Rd | 5 ++ man/dragen_sv_metrics_read.Rd | 5 ++ man/dragen_trimmer_metrics_read.Rd | 5 ++ man/dragen_umi_metrics_read.Rd | 5 ++ man/dragen_vc_metrics_read.Rd | 5 ++ 12 files changed, 135 insertions(+), 31 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index 90cfd8b..2e6cbbd 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -34,6 +34,10 @@ dragen_fraglenhist_plot <- function(d, min_count = 10) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_umi_metrics_read(x) +#' } #' @export dragen_umi_metrics_read <- function(x) { d0 <- readr::read_lines(x) @@ -85,6 +89,7 @@ dragen_umi_metrics_read <- function(x) { var = tolower(.data$var), var = dplyr::recode(.data$var, !!!abbrev_nm) ) + dirty_names_cleaned(d1$var, abbrev_nm, x) hist <- d1 |> dplyr::filter(grepl("histo", .data$var)) |> dplyr::select(name = "var", "count") |> @@ -121,6 +126,11 @@ dragen_umi_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_gc_metrics_read(x) +#' } +#' #' @export dragen_gc_metrics_read <- function(x) { d0 <- readr::read_lines(x) @@ -155,6 +165,7 @@ dragen_gc_metrics_read <- function(x) { name = dplyr::recode(.data$name, !!!abbrev_nm) ) |> tidyr::pivot_wider(names_from = "name", values_from = "value") + dirty_names_cleaned(colnames(summary), abbrev_nm, x) # GC BIAS DETAILS details <- d1 |> @@ -190,6 +201,11 @@ dragen_gc_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_cnv_metrics_read(x) +#' } +#' #' @export dragen_cnv_metrics_read <- function(x) { d0 <- readr::read_lines(x) @@ -230,7 +246,9 @@ dragen_cnv_metrics_read <- function(x) { count = as.numeric(.data$count), pct = round(as.numeric(.data$pct), 2), var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> + ) + dirty_names_cleaned(d2$var, abbrev_nm, x) + d2 <- d2 |> dplyr::select("var", "count", "pct") |> tidyr::pivot_longer(c("count", "pct")) |> dplyr::filter(!is.na(.data$value)) |> @@ -251,6 +269,10 @@ dragen_cnv_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_sv_metrics_read(x) +#' } #' @export dragen_sv_metrics_read <- function(x) { d <- readr::read_lines(x) @@ -261,7 +283,7 @@ dragen_sv_metrics_read <- function(x) { "Number of duplications (PASS)" = "dup", "Number of breakend pairs (PASS)" = "bnd" ) - d |> + res <- d |> tibble::as_tibble_col(column_name = "value") |> dplyr::filter(!grepl("Total number of structural variants", .data$value)) |> tidyr::separate_wider_delim( @@ -275,6 +297,8 @@ dragen_sv_metrics_read <- function(x) { ) |> dplyr::select("var", "count") |> tidyr::pivot_wider(names_from = "var", values_from = "count") + dirty_names_cleaned(colnames(res), abbrev_nm, x) + res } #' Read DRAGEN Trimmer Metrics @@ -284,10 +308,14 @@ dragen_sv_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_trimmer_metrics_read(x) +#' } #' @export dragen_trimmer_metrics_read <- function(x) { - d <- readr::read_lines(x) - assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("TRIMMER STATISTICS", d0[1])) abbrev_nm <- c( "Total input reads" = "reads_tot_input", "Total input bases" = "bases_tot", @@ -313,7 +341,7 @@ dragen_trimmer_metrics_read <- function(x) { "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2" ) - d |> + d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim("value", names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start") |> dplyr::mutate( @@ -321,7 +349,9 @@ dragen_trimmer_metrics_read <- function(x) { pct = round(as.numeric(.data$pct), 2), var = dplyr::recode(.data$var, !!!abbrev_nm) ) |> - dplyr::select("var", "count", "pct") |> + dplyr::select("var", "count", "pct") + dirty_names_cleaned(d1$var, abbrev_nm, x) + d1 |> tidyr::pivot_longer(c("count", "pct")) |> dplyr::filter(!is.na(.data$value)) |> dplyr::mutate( @@ -339,6 +369,10 @@ dragen_trimmer_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_vc_metrics_read(x) +#' } #' @export dragen_vc_metrics_read <- function(x) { abbrev_nm1 <- tibble::tribble( @@ -367,12 +401,14 @@ dragen_vc_metrics_read <- function(x) { "Percent Autosome Callability", "callability_auto_pct", FALSE, "Number of samples", "sample_num", FALSE, "Reads Processed", "reads_processed", FALSE, - "Child Sample", "sample_child", FALSE + "Child Sample", "sample_child", FALSE, + "Percent QC Region Callability in Region 1", "qc_region_callability_pct_region1", FALSE, + "Percent QC Region Callability in Region 2", "qc_region_callability_pct_region2", FALSE ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("VARIANT CALLER", raw[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("VARIANT CALLER", d0[1])) # first detect if this is genome or target region - res <- raw |> + d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", @@ -381,7 +417,7 @@ dragen_vc_metrics_read <- function(x) { ) reg1 <- NULL str1 <- NULL - tmp <- res |> + tmp <- d1 |> dplyr::filter(grepl("Chr X number of SNPs over ", .data$var)) |> dplyr::slice_head(n = 1) |> dplyr::pull("var") @@ -406,7 +442,7 @@ dragen_vc_metrics_read <- function(x) { dplyr::select("raw", "clean") |> tibble::deframe() - d <- res |> + d <- d1 |> dplyr::mutate( var = dplyr::recode(.data$var, !!!abbrev_nm), count = dplyr::na_if(.data$count, "NA"), @@ -421,6 +457,7 @@ dragen_vc_metrics_read <- function(x) { ) |> dplyr::filter(.data$category != "summary") |> dplyr::select("category", "sample", "var", "count", "pct") + dirty_names_cleaned(d$var, abbrev_nm, x) # pivot d |> tidyr::pivot_longer(c("count", "pct")) |> @@ -440,6 +477,10 @@ dragen_vc_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_mapping_metrics_read(x) +#' } #' @export dragen_mapping_metrics_read <- function(x) { abbrev_nm <- c( @@ -516,11 +557,11 @@ dragen_mapping_metrics_read <- function(x) { "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", "Filtered rRNA reads" = "reads_rrna_filtered" ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("MAPPING/ALIGNING", raw[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("MAPPING/ALIGNING", d0[1])) # split by RG and non-RG # tidy - d <- raw |> + d <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", @@ -535,6 +576,7 @@ dragen_mapping_metrics_read <- function(x) { RG = dplyr::if_else(.data$RG == "", "Total", .data$RG) ) |> dplyr::select("RG", "var", "count", "pct") + dirty_names_cleaned(unique(d$var), abbrev_nm, x) # pivot d |> tidyr::pivot_longer(c("count", "pct")) |> @@ -553,6 +595,10 @@ dragen_mapping_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_coverage_metrics_read(x) +#' } #' @export dragen_coverage_metrics_read <- function(x) { # all rows except 'Aligned bases' and 'Aligned reads' refer to the region @@ -572,10 +618,10 @@ dragen_coverage_metrics_read <- function(x) { "Mean/Median autosomal coverage ratio over ", "cov_mean_median_auto_ratio_over_", TRUE, "Aligned reads in ", "reads_aligned_in_", TRUE ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("COVERAGE SUMMARY", d0[1])) # first detect if this is genome, QC coverage region, or target region - res <- raw |> + d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", @@ -584,7 +630,7 @@ dragen_coverage_metrics_read <- function(x) { ) reg1 <- NULL str1 <- NULL - tmp <- res |> + tmp <- d1 |> dplyr::filter(grepl("PCT of .* with coverage ", .data$var)) |> dplyr::slice_head(n = 1) |> dplyr::pull("var") @@ -611,12 +657,13 @@ dragen_coverage_metrics_read <- function(x) { # split to rename the # "PCT of genome with coverage [100x: inf)" values pat <- glue("PCT of {str1} with coverage ") - res1 <- res |> + res1 <- d1 |> # pct just shows % for a couple rows which can be # calculated from their above values dplyr::filter(!grepl(pat, .data$var)) |> dplyr::select("var", "value") - res2 <- res |> + dirty_names_cleaned(res1$var, names(abbrev_nm), x) + res2 <- d1 |> dplyr::filter(grepl(pat, .data$var)) |> dplyr::mutate( var = sub(pat, "", .data$var), diff --git a/R/dragen_fastqc.R b/R/dragen_fastqc.R index 0ac31bc..ceb9ce3 100644 --- a/R/dragen_fastqc.R +++ b/R/dragen_fastqc.R @@ -120,14 +120,14 @@ dragen_fastqc_metrics_read <- function(x) { dplyr::select("mate", "seq", "bp", "value") list( - positional_base_content = pos_base_cont, - positional_base_mean_quality = pos_base_mean_qual, - positional_quality = pos_qual, - read_gc_content = gc_cont, - read_gc_content_quality = gc_cont_qual, - read_lengths = read_len, - read_mean_quality = read_mean_qual, - sequence_positions = seq_pos + fqc_positionalBaseContent = pos_base_cont, + fqc_positionalBaseMeanQuality = pos_base_mean_qual, + fqc_positionalQuality = pos_qual, + fqc_readGCContent = gc_cont, + fqc_readGCContentQuality = gc_cont_qual, + fqc_readLengths = read_len, + fqc_readMeanQuality = read_mean_qual, + fqc_sequencePositions = seq_pos ) |> - tibble::enframe(name = "fastqc_name", value = "fastqc_value") + tibble::enframe(name = "name", value = "data") } diff --git a/R/tso_dragen.R b/R/tso_dragen.R index d5175a3..dc35091 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -241,7 +241,7 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. read_fastqcMetrics = function(x) { dat <- dragen_fastqc_metrics_read(x) - tibble::tibble(name = "fastqcmetrics", data = list(dat[])) + dat }, #' @description Read `gc_metrics.csv` file. #' @param x Path to file. diff --git a/R/utils.R b/R/utils.R index 8691826..c2bc44c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,18 @@ +dirty_names_cleaned <- function(dirty, clean, fname = NULL) { + assertthat::assert_that( + rlang::is_atomic(dirty), + rlang::is_atomic(clean) + ) + are_dirty <- dirty[!dirty %in% clean] + msg <- paste( + "Following columns are dirty:", + paste(are_dirty, collapse = ", "), + ifelse(!is.null(fname), paste("Filename:", fname), ""), + sep = "\n" + ) + assertthat::assert_that(length(are_dirty) == 0, msg = msg) +} + #' Print current timestamp for logging #' #' @return Current timestamp as character. diff --git a/man/dragen_cnv_metrics_read.Rd b/man/dragen_cnv_metrics_read.Rd index c6e7281..c347067 100644 --- a/man/dragen_cnv_metrics_read.Rd +++ b/man/dragen_cnv_metrics_read.Rd @@ -15,3 +15,9 @@ Tibble with metrics. \description{ Reads the \code{cnv_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_cnv_metrics_read(x) +} + +} diff --git a/man/dragen_coverage_metrics_read.Rd b/man/dragen_coverage_metrics_read.Rd index 96fd3ef..dc88de2 100644 --- a/man/dragen_coverage_metrics_read.Rd +++ b/man/dragen_coverage_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{coverage_metrics.csv} file generated by DRAGEN. } +\examples{ +\dontrun{ +dragen_coverage_metrics_read(x) +} +} diff --git a/man/dragen_gc_metrics_read.Rd b/man/dragen_gc_metrics_read.Rd index fc53959..f68f4a0 100644 --- a/man/dragen_gc_metrics_read.Rd +++ b/man/dragen_gc_metrics_read.Rd @@ -15,3 +15,9 @@ Tibble with metrics. \description{ Reads the \code{gc_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_gc_metrics_read(x) +} + +} diff --git a/man/dragen_mapping_metrics_read.Rd b/man/dragen_mapping_metrics_read.Rd index 223f0a7..a2ff466 100644 --- a/man/dragen_mapping_metrics_read.Rd +++ b/man/dragen_mapping_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{mapping_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_mapping_metrics_read(x) +} +} diff --git a/man/dragen_sv_metrics_read.Rd b/man/dragen_sv_metrics_read.Rd index 50577e0..0150f5d 100644 --- a/man/dragen_sv_metrics_read.Rd +++ b/man/dragen_sv_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{sv_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_sv_metrics_read(x) +} +} diff --git a/man/dragen_trimmer_metrics_read.Rd b/man/dragen_trimmer_metrics_read.Rd index 806a86f..8a58398 100644 --- a/man/dragen_trimmer_metrics_read.Rd +++ b/man/dragen_trimmer_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{trimmer_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_trimmer_metrics_read(x) +} +} diff --git a/man/dragen_umi_metrics_read.Rd b/man/dragen_umi_metrics_read.Rd index ca80a82..16f57af 100644 --- a/man/dragen_umi_metrics_read.Rd +++ b/man/dragen_umi_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{umi_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_umi_metrics_read(x) +} +} diff --git a/man/dragen_vc_metrics_read.Rd b/man/dragen_vc_metrics_read.Rd index 9bd4dae..e0e904e 100644 --- a/man/dragen_vc_metrics_read.Rd +++ b/man/dragen_vc_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{vc_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_vc_metrics_read(x) +} +}