Skip to content

Commit

Permalink
dragen: add dirty name checker
Browse files Browse the repository at this point in the history
  • Loading branch information
pdiakumis committed Oct 18, 2024
1 parent 91d3016 commit 9c32347
Show file tree
Hide file tree
Showing 12 changed files with 135 additions and 31 deletions.
89 changes: 68 additions & 21 deletions R/dragen.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ dragen_fraglenhist_plot <- function(d, min_count = 10) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_umi_metrics_read(x)
#' }
#' @export
dragen_umi_metrics_read <- function(x) {
d0 <- readr::read_lines(x)
Expand Down Expand Up @@ -85,6 +89,7 @@ dragen_umi_metrics_read <- function(x) {
var = tolower(.data$var),
var = dplyr::recode(.data$var, !!!abbrev_nm)
)
dirty_names_cleaned(d1$var, abbrev_nm, x)
hist <- d1 |>
dplyr::filter(grepl("histo", .data$var)) |>
dplyr::select(name = "var", "count") |>
Expand Down Expand Up @@ -121,6 +126,11 @@ dragen_umi_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_gc_metrics_read(x)
#' }
#'
#' @export
dragen_gc_metrics_read <- function(x) {
d0 <- readr::read_lines(x)
Expand Down Expand Up @@ -155,6 +165,7 @@ dragen_gc_metrics_read <- function(x) {
name = dplyr::recode(.data$name, !!!abbrev_nm)
) |>
tidyr::pivot_wider(names_from = "name", values_from = "value")
dirty_names_cleaned(colnames(summary), abbrev_nm, x)

# GC BIAS DETAILS
details <- d1 |>
Expand Down Expand Up @@ -190,6 +201,11 @@ dragen_gc_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_cnv_metrics_read(x)
#' }
#'
#' @export
dragen_cnv_metrics_read <- function(x) {
d0 <- readr::read_lines(x)
Expand Down Expand Up @@ -230,7 +246,9 @@ dragen_cnv_metrics_read <- function(x) {
count = as.numeric(.data$count),
pct = round(as.numeric(.data$pct), 2),
var = dplyr::recode(.data$var, !!!abbrev_nm)
) |>
)
dirty_names_cleaned(d2$var, abbrev_nm, x)
d2 <- d2 |>
dplyr::select("var", "count", "pct") |>
tidyr::pivot_longer(c("count", "pct")) |>
dplyr::filter(!is.na(.data$value)) |>
Expand All @@ -251,6 +269,10 @@ dragen_cnv_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_sv_metrics_read(x)
#' }
#' @export
dragen_sv_metrics_read <- function(x) {
d <- readr::read_lines(x)
Expand All @@ -261,7 +283,7 @@ dragen_sv_metrics_read <- function(x) {
"Number of duplications (PASS)" = "dup",
"Number of breakend pairs (PASS)" = "bnd"
)
d |>
res <- d |>
tibble::as_tibble_col(column_name = "value") |>
dplyr::filter(!grepl("Total number of structural variants", .data$value)) |>
tidyr::separate_wider_delim(
Expand All @@ -275,6 +297,8 @@ dragen_sv_metrics_read <- function(x) {
) |>
dplyr::select("var", "count") |>
tidyr::pivot_wider(names_from = "var", values_from = "count")
dirty_names_cleaned(colnames(res), abbrev_nm, x)
res
}

#' Read DRAGEN Trimmer Metrics
Expand All @@ -284,10 +308,14 @@ dragen_sv_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_trimmer_metrics_read(x)
#' }
#' @export
dragen_trimmer_metrics_read <- function(x) {
d <- readr::read_lines(x)
assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1]))
d0 <- readr::read_lines(x)
assertthat::assert_that(grepl("TRIMMER STATISTICS", d0[1]))
abbrev_nm <- c(
"Total input reads" = "reads_tot_input",
"Total input bases" = "bases_tot",
Expand All @@ -313,15 +341,17 @@ dragen_trimmer_metrics_read <- function(x) {
"Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2"
)

d |>
d1 <- d0 |>
tibble::as_tibble_col(column_name = "value") |>
tidyr::separate_wider_delim("value", names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start") |>
dplyr::mutate(
count = as.numeric(.data$count),
pct = round(as.numeric(.data$pct), 2),
var = dplyr::recode(.data$var, !!!abbrev_nm)
) |>
dplyr::select("var", "count", "pct") |>
dplyr::select("var", "count", "pct")
dirty_names_cleaned(d1$var, abbrev_nm, x)
d1 |>
tidyr::pivot_longer(c("count", "pct")) |>
dplyr::filter(!is.na(.data$value)) |>
dplyr::mutate(
Expand All @@ -339,6 +369,10 @@ dragen_trimmer_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_vc_metrics_read(x)
#' }
#' @export
dragen_vc_metrics_read <- function(x) {
abbrev_nm1 <- tibble::tribble(
Expand Down Expand Up @@ -367,12 +401,14 @@ dragen_vc_metrics_read <- function(x) {
"Percent Autosome Callability", "callability_auto_pct", FALSE,
"Number of samples", "sample_num", FALSE,
"Reads Processed", "reads_processed", FALSE,
"Child Sample", "sample_child", FALSE
"Child Sample", "sample_child", FALSE,
"Percent QC Region Callability in Region 1", "qc_region_callability_pct_region1", FALSE,
"Percent QC Region Callability in Region 2", "qc_region_callability_pct_region2", FALSE
)
raw <- readr::read_lines(x)
assertthat::assert_that(grepl("VARIANT CALLER", raw[1]))
d0 <- readr::read_lines(x)
assertthat::assert_that(grepl("VARIANT CALLER", d0[1]))
# first detect if this is genome or target region
res <- raw |>
d1 <- d0 |>
tibble::as_tibble_col(column_name = "value") |>
tidyr::separate_wider_delim(
"value",
Expand All @@ -381,7 +417,7 @@ dragen_vc_metrics_read <- function(x) {
)
reg1 <- NULL
str1 <- NULL
tmp <- res |>
tmp <- d1 |>
dplyr::filter(grepl("Chr X number of SNPs over ", .data$var)) |>
dplyr::slice_head(n = 1) |>
dplyr::pull("var")
Expand All @@ -406,7 +442,7 @@ dragen_vc_metrics_read <- function(x) {
dplyr::select("raw", "clean") |>
tibble::deframe()

d <- res |>
d <- d1 |>
dplyr::mutate(
var = dplyr::recode(.data$var, !!!abbrev_nm),
count = dplyr::na_if(.data$count, "NA"),
Expand All @@ -421,6 +457,7 @@ dragen_vc_metrics_read <- function(x) {
) |>
dplyr::filter(.data$category != "summary") |>
dplyr::select("category", "sample", "var", "count", "pct")
dirty_names_cleaned(d$var, abbrev_nm, x)
# pivot
d |>
tidyr::pivot_longer(c("count", "pct")) |>
Expand All @@ -440,6 +477,10 @@ dragen_vc_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_mapping_metrics_read(x)
#' }
#' @export
dragen_mapping_metrics_read <- function(x) {
abbrev_nm <- c(
Expand Down Expand Up @@ -516,11 +557,11 @@ dragen_mapping_metrics_read <- function(x) {
"Average sequenced coverage over genome" = "cov_avg_seq_over_genome",
"Filtered rRNA reads" = "reads_rrna_filtered"
)
raw <- readr::read_lines(x)
assertthat::assert_that(grepl("MAPPING/ALIGNING", raw[1]))
d0 <- readr::read_lines(x)
assertthat::assert_that(grepl("MAPPING/ALIGNING", d0[1]))
# split by RG and non-RG
# tidy
d <- raw |>
d <- d0 |>
tibble::as_tibble_col(column_name = "value") |>
tidyr::separate_wider_delim(
"value",
Expand All @@ -535,6 +576,7 @@ dragen_mapping_metrics_read <- function(x) {
RG = dplyr::if_else(.data$RG == "", "Total", .data$RG)
) |>
dplyr::select("RG", "var", "count", "pct")
dirty_names_cleaned(unique(d$var), abbrev_nm, x)
# pivot
d |>
tidyr::pivot_longer(c("count", "pct")) |>
Expand All @@ -553,6 +595,10 @@ dragen_mapping_metrics_read <- function(x) {
#' @param x Path to file.
#'
#' @return Tibble with metrics.
#' @examples
#' \dontrun{
#' dragen_coverage_metrics_read(x)
#' }
#' @export
dragen_coverage_metrics_read <- function(x) {
# all rows except 'Aligned bases' and 'Aligned reads' refer to the region
Expand All @@ -572,10 +618,10 @@ dragen_coverage_metrics_read <- function(x) {
"Mean/Median autosomal coverage ratio over ", "cov_mean_median_auto_ratio_over_", TRUE,
"Aligned reads in ", "reads_aligned_in_", TRUE
)
raw <- readr::read_lines(x)
assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1]))
d0 <- readr::read_lines(x)
assertthat::assert_that(grepl("COVERAGE SUMMARY", d0[1]))
# first detect if this is genome, QC coverage region, or target region
res <- raw |>
d1 <- d0 |>
tibble::as_tibble_col(column_name = "value") |>
tidyr::separate_wider_delim(
"value",
Expand All @@ -584,7 +630,7 @@ dragen_coverage_metrics_read <- function(x) {
)
reg1 <- NULL
str1 <- NULL
tmp <- res |>
tmp <- d1 |>
dplyr::filter(grepl("PCT of .* with coverage ", .data$var)) |>
dplyr::slice_head(n = 1) |>
dplyr::pull("var")
Expand All @@ -611,12 +657,13 @@ dragen_coverage_metrics_read <- function(x) {
# split to rename the
# "PCT of genome with coverage [100x: inf)" values
pat <- glue("PCT of {str1} with coverage ")
res1 <- res |>
res1 <- d1 |>
# pct just shows % for a couple rows which can be
# calculated from their above values
dplyr::filter(!grepl(pat, .data$var)) |>
dplyr::select("var", "value")
res2 <- res |>
dirty_names_cleaned(res1$var, names(abbrev_nm), x)
res2 <- d1 |>
dplyr::filter(grepl(pat, .data$var)) |>
dplyr::mutate(
var = sub(pat, "", .data$var),
Expand Down
18 changes: 9 additions & 9 deletions R/dragen_fastqc.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,14 @@ dragen_fastqc_metrics_read <- function(x) {
dplyr::select("mate", "seq", "bp", "value")

list(
positional_base_content = pos_base_cont,
positional_base_mean_quality = pos_base_mean_qual,
positional_quality = pos_qual,
read_gc_content = gc_cont,
read_gc_content_quality = gc_cont_qual,
read_lengths = read_len,
read_mean_quality = read_mean_qual,
sequence_positions = seq_pos
fqc_positionalBaseContent = pos_base_cont,
fqc_positionalBaseMeanQuality = pos_base_mean_qual,
fqc_positionalQuality = pos_qual,
fqc_readGCContent = gc_cont,
fqc_readGCContentQuality = gc_cont_qual,
fqc_readLengths = read_len,
fqc_readMeanQuality = read_mean_qual,
fqc_sequencePositions = seq_pos
) |>
tibble::enframe(name = "fastqc_name", value = "fastqc_value")
tibble::enframe(name = "name", value = "data")
}
2 changes: 1 addition & 1 deletion R/tso_dragen.R
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ Wf_dragen <- R6::R6Class(
#' @param x Path to file.
read_fastqcMetrics = function(x) {
dat <- dragen_fastqc_metrics_read(x)
tibble::tibble(name = "fastqcmetrics", data = list(dat[]))
dat
},
#' @description Read `gc_metrics.csv` file.
#' @param x Path to file.
Expand Down
15 changes: 15 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
dirty_names_cleaned <- function(dirty, clean, fname = NULL) {
assertthat::assert_that(
rlang::is_atomic(dirty),
rlang::is_atomic(clean)
)
are_dirty <- dirty[!dirty %in% clean]
msg <- paste(
"Following columns are dirty:",
paste(are_dirty, collapse = ", "),
ifelse(!is.null(fname), paste("Filename:", fname), ""),
sep = "\n"
)
assertthat::assert_that(length(are_dirty) == 0, msg = msg)
}

#' Print current timestamp for logging
#'
#' @return Current timestamp as character.
Expand Down
6 changes: 6 additions & 0 deletions man/dragen_cnv_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/dragen_coverage_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions man/dragen_gc_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/dragen_mapping_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/dragen_sv_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/dragen_trimmer_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/dragen_umi_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/dragen_vc_metrics_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9c32347

Please sign in to comment.