From 9bdd6cc443a41e90a3c9dadb68721abb82af7b84 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 12 Oct 2024 23:51:34 +1100 Subject: [PATCH 01/17] init dragen R6 class --- R/tso.R | 6 +- R/tso_dragen.R | 185 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 3 deletions(-) create mode 100644 R/tso_dragen.R diff --git a/R/tso.R b/R/tso.R index 048e993..a302003 100644 --- a/R/tso.R +++ b/R/tso.R @@ -119,13 +119,13 @@ Wf_tso_ctdna_tumor_only <- R6::R6Class( #' @description Read `MergedSmallVariants.vcf.gz` file. #' @param x Path to file. read_msv = function(x) { - dat <- bcftools_parse_vcf(x, only_pass = FALSE, alias = FALSE) + dat <- bcftools_parse_vcf(x, only_pass = FALSE, alias = TRUE) tibble::tibble(name = "mergedsmallv", data = list(dat)) }, #' @description Read `MergedSmallVariants.genome.vcf.gz` file. #' @param x Path to file. read_msvg = function(x) { - dat <- bcftools_parse_vcf(x, only_pass = FALSE, alias = FALSE) + dat <- bcftools_parse_vcf(x, only_pass = FALSE, alias = TRUE) tibble::tibble(name = "mergedsmallvg", data = list(dat)) }, #' @description Read `CombinedVariantOutput.tsv` file. @@ -137,7 +137,7 @@ Wf_tso_ctdna_tumor_only <- R6::R6Class( #' @description Read `CopyNumberVariants.vcf.gz` file. #' @param x Path to file. read_cnv = function(x) { - dat <- bcftools_parse_vcf(x, only_pass = FALSE, alias = FALSE) + dat <- bcftools_parse_vcf(x, only_pass = FALSE, alias = TRUE) tibble::tibble(name = "cnv", data = list(dat)) }, #' @description Read `fragment_length_hist.json.gz` file. diff --git a/R/tso_dragen.R b/R/tso_dragen.R new file mode 100644 index 0000000..f7ac52b --- /dev/null +++ b/R/tso_dragen.R @@ -0,0 +1,185 @@ +#' Wf_dragen R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `dragen` workflow. +#' +#' @examples +#' \dontrun{ +#' +#' #---- Local ----# +#' p <- file.path( +#' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", +#' "analysis/cttsov2/20240915ff0295ed" +#' ) +#' prefix <- "L2401290" +#' t1 <- Wf_tso_ctdna_tumor_only_v2$new(path = p, prefix = prefix) +#' t1$list_files(max_files = 100) +#' t1$list_files_filter_relevant(max_files = 300) +#' d <- t1$download_files(max_files = 100, dryrun = F) +#' d_tidy <- t1$tidy_files(d) +#' d_write <- t1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' } +#' @export +Wf_dragen <- R6::R6Class( + "Wf_dragen", + inherit = Wf, + public = list( + #' @field prefix The LibraryID prefix of the sample (needed for path lookup). + prefix = NULL, + #' @description Create a new Wf_dragen object. + #' @param path Path to directory with raw workflow results (from S3 or + #' local filesystem). + #' @param prefix The LibraryID prefix of the sample (needed for path lookup). + initialize = function(path = NULL, prefix = NULL) { + wname <- "dragen" + pref <- prefix + reg1 <- tibble::tribble( + ~regex, ~fun, + glue("{dc}/{pref}\\-replay\\.json$"), "replay", + glue("{dc}/{pref}\\.cnv_metrics.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{dc}/{pref}\\.exon_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.exon_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.exon_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.exon_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.fragment_length_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.gvcf_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.mapping_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{dc}/{pref}\\.target_bed_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.target_bed_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.time_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{dc}/{pref}\\.tmb_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.tmb_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{dc}/{pref}\\.wgs_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.wgs_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{dc}/{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY" + ) + regexes <- reg1 |> + dplyr::mutate( + fun = paste0("read_", .data$fun), + fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) + ) + + super$initialize(path = path, wname = wname, regexes = regexes) + self$prefix <- prefix + }, + #' @description Print details about the Workflow. + #' @param ... (ignored). + print = function(...) { + res <- tibble::tribble( + ~var, ~value, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem, + "prefix", self$prefix + ) + print(res) + invisible(self) + }, + #' @description Read `replay.json` file. + #' @param x Path to file. + read_replay = function(x) { + res <- x |> + jsonlite::read_json(simplifyVector = TRUE) |> + purrr::map_if(is.data.frame, tibble::as_tibble) + req_elements <- c("command_line", "hash_table_build", "dragen_config", "system") + assertthat::assert_that(all(names(res) %in% req_elements)) + res[["system"]] <- res[["system"]] |> + tibble::as_tibble_row() + res[["hash_table_build"]] <- res[["hash_table_build"]] |> + tibble::as_tibble_row() + # we don't care if the columns are characters, no analysis likely to be done on dragen options + # (though never say never!) + res[["dragen_config"]] <- res[["dragen_config"]] |> + tidyr::pivot_wider(names_from = "name", values_from = "value") + return(dplyr::bind_cols(res)) + }, + #' @description Read `contig_mean_cov.csv` file. + #' @param x Path to file. + read_contigMeanCov = function(x) { + readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> + dplyr::filter( + if (!keep_alt) { + !grepl("chrM|MT|_|Autosomal|HLA-|EBV", .data$chrom) + } else { + TRUE + } + ) + }, + #' @description Read `dragen.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_coverageMetrics = function(x) { + abbrev_nm <- c( + "Aligned bases" = "bases_aligned_dragen", + "Aligned bases in genome" = "bases_aligned_in_genome_dragen", + "Average alignment coverage over genome" = "cov_alignment_avg_over_genome_dragen", + "Uniformity of coverage (PCT > 0.2*mean) over genome" = "cov_uniformity_over_genome_pct_gt02mean_dragen", + "Uniformity of coverage (PCT > 0.4*mean) over genome" = "cov_uniformity_over_genome_pct_gt04mean_dragen", + "Average chr X coverage over genome" = "cov_avg_x_over_genome_dragen", + "Average chr Y coverage over genome" = "cov_avg_y_over_genome_dragen", + "Average mitochondrial coverage over genome" = "cov_avg_mt_over_genome_dragen", + "Average autosomal coverage over genome" = "cov_avg_auto_over_genome_dragen", + "Median autosomal coverage over genome" = "cov_median_auto_over_genome_dragen", + "Mean/Median autosomal coverage ratio over genome" = "cov_mean_median_auto_ratio_over_genome_dragen", + "Aligned reads" = "reads_aligned_dragen", + "Aligned reads in genome" = "reads_aligned_in_genome_dragen" + ) + raw <- readr::read_lines(x) + assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) + + res <- raw |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + delim = ",", too_few = "align_start", + names = c("category", "dummy1", "var", "value", "pct") + ) + # split to rename the + # "PCT of genome with coverage [100x: inf)" values + res1 <- res |> + # pct just shows 100% for a couple rows + dplyr::filter(!grepl("PCT of genome with coverage", .data$var)) |> + dplyr::select("var", "value") + res2 <- res |> + dplyr::filter(grepl("PCT of genome with coverage", .data$var)) |> + dplyr::mutate( + var = sub("PCT of genome with coverage ", "", .data$var), + var = gsub("\\[|\\]|\\(|\\)| ", "", .data$var), + var = gsub("x", "", .data$var), + var = gsub("inf", "Inf", .data$var) + ) |> + tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> + dplyr::mutate(var = as.character(glue("cov_genome_pct_{start}_{end}_dragen"))) |> + dplyr::select("var", "value") + res <- dplyr::bind_rows(res1, res2) |> + dplyr::mutate( + value = dplyr::na_if(.data$value, "NA"), + value = as.numeric(.data$value), + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") + return(res) + }, + ) # end public +) # end Wf_dragen From 283db95f7496280a80cc9b81f21dec6db656858e Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 13 Oct 2024 23:38:54 +1100 Subject: [PATCH 02/17] alignqc: join metrics tabs with meta --- .../alignment_qc/dl_and_tidy.R | 4 +- .../umccr_workflows/alignment_qc/summary.Rmd | 60 ++++++++++++------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R index f5ed36b..9fbfa12 100755 --- a/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/alignment_qc/dl_and_tidy.R @@ -14,8 +14,6 @@ c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION") |> stopifnot() icav1_token <- Sys.getenv("ICA_ACCESS_TOKEN") |> dracarys::ica_token_validate() -# this helps keep annoying reticulate prompt away -Sys.setenv(RETICULATE_PYTHON = Sys.getenv("CONDA_PYTHON_EXE")) query_workflow_alignqc <- function(start_date) { wfs <- c("wgs_alignment_qc", "wts_alignment_qc") |> @@ -37,7 +35,7 @@ query_limsrow_libids <- function(libids) { } # first read in the workflows table, extract metadata, then join with lims -start_date <- "2024-09-27" +start_date <- "2024-10-11" p_raw <- query_workflow_alignqc(start_date) wgs <- p_raw |> diff --git a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd b/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd index b40fbe7..95807e2 100644 --- a/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd +++ b/inst/rmd/umccr_workflows/alignment_qc/summary.Rmd @@ -9,7 +9,7 @@ output: highlight: kate params: title: "UMCCR Alignment QC Summary Report" - meta: !r here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-09-27_wgts.rds") + meta: !r here::here("inst/rmd/umccr_workflows/alignment_qc/nogit/meta/2024-10-11_wgts.rds") description: "UMCCR Alignment QC Summary Report" title: "`r params$title`" --- @@ -239,7 +239,14 @@ conf <- list( pink_range = c(8, 20) ) d_map |> - dt_view() |> + left_join( + meta |> + mutate(umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")) |> + select(umccrid, assay, workflow, project_name, project_owner), + by = "umccrid" + ) |> + select(umccrid, phenotype, type, source, assay, workflow, project_name, project_owner, everything()) |> + dt_view(scroll_y = 1500) |> DT::formatCurrency(columns = names(d_map)[num_pct_cols], currency = "", digits = 1) |> DT::formatCurrency(columns = names(d_map)[no_numpct_cols], currency = "", digits = 0) |> DT::formatStyle( @@ -304,7 +311,14 @@ d_cvg <- dr_unnest("WgsCoverageMetricsFile") |> ) num_cols <- names(d_cvg)[purrr::map_lgl(d_cvg, is.numeric)] d_cvg |> - dt_view() |> + left_join( + meta |> + mutate(umccrid = glue("{.data$SubjectID}_{.data$LibraryID}_LN{.data$lane}")) |> + select(umccrid, assay, workflow, project_name, project_owner), + by = "umccrid" + ) |> + select(umccrid, phenotype, type, source, assay, workflow, project_name, project_owner, everything()) |> + dt_view(scroll_y = 1500) |> DT::formatCurrency(columns = num_cols, currency = "", digits = 1) |> DT::formatStyle( "ploidy", @@ -392,7 +406,7 @@ f1_plot <- ggplot() + fill = rep(fqc_colours1$col, length(unique(f1$type))), alpha = 0.7 ) + - geom_line(data = f1, aes(x = q, y = prop, colour = umccrid, linetype = mate), linewidth = 1) + + geom_line(data = f1, aes(x = q, y = prop, colour = umccrid, linetype = mate), linewidth = 1, show.legend = FALSE) + scale_y_continuous(labels = scales::label_comma()) + theme(panel.grid.major = element_blank()) + facet_wrap(~type, ncol = 1) + @@ -401,8 +415,8 @@ f1_plot <- ggplot() + subtitle = glue("Percentage of reads with average quality scores. Shows if\na subset of reads has poor quality.") ) -# plotly::ggplotly(f1_plot) -f1_plot +plotly::ggplotly(f1_plot) +# f1_plot ``` ### GC Content ('Per-Sequence GC Content') @@ -427,8 +441,8 @@ gc_data_plot <- gc_data |> title = "Read GC Content", subtitle = glue("Total number of reads with each GC content\npercentile between 0% and 100%") ) -# plotly::ggplotly(gc_data_plot) -gc_data_plot +plotly::ggplotly(gc_data_plot) +# gc_data_plot ``` ### GC Content Quality ('GC Content Mean Quality Scores') @@ -454,15 +468,15 @@ f1_plot <- ggplot() + title = "GC Content Quality", subtitle = glue("Average Phred-scale read mean quality for reads with\neach GC content percentile between 0% and 100%.") ) -# plotly::ggplotly(f1_plot) -f1_plot +plotly::ggplotly(f1_plot) +# f1_plot ``` ### Positional Base Content ('Per-Position Sequence Content') - TODO: create heatmap instead -```{r fqc_pbc, eval=T, fig.height=42} +```{r fqc_pbc, eval=F, fig.height=42} f1 <- dr_unnest("FastqcMetricsFile_positional_base_content") f1 |> filter(base != "N") |> @@ -484,7 +498,7 @@ f1 |> ### Positional Base Mean Quality ('Per-Position Mean Quality Scores') -```{r fqc_bmq, eval=T, fig.height=80} +```{r fqc_bmq, eval=F, fig.height=80} f1 <- dr_unnest("FastqcMetricsFile_positional_base_mean_quality") ggplot() + geom_rect( @@ -508,7 +522,7 @@ ggplot() + ### Positional Quality ('Per-Position Quality Score Ranges') -```{r fqc_pq, eval=T, fig.width=13} +```{r fqc_pq, eval=F, fig.width=13} # TODO: use boxplot instead of point f1 <- dr_unnest("FastqcMetricsFile_positional_quality") quants <- c(25, 50, 75) @@ -546,8 +560,8 @@ read_len_plot <- read_len |> title = "Read Lengths", subtitle = glue("Read percentage with each observed length.") ) -# plotly::ggplotly(read_len_plot) -read_len_plot +plotly::ggplotly(read_len_plot) +# read_len_plot ``` ### Sequence Positions ('Adapter Content') @@ -569,7 +583,7 @@ f1 |> ## Coverage {.tabset .tabset-pills} -```{r contig_cvg, eval=T, results='asis', fig.height=5} +```{r contig_cvg, eval=F, results='asis', fig.height=5} d1 <- dr_unnest("WgsContigMeanCovFile") |> arrange(desc("umccrid")) for (type1 in sort(unique(d1$type), decreasing = FALSE)) { @@ -608,13 +622,13 @@ flp <- fl1 |> xlab("Fragment Length (bp)") + ylab(glue("Read Count (min: {min_count})")) + theme( - legend.position = c(0.9, 0.9), + legend.position.inside = c(0.9, 0.9), legend.justification = c(1, 1), panel.grid.minor = ggplot2::element_blank(), plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold") ) -# plotly::ggplotly(flp) -flp +plotly::ggplotly(flp) +# flp ``` --- @@ -638,8 +652,8 @@ d_pl_plot <- d_pl_plot_data |> geom_line(aes(colour = umccrid, group = umccrid), na.rm = TRUE) + geom_point(aes(colour = umccrid), na.rm = TRUE) + labs(title = "Chromosome Median / Autosomal Median") -# plotly::ggplotly(d_pl_plot) -d_pl_plot +plotly::ggplotly(d_pl_plot) +# d_pl_plot ``` @@ -671,12 +685,12 @@ d_hist2 <- d_hist |> x = "Coverage", y = "PCT Cumsum", title = "Percentage of sites in genome with given coverage" ) -plotly::subplot(d_hist1, d_hist2, shareY = TRUE, titleY = TRUE, titleX = TRUE, nrows = 2) +# plotly::subplot(d_hist1, d_hist2, shareY = TRUE, titleY = TRUE, titleX = TRUE, nrows = 2) ``` ## FineHist -```{r finehist, eval=T, fig.height=10, fig.width=12} +```{r finehist, eval=F, fig.height=10, fig.width=12} d_fhist <- dr_unnest("WgsFineHistFile") d_fhist |> dracarys::WgsFineHistFile$public_methods$plot(c(0, 150)) + From 733476843ce0937e4568dc45782c26fa978406cf Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 14 Oct 2024 23:58:33 +1100 Subject: [PATCH 03/17] dragen: generalise coverage metrics --- NAMESPACE | 1 + R/tso_dragen.R | 152 ++++++++++++++++++++++++----------------- man/Wf_dragen.Rd | 171 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+), 60 deletions(-) create mode 100644 man/Wf_dragen.Rd diff --git a/NAMESPACE b/NAMESPACE index 4cf547e..ec8e62c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(TimeMetricsFile) export(TrimmerMetricsFile) export(VCMetricsFile) export(Wf) +export(Wf_dragen) export(Wf_sash) export(Wf_sash_download_tidy_write) export(Wf_tso_ctdna_tumor_only) diff --git a/R/tso_dragen.R b/R/tso_dragen.R index f7ac52b..e7dd346 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -7,12 +7,13 @@ #' \dontrun{ #' #' #---- Local ----# +#' prefix <- "L2401290" #' p <- file.path( #' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", -#' "analysis/cttsov2/20240915ff0295ed" +#' "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", +#' prefix #' ) -#' prefix <- "L2401290" -#' t1 <- Wf_tso_ctdna_tumor_only_v2$new(path = p, prefix = prefix) +#' t1 <- Wf_dragen$new(path = p, prefix = prefix) #' t1$list_files(max_files = 100) #' t1$list_files_filter_relevant(max_files = 300) #' d <- t1$download_files(max_files = 100, dryrun = F) @@ -40,40 +41,40 @@ Wf_dragen <- R6::R6Class( pref <- prefix reg1 <- tibble::tribble( ~regex, ~fun, - glue("{dc}/{pref}\\-replay\\.json$"), "replay", - glue("{dc}/{pref}\\.cnv_metrics.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{dc}/{pref}\\.exon_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.fragment_length_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.gvcf_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.mapping_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{dc}/{pref}\\.target_bed_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.time_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{dc}/{pref}\\.tmb_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{dc}/{pref}\\.wgs_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY" + glue("{pref}\\-replay\\.json$"), "replay", + glue("{pref}\\.cnv_metrics.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.exon_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.exon_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.exon_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.exon_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.fragment_length_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.gvcf_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.mapping_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", + glue("{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", + glue("{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.time_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.tmb_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.wgs_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.wgs_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY" ) regexes <- reg1 |> dplyr::mutate( @@ -117,11 +118,12 @@ Wf_dragen <- R6::R6Class( }, #' @description Read `contig_mean_cov.csv` file. #' @param x Path to file. - read_contigMeanCov = function(x) { + #' @param keep_alt Keep ALT contigs. + read_contigMeanCov = function(x, keep_alt = FALSE) { readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> dplyr::filter( if (!keep_alt) { - !grepl("chrM|MT|_|Autosomal|HLA-|EBV", .data$chrom) + !grepl("chrM|MT|_|Autosomal|HLA-|EBV|GL|hs37d5", .data$chrom) } else { TRUE } @@ -130,24 +132,26 @@ Wf_dragen <- R6::R6Class( #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. read_coverageMetrics = function(x) { - abbrev_nm <- c( - "Aligned bases" = "bases_aligned_dragen", - "Aligned bases in genome" = "bases_aligned_in_genome_dragen", - "Average alignment coverage over genome" = "cov_alignment_avg_over_genome_dragen", - "Uniformity of coverage (PCT > 0.2*mean) over genome" = "cov_uniformity_over_genome_pct_gt02mean_dragen", - "Uniformity of coverage (PCT > 0.4*mean) over genome" = "cov_uniformity_over_genome_pct_gt04mean_dragen", - "Average chr X coverage over genome" = "cov_avg_x_over_genome_dragen", - "Average chr Y coverage over genome" = "cov_avg_y_over_genome_dragen", - "Average mitochondrial coverage over genome" = "cov_avg_mt_over_genome_dragen", - "Average autosomal coverage over genome" = "cov_avg_auto_over_genome_dragen", - "Median autosomal coverage over genome" = "cov_median_auto_over_genome_dragen", - "Mean/Median autosomal coverage ratio over genome" = "cov_mean_median_auto_ratio_over_genome_dragen", - "Aligned reads" = "reads_aligned_dragen", - "Aligned reads in genome" = "reads_aligned_in_genome_dragen" + # all rows except 'Aligned bases' and 'Aligned reads' refer to the region + abbrev_nm <- tibble::tribble( + ~raw, ~clean, ~region, + "Aligned bases", "bases_aligned_tot_dragen", FALSE, + "Aligned reads", "reads_aligned_tot_dragen", FALSE, + "Aligned bases in ", "bases_aligned_", TRUE, + "Average alignment coverage over ", "cov_alignment_avg_over_", TRUE, + "Uniformity of coverage (PCT > 0.2*mean) over ", "cov_uniformity_pct_gt02mean_", TRUE, + "Uniformity of coverage (PCT > 0.4*mean) over ", "cov_uniformity_pct_gt04mean_", TRUE, + "Average chr X coverage over ", "cov_avg_x_over_", TRUE, + "Average chr Y coverage over ", "cov_avg_y_over_", TRUE, + "Average mitochondrial coverage over ", "cov_avg_mt_over_", TRUE, + "Average autosomal coverage over ", "cov_avg_auto_over_", TRUE, + "Median autosomal coverage over ", "cov_median_auto_over_", TRUE, + "Mean/Median autosomal coverage ratio over ", "cov_mean_median_auto_ratio_over_", TRUE, + "Aligned reads in ", "reads_aligned_in_", TRUE ) raw <- readr::read_lines(x) assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) - + # first detect if this is genome, QC coverage region, or target region res <- raw |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( @@ -155,22 +159,50 @@ Wf_dragen <- R6::R6Class( delim = ",", too_few = "align_start", names = c("category", "dummy1", "var", "value", "pct") ) + reg1 <- NULL + str1 <- NULL + tmp <- res |> + dplyr::filter(grepl("PCT of .* with coverage ", .data$var)) |> + dplyr::slice_head(n = 1) |> + dplyr::pull("var") + assertthat::assert_that(length(tmp) == 1) + if (grepl("genome", tmp)) { + str1 <- "genome" + reg1 <- "genome" + } else if (grepl("QC coverage region", tmp)) { + str1 <- "QC coverage region" + reg1 <- "qccovreg" + } else if (grepl("target region", tmp)) { + str1 <- "target region" + reg1 <- "targetreg" + } else { + cli::cli_abort("Cannot determine the coverage region from: {x}") + } + abbrev_nm <- abbrev_nm |> + dplyr::mutate( + raw = ifelse(.data$region, glue("{.data$raw}{str1}"), .data$raw), + clean = ifelse(.data$region, glue("{.data$clean}{reg1}_dragen"), .data$clean) + ) |> + dplyr::select("raw", "clean") |> + tibble::deframe() # split to rename the # "PCT of genome with coverage [100x: inf)" values + pat <- glue("PCT of {str1} with coverage ") res1 <- res |> - # pct just shows 100% for a couple rows - dplyr::filter(!grepl("PCT of genome with coverage", .data$var)) |> + # pct just shows % for a couple rows which can be + # calculated from their above values + dplyr::filter(!grepl(pat, .data$var)) |> dplyr::select("var", "value") res2 <- res |> - dplyr::filter(grepl("PCT of genome with coverage", .data$var)) |> + dplyr::filter(grepl(pat, .data$var)) |> dplyr::mutate( - var = sub("PCT of genome with coverage ", "", .data$var), + var = sub(pat, "", .data$var), var = gsub("\\[|\\]|\\(|\\)| ", "", .data$var), var = gsub("x", "", .data$var), var = gsub("inf", "Inf", .data$var) ) |> tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate(var = as.character(glue("cov_genome_pct_{start}_{end}_dragen"))) |> + dplyr::mutate(var = as.character(glue("cov_pct_{start}_{end}_{region}_dragen"))) |> dplyr::select("var", "value") res <- dplyr::bind_rows(res1, res2) |> dplyr::mutate( @@ -180,6 +212,6 @@ Wf_dragen <- R6::R6Class( ) |> tidyr::pivot_wider(names_from = "var", values_from = "value") return(res) - }, + } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd new file mode 100644 index 0000000..1c57315 --- /dev/null +++ b/man/Wf_dragen.Rd @@ -0,0 +1,171 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tso_dragen.R +\name{Wf_dragen} +\alias{Wf_dragen} +\title{Wf_dragen R6 Class} +\description{ +Reads and writes tidy versions of files from the \code{dragen} workflow. +} +\examples{ +\dontrun{ + +#---- Local ----# +prefix <- "L2401290" +p <- file.path( + "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", + "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", + prefix +) +t1 <- Wf_dragen$new(path = p, prefix = prefix) +t1$list_files(max_files = 100) +t1$list_files_filter_relevant(max_files = 300) +d <- t1$download_files(max_files = 100, dryrun = F) +d_tidy <- t1$tidy_files(d) +d_write <- t1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = prefix, + format = "tsv" +) +} +} +\section{Super class}{ +\code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_dragen} +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{prefix}}{The LibraryID prefix of the sample (needed for path lookup).} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Wf_dragen-new}{\code{Wf_dragen$new()}} +\item \href{#method-Wf_dragen-print}{\code{Wf_dragen$print()}} +\item \href{#method-Wf_dragen-read_replay}{\code{Wf_dragen$read_replay()}} +\item \href{#method-Wf_dragen-read_contigMeanCov}{\code{Wf_dragen$read_contigMeanCov()}} +\item \href{#method-Wf_dragen-read_coverageMetrics}{\code{Wf_dragen$read_coverageMetrics()}} +\item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} +} +} +\if{html}{\out{ +
Inherited methods + +
+}} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-new}{}}} +\subsection{Method \code{new()}}{ +Create a new Wf_dragen object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$new(path = NULL, prefix = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path to directory with raw workflow results (from S3 or +local filesystem).} + +\item{\code{prefix}}{The LibraryID prefix of the sample (needed for path lookup).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-print}{}}} +\subsection{Method \code{print()}}{ +Print details about the Workflow. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_replay}{}}} +\subsection{Method \code{read_replay()}}{ +Read \code{replay.json} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_replay(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_contigMeanCov}{}}} +\subsection{Method \code{read_contigMeanCov()}}{ +Read \code{contig_mean_cov.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_contigMeanCov(x, keep_alt = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} + +\item{\code{keep_alt}}{Keep ALT contigs.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_coverageMetrics}{}}} +\subsection{Method \code{read_coverageMetrics()}}{ +Read \code{dragen.tsv.gz} cancer report hrd file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_coverageMetrics(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} From c0ef098f6b8fdca54406c0096bd8168bf3e7d56c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Tue, 15 Oct 2024 10:20:27 +1100 Subject: [PATCH 04/17] dragen: more parsers --- NAMESPACE | 1 + R/Wf.R | 3 +- R/dragen.R | 90 ++++++++++++++ R/tso_dragen.R | 116 ++++++------------ man/Wf_dragen.Rd | 42 ++++++- man/dragen_coverage_metrics_read.Rd | 17 +++ .../test-roxytest-testexamples-dragen.R | 2 +- 7 files changed, 185 insertions(+), 86 deletions(-) create mode 100644 man/dragen_coverage_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index ec8e62c..859e3dc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,7 @@ export(dr_func_eval) export(dr_gds_download) export(dr_output_format_valid) export(dr_s3_download) +export(dragen_coverage_metrics_read) export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) export(empty_tbl) diff --git a/R/Wf.R b/R/Wf.R index 7c2edab..4f23fc6 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -92,7 +92,8 @@ Wf <- R6::R6Class( "oncoanalyser_wgts_existing_both", "sash" ) - assertthat::assert_that(wname %in% wnames) + subwnames <- c("dragen") + assertthat::assert_that(wname %in% c(wnames, subwnames)) self$path <- sub("/$", "", path) # remove potential trailing slash self$wname <- wname self$filesystem <- dplyr::case_when( diff --git a/R/dragen.R b/R/dragen.R index 7e7459c..95790b0 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -1,3 +1,93 @@ +#' Read DRAGEN Coverage Metrics +#' +#' Reads the `coverage_metrics.csv` file generated by DRAGEN. +#' @param x Path to file. +#' +#' @return Tibble with data in list column. +#' @export +dragen_coverage_metrics_read <- function(x) { + # all rows except 'Aligned bases' and 'Aligned reads' refer to the region + abbrev_nm <- tibble::tribble( + ~raw, ~clean, ~region, + "Aligned bases", "bases_aligned_tot_dragen", FALSE, + "Aligned reads", "reads_aligned_tot_dragen", FALSE, + "Aligned bases in ", "bases_aligned_", TRUE, + "Average alignment coverage over ", "cov_alignment_avg_over_", TRUE, + "Uniformity of coverage (PCT > 0.2*mean) over ", "cov_uniformity_pct_gt02mean_", TRUE, + "Uniformity of coverage (PCT > 0.4*mean) over ", "cov_uniformity_pct_gt04mean_", TRUE, + "Average chr X coverage over ", "cov_avg_x_over_", TRUE, + "Average chr Y coverage over ", "cov_avg_y_over_", TRUE, + "Average mitochondrial coverage over ", "cov_avg_mt_over_", TRUE, + "Average autosomal coverage over ", "cov_avg_auto_over_", TRUE, + "Median autosomal coverage over ", "cov_median_auto_over_", TRUE, + "Mean/Median autosomal coverage ratio over ", "cov_mean_median_auto_ratio_over_", TRUE, + "Aligned reads in ", "reads_aligned_in_", TRUE + ) + raw <- readr::read_lines(x) + assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) + # first detect if this is genome, QC coverage region, or target region + res <- raw |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + delim = ",", too_few = "align_start", + names = c("category", "dummy1", "var", "value", "pct") + ) + reg1 <- NULL + str1 <- NULL + tmp <- res |> + dplyr::filter(grepl("PCT of .* with coverage ", .data$var)) |> + dplyr::slice_head(n = 1) |> + dplyr::pull("var") + assertthat::assert_that(length(tmp) == 1) + if (grepl("genome", tmp)) { + str1 <- "genome" + reg1 <- "genome" + } else if (grepl("QC coverage region", tmp)) { + str1 <- "QC coverage region" + reg1 <- "qccovreg" + } else if (grepl("target region", tmp)) { + str1 <- "target region" + reg1 <- "targetreg" + } else { + cli::cli_abort("Cannot determine the coverage region from: {x}") + } + abbrev_nm <- abbrev_nm |> + dplyr::mutate( + raw = ifelse(.data$region, glue("{.data$raw}{str1}"), .data$raw), + clean = ifelse(.data$region, glue("{.data$clean}{reg1}_dragen"), .data$clean) + ) |> + dplyr::select("raw", "clean") |> + tibble::deframe() + # split to rename the + # "PCT of genome with coverage [100x: inf)" values + pat <- glue("PCT of {str1} with coverage ") + res1 <- res |> + # pct just shows % for a couple rows which can be + # calculated from their above values + dplyr::filter(!grepl(pat, .data$var)) |> + dplyr::select("var", "value") + res2 <- res |> + dplyr::filter(grepl(pat, .data$var)) |> + dplyr::mutate( + var = sub(pat, "", .data$var), + var = gsub("\\[|\\]|\\(|\\)| ", "", .data$var), + var = gsub("x", "", .data$var), + var = gsub("inf", "Inf", .data$var) + ) |> + tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> + dplyr::mutate(var = as.character(glue("cov_pct_{start}_{end}_{reg1}_dragen"))) |> + dplyr::select("var", "value") + res <- dplyr::bind_rows(res1, res2) |> + dplyr::mutate( + value = dplyr::na_if(.data$value, "NA"), + value = as.numeric(.data$value), + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") + return(res) +} + #' WgsContigMeanCovFile R6 Class #' #' @description diff --git a/R/tso_dragen.R b/R/tso_dragen.R index e7dd346..e37bd53 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -13,8 +13,8 @@ #' "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", #' prefix #' ) -#' t1 <- Wf_dragen$new(path = p, prefix = prefix) -#' t1$list_files(max_files = 100) +#' d1 <- Wf_dragen$new(path = p, prefix = prefix) +#' d1$list_files(max_files = 100) #' t1$list_files_filter_relevant(max_files = 300) #' d <- t1$download_files(max_files = 100, dryrun = F) #' d_tidy <- t1$tidy_files(d) @@ -114,13 +114,14 @@ Wf_dragen <- R6::R6Class( # (though never say never!) res[["dragen_config"]] <- res[["dragen_config"]] |> tidyr::pivot_wider(names_from = "name", values_from = "value") - return(dplyr::bind_cols(res)) + dat <- dplyr::bind_cols(res) + tibble::tibble(name = "replay", data = list(dat)) }, #' @description Read `contig_mean_cov.csv` file. #' @param x Path to file. #' @param keep_alt Keep ALT contigs. read_contigMeanCov = function(x, keep_alt = FALSE) { - readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> + dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> dplyr::filter( if (!keep_alt) { !grepl("chrM|MT|_|Autosomal|HLA-|EBV|GL|hs37d5", .data$chrom) @@ -128,90 +129,43 @@ Wf_dragen <- R6::R6Class( TRUE } ) + tibble::tibble(name = "contigmeancov", data = list(dat[])) }, - #' @description Read `dragen.tsv.gz` cancer report hrd file. + #' @description Read `coverage_metrics.csv` file. #' @param x Path to file. read_coverageMetrics = function(x) { - # all rows except 'Aligned bases' and 'Aligned reads' refer to the region - abbrev_nm <- tibble::tribble( - ~raw, ~clean, ~region, - "Aligned bases", "bases_aligned_tot_dragen", FALSE, - "Aligned reads", "reads_aligned_tot_dragen", FALSE, - "Aligned bases in ", "bases_aligned_", TRUE, - "Average alignment coverage over ", "cov_alignment_avg_over_", TRUE, - "Uniformity of coverage (PCT > 0.2*mean) over ", "cov_uniformity_pct_gt02mean_", TRUE, - "Uniformity of coverage (PCT > 0.4*mean) over ", "cov_uniformity_pct_gt04mean_", TRUE, - "Average chr X coverage over ", "cov_avg_x_over_", TRUE, - "Average chr Y coverage over ", "cov_avg_y_over_", TRUE, - "Average mitochondrial coverage over ", "cov_avg_mt_over_", TRUE, - "Average autosomal coverage over ", "cov_avg_auto_over_", TRUE, - "Median autosomal coverage over ", "cov_median_auto_over_", TRUE, - "Mean/Median autosomal coverage ratio over ", "cov_mean_median_auto_ratio_over_", TRUE, - "Aligned reads in ", "reads_aligned_in_", TRUE - ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) - # first detect if this is genome, QC coverage region, or target region - res <- raw |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim( - "value", - delim = ",", too_few = "align_start", - names = c("category", "dummy1", "var", "value", "pct") - ) - reg1 <- NULL - str1 <- NULL - tmp <- res |> - dplyr::filter(grepl("PCT of .* with coverage ", .data$var)) |> - dplyr::slice_head(n = 1) |> - dplyr::pull("var") - assertthat::assert_that(length(tmp) == 1) - if (grepl("genome", tmp)) { - str1 <- "genome" - reg1 <- "genome" - } else if (grepl("QC coverage region", tmp)) { - str1 <- "QC coverage region" - reg1 <- "qccovreg" - } else if (grepl("target region", tmp)) { - str1 <- "target region" - reg1 <- "targetreg" - } else { - cli::cli_abort("Cannot determine the coverage region from: {x}") - } - abbrev_nm <- abbrev_nm |> - dplyr::mutate( - raw = ifelse(.data$region, glue("{.data$raw}{str1}"), .data$raw), - clean = ifelse(.data$region, glue("{.data$clean}{reg1}_dragen"), .data$clean) - ) |> - dplyr::select("raw", "clean") |> - tibble::deframe() - # split to rename the - # "PCT of genome with coverage [100x: inf)" values - pat <- glue("PCT of {str1} with coverage ") - res1 <- res |> - # pct just shows % for a couple rows which can be - # calculated from their above values - dplyr::filter(!grepl(pat, .data$var)) |> - dplyr::select("var", "value") - res2 <- res |> - dplyr::filter(grepl(pat, .data$var)) |> + dat <- dragen_coverage_metrics_read(x) + tibble::tibble(name = "covmetrics", data = list(dat)) + }, + #' @description Read `fine_hist.csv` file. + #' @param x Path to file. + read_fineHist = function(x) { + d <- readr::read_csv(x, col_types = "cd") + assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) + # there's a max Depth of 2000+, so convert to numeric for easier plotting + dat <- d |> dplyr::mutate( - var = sub(pat, "", .data$var), - var = gsub("\\[|\\]|\\(|\\)| ", "", .data$var), - var = gsub("x", "", .data$var), - var = gsub("inf", "Inf", .data$var) + Depth = ifelse(grepl("+", .data$Depth), sub("(\\d*)\\+", "\\1", .data$Depth), .data$Depth), + Depth = as.integer(.data$Depth) ) |> - tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate(var = as.character(glue("cov_pct_{start}_{end}_{region}_dragen"))) |> - dplyr::select("var", "value") - res <- dplyr::bind_rows(res1, res2) |> + dplyr::select(depth = "Depth", n_loci = "Overall") + tibble::tibble(name = "finehist", data = list(dat)) + }, + #' @description Read `fragment_length_hist.csv` file. + #' @param x Path to file. + read_fragmentLengthHist = function(x) { + d <- readr::read_lines(x) + assertthat::assert_that(grepl("#Sample", d[1])) + dat <- d |> + tibble::enframe(name = "name", value = "value") |> + dplyr::filter(!grepl("#Sample: |FragmentLength,Count", .data$value)) |> + tidyr::separate_wider_delim(cols = "value", names = c("fragmentLength", "count"), delim = ",") |> dplyr::mutate( - value = dplyr::na_if(.data$value, "NA"), - value = as.numeric(.data$value), - var = dplyr::recode(.data$var, !!!abbrev_nm) + count = as.numeric(.data$count), + fragmentLength = as.numeric(.data$fragmentLength) ) |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - return(res) + dplyr::select("fragmentLength", "count") + tibble::tibble(name = "fraglen", data = list(dat)) } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 1c57315..3402faf 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -16,8 +16,8 @@ p <- file.path( "analysis/cttsov2/20240915ff0295ed/Logs_Intermediates/DragenCaller", prefix ) -t1 <- Wf_dragen$new(path = p, prefix = prefix) -t1$list_files(max_files = 100) +d1 <- Wf_dragen$new(path = p, prefix = prefix) +d1$list_files(max_files = 100) t1$list_files_filter_relevant(max_files = 300) d <- t1$download_files(max_files = 100, dryrun = F) d_tidy <- t1$tidy_files(d) @@ -47,6 +47,8 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_replay}{\code{Wf_dragen$read_replay()}} \item \href{#method-Wf_dragen-read_contigMeanCov}{\code{Wf_dragen$read_contigMeanCov()}} \item \href{#method-Wf_dragen-read_coverageMetrics}{\code{Wf_dragen$read_coverageMetrics()}} +\item \href{#method-Wf_dragen-read_fineHist}{\code{Wf_dragen$read_fineHist()}} +\item \href{#method-Wf_dragen-read_fragmentLengthHist}{\code{Wf_dragen$read_fragmentLengthHist()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -138,11 +140,45 @@ Read \code{contig_mean_cov.csv} file. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Wf_dragen-read_coverageMetrics}{}}} \subsection{Method \code{read_coverageMetrics()}}{ -Read \code{dragen.tsv.gz} cancer report hrd file. +Read \code{coverage_metrics.csv} file. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Wf_dragen$read_coverageMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_fineHist}{}}} +\subsection{Method \code{read_fineHist()}}{ +Read \code{fine_hist.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_fineHist(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_fragmentLengthHist}{}}} +\subsection{Method \code{read_fragmentLengthHist()}}{ +Read \code{fragment_length_hist.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_fragmentLengthHist(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dragen_coverage_metrics_read.Rd b/man/dragen_coverage_metrics_read.Rd new file mode 100644 index 0000000..a81e906 --- /dev/null +++ b/man/dragen_coverage_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_coverage_metrics_read} +\alias{dragen_coverage_metrics_read} +\title{Read DRAGEN Coverage Metrics} +\usage{ +dragen_coverage_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with data in list column. +} +\description{ +Reads the \code{coverage_metrics.csv} file generated by DRAGEN. +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index 1caa4ee..0054e64 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L789", { +test_that("Function time_metrics_process() @ L879", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From c1e4aa3e90f241ca78c4bd5b029d9c45fb68c9f0 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 16 Oct 2024 02:20:44 +1100 Subject: [PATCH 05/17] dragen: more parsers --- NAMESPACE | 1 + R/dragen.R | 127 +++++++++++++++++- R/tso_dragen.R | 59 ++++---- man/Wf_dragen.Rd | 24 +++- man/dragen_coverage_metrics_read.Rd | 2 +- man/dragen_mapping_metrics_read.Rd | 17 +++ .../test-roxytest-testexamples-dragen.R | 2 +- 7 files changed, 201 insertions(+), 31 deletions(-) create mode 100644 man/dragen_mapping_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 859e3dc..5c6f744 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ export(dr_gds_download) export(dr_output_format_valid) export(dr_s3_download) export(dragen_coverage_metrics_read) +export(dragen_mapping_metrics_read) export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) export(empty_tbl) diff --git a/R/dragen.R b/R/dragen.R index 95790b0..082ca1a 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -1,9 +1,134 @@ +dragen_subprefix <- function(x, suffix) { + # L2401290.exon_contig_mean_cov.csv -> exon + # L2401290.target_bed_contig_mean_cov.csv -> target_bed + # L2401290.tmb_contig_mean_cov.csv -> tmb + # L2401290.wgs_contig_mean_cov.csv -> wgs + bname <- basename(x) + s1 <- tools::file_path_sans_ext(bname) + s2 <- sub(".*\\.(.*)", "\\1", s1) + sub(suffix, "", s2) +} + +#' Read DRAGEN Mapping Metrics +#' +#' Reads the `mapping_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_mapping_metrics_read <- function(x) { + abbrev_nm <- c( + "Total input reads" = "reads_tot_input_dragen", + "Number of duplicate marked reads" = "reads_num_dupmarked_dragen", + "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed_dragen", + "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq_dragen", + "Reads with mate sequenced" = "reads_w_mate_seq_dragen", + "Reads without mate sequenced" = "reads_wo_mate_seq_dragen", + "QC-failed reads" = "reads_qcfail_dragen", + "Mapped reads" = "reads_mapped_dragen", + "Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt_dragen", + "Mapped reads R1" = "reads_mapped_r1_dragen", + "Mapped reads R2" = "reads_mapped_r2_dragen", + "Number of unique & mapped reads (excl. duplicate marked reads)" = "reads_num_uniq_mapped_dragen", + "Unmapped reads" = "reads_unmapped_dragen", + "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt_dragen", + "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj_dragen", + "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton_dragen", + "Paired reads (itself & mate mapped)" = "reads_paired_dragen", + "Properly paired reads" = "reads_paired_proper_dragen", + "Not properly paired reads (discordant)" = "reads_discordant_dragen", + "Paired reads mapped to different chromosomes" = "reads_paired_mapped_diff_chrom_dragen", + "Paired reads mapped to different chromosomes (MAPQ>=10)" = "reads_paired_mapped_diff_chrom_mapq10_dragen", + "Reads with MAPQ [40:inf)" = "reads_mapq_40_inf_dragen", + "Reads with MAPQ [30:40)" = "reads_mapq_30_40_dragen", + "Reads with MAPQ [20:30)" = "reads_mapq_20_30_dragen", + "Reads with MAPQ [10:20)" = "reads_mapq_10_20_dragen", + "Reads with MAPQ [ 0:10)" = "reads_mapq_0_10_dragen", + "Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped_dragen", + "Reads with indel R1" = "reads_indel_r1_dragen", + "Reads with indel R2" = "reads_indel_r2_dragen", + "Total bases" = "bases_tot_dragen", + "Total bases R1" = "bases_tot_r1_dragen", + "Total bases R2" = "bases_tot_r2_dragen", + "Mapped bases" = "bases_mapped_dragen", + "Mapped bases R1" = "bases_mapped_r1_dragen", + "Mapped bases R2" = "bases_mapped_r2_dragen", + "Soft-clipped bases" = "bases_softclip_dragen", + "Soft-clipped bases R1" = "bases_softclip_r1_dragen", + "Soft-clipped bases R2" = "bases_softclip_r2_dragen", + "Hard-clipped bases" = "bases_hardclip_dragen", + "Hard-clipped bases R1" = "bases_hardclip_r1_dragen", + "Hard-clipped bases R2" = "bases_hardclip_r2_dragen", + "Mismatched bases R1" = "bases_mismatched_r1_dragen", + "Mismatched bases R2" = "bases_mismatched_r2_dragen", + "Mismatched bases R1 (excl. indels)" = "bases_mismatched_r1_noindels_dragen", + "Mismatched bases R2 (excl. indels)" = "bases_mismatched_r2_noindels_dragen", + "Q30 bases" = "bases_q30_dragen", + "Q30 bases R1" = "bases_q30_r1_dragen", + "Q30 bases R2" = "bases_q30_r2_dragen", + "Q30 bases (excl. dups & clipped bases)" = "bases_q30_nodups_noclipped_dragen", + "Total alignments" = "alignments_tot_dragen", + "Secondary alignments" = "alignments_secondary_dragen", + "Supplementary (chimeric) alignments" = "alignments_chimeric_dragen", + "Estimated read length" = "read_len_dragen", + "Bases in reference genome" = "bases_in_ref_genome_dragen", + "Bases in target bed [% of genome]" = "bases_in_target_bed_genome_pct_dragen", + "Insert length: mean" = "insert_len_mean_dragen", + "Insert length: median" = "insert_len_median_dragen", + "Insert length: standard deviation" = "insert_len_std_dev_dragen", + "Provided sex chromosome ploidy" = "ploidy_sex_chrom_provided_dragen", + "Estimated sample contamination" = "contamination_est_dragen", + "Estimated sample contamination standard error" = "contamination_stderr_est_dragen", + "DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec_dragen", + "Total reads in RG" = "reads_tot_rg_dragen", + "Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl_dragen", + "Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl_dragen", + "Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl_dragen", + "Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl_dragen", + "Reads mapping to multiple locations" = "reads_map_multiloc_dragen", + "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj_dragen", + "Reads with splice junction" = "reads_splicejunc_dragen", + "Average sequenced coverage over genome" = "cov_avg_seq_over_genome_dragen", + "Filtered rRNA reads" = "reads_rrna_filtered_dragen" + ) + raw <- readr::read_lines(x) + assertthat::assert_that(grepl("MAPPING/ALIGNING", raw[1])) + # split by RG and non-RG + # tidy + d <- raw |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + names = c("category", "RG", "var", "count", "pct"), + delim = ",", too_few = "align_start" + ) |> + dplyr::mutate( + count = dplyr::na_if(.data$count, "NA"), + count = as.numeric(.data$count), + pct = as.numeric(.data$pct), + var = dplyr::recode(.data$var, !!!abbrev_nm), + RG = dplyr::if_else(.data$RG == "", "Total", .data$RG) + ) |> + dplyr::select("RG", "var", "count", "pct") + # pivot + d |> + tidyr::pivot_longer(c("count", "pct")) |> + dplyr::mutate( + name = dplyr::if_else(.data$name == "count", "", "_pct"), + var = glue("{.data$var}{.data$name}") + ) |> + dplyr::select("RG", "var", "value") |> + dplyr::filter(!is.na(.data$value)) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") +} + #' Read DRAGEN Coverage Metrics #' #' Reads the `coverage_metrics.csv` file generated by DRAGEN. #' @param x Path to file. #' -#' @return Tibble with data in list column. +#' @return Tibble with metrics. #' @export dragen_coverage_metrics_read <- function(x) { # all rows except 'Aligned bases' and 'Aligned reads' refer to the region diff --git a/R/tso_dragen.R b/R/tso_dragen.R index e37bd53..80d940d 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -15,9 +15,9 @@ #' ) #' d1 <- Wf_dragen$new(path = p, prefix = prefix) #' d1$list_files(max_files = 100) -#' t1$list_files_filter_relevant(max_files = 300) -#' d <- t1$download_files(max_files = 100, dryrun = F) -#' d_tidy <- t1$tidy_files(d) +#' d1$list_files_filter_relevant(max_files = 300) +#' d <- d1$download_files(max_files = 100, dryrun = F) +#' d_tidy <- d1$tidy_files(d) #' d_write <- t1$write( #' d_tidy, #' outdir = file.path(p, "dracarys_tidy"), @@ -44,37 +44,37 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\-replay\\.json$"), "replay", glue("{pref}\\.cnv_metrics.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.exon_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.exon_fine_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", + glue("{pref}\\.exon_coverage_metrics\\.csv$"), "coverageMetrics", + glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "coverageMetrics", + glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "coverageMetrics", + glue("{pref}\\.wgs_coverage_metrics\\.csv$"), "coverageMetrics", + glue("{pref}\\.exon_fine_hist\\.csv$"), "fineHist", + glue("{pref}\\.target_bed_fine_hist\\.csv$"), "fineHist", + glue("{pref}\\.tmb_fine_hist\\.csv$"), "fineHist", + glue("{pref}\\.wgs_fine_hist\\.csv$"), "fineHist", glue("{pref}\\.exon_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.exon_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.fragment_length_hist\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", glue("{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.gvcf_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.mapping_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", glue("{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", glue("{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.target_bed_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.target_bed_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.time_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.tmb_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.tmb_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.wgs_contig_mean_cov\\.csv$"), "contigMeanCov", - glue("{pref}\\.wgs_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.wgs_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY" + glue("{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY" ) regexes <- reg1 |> dplyr::mutate( @@ -121,6 +121,7 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. #' @param keep_alt Keep ALT contigs. read_contigMeanCov = function(x, keep_alt = FALSE) { + subprefix <- dragen_subprefix(x, "_contig_mean_cov") dat <- readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> dplyr::filter( if (!keep_alt) { @@ -129,17 +130,19 @@ Wf_dragen <- R6::R6Class( TRUE } ) - tibble::tibble(name = "contigmeancov", data = list(dat[])) + tibble::tibble(name = glue("contigmeancov_{subprefix}"), data = list(dat[])) }, #' @description Read `coverage_metrics.csv` file. #' @param x Path to file. read_coverageMetrics = function(x) { + subprefix <- dragen_subprefix(x, "_coverage_metrics") dat <- dragen_coverage_metrics_read(x) - tibble::tibble(name = "covmetrics", data = list(dat)) + tibble::tibble(name = glue("covmetrics_{subprefix}"), data = list(dat)) }, #' @description Read `fine_hist.csv` file. #' @param x Path to file. read_fineHist = function(x) { + subprefix <- dragen_subprefix(x, "_fine_hist") d <- readr::read_csv(x, col_types = "cd") assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) # there's a max Depth of 2000+, so convert to numeric for easier plotting @@ -149,7 +152,7 @@ Wf_dragen <- R6::R6Class( Depth = as.integer(.data$Depth) ) |> dplyr::select(depth = "Depth", n_loci = "Overall") - tibble::tibble(name = "finehist", data = list(dat)) + tibble::tibble(name = glue("finehist_{subprefix}"), data = list(dat)) }, #' @description Read `fragment_length_hist.csv` file. #' @param x Path to file. @@ -166,6 +169,12 @@ Wf_dragen <- R6::R6Class( ) |> dplyr::select("fragmentLength", "count") tibble::tibble(name = "fraglen", data = list(dat)) + }, + #' @description Read `mapping_metrics.csv` file. + #' @param x Path to file. + read_mappingMetrics = function(x) { + dat <- dragen_mapping_metrics_read(x) + tibble::tibble(name = "mapmetrics", data = list(dat)) } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 3402faf..e023ca1 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -18,9 +18,9 @@ p <- file.path( ) d1 <- Wf_dragen$new(path = p, prefix = prefix) d1$list_files(max_files = 100) -t1$list_files_filter_relevant(max_files = 300) -d <- t1$download_files(max_files = 100, dryrun = F) -d_tidy <- t1$tidy_files(d) +d1$list_files_filter_relevant(max_files = 300) +d <- d1$download_files(max_files = 100, dryrun = F) +d_tidy <- d1$tidy_files(d) d_write <- t1$write( d_tidy, outdir = file.path(p, "dracarys_tidy"), @@ -49,6 +49,7 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_coverageMetrics}{\code{Wf_dragen$read_coverageMetrics()}} \item \href{#method-Wf_dragen-read_fineHist}{\code{Wf_dragen$read_fineHist()}} \item \href{#method-Wf_dragen-read_fragmentLengthHist}{\code{Wf_dragen$read_fragmentLengthHist()}} +\item \href{#method-Wf_dragen-read_mappingMetrics}{\code{Wf_dragen$read_mappingMetrics()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -179,6 +180,23 @@ Read \code{fragment_length_hist.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_fragmentLengthHist(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_mappingMetrics}{}}} +\subsection{Method \code{read_mappingMetrics()}}{ +Read \code{mapping_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_mappingMetrics(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dragen_coverage_metrics_read.Rd b/man/dragen_coverage_metrics_read.Rd index a81e906..96fd3ef 100644 --- a/man/dragen_coverage_metrics_read.Rd +++ b/man/dragen_coverage_metrics_read.Rd @@ -10,7 +10,7 @@ dragen_coverage_metrics_read(x) \item{x}{Path to file.} } \value{ -Tibble with data in list column. +Tibble with metrics. } \description{ Reads the \code{coverage_metrics.csv} file generated by DRAGEN. diff --git a/man/dragen_mapping_metrics_read.Rd b/man/dragen_mapping_metrics_read.Rd new file mode 100644 index 0000000..223f0a7 --- /dev/null +++ b/man/dragen_mapping_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_mapping_metrics_read} +\alias{dragen_mapping_metrics_read} +\title{Read DRAGEN Mapping Metrics} +\usage{ +dragen_mapping_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{mapping_metrics.csv} file output from DRAGEN. +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index 0054e64..1bc8cea 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L879", { +test_that("Function time_metrics_process() @ L1004", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From 44be1a2b2cf1c6472f444cb8a3633a823f3191c7 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Wed, 16 Oct 2024 11:16:16 +1100 Subject: [PATCH 06/17] dragen: add hist cumsum parser --- R/tso_dragen.R | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 80d940d..4785123 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -55,14 +55,10 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.target_bed_fine_hist\\.csv$"), "fineHist", glue("{pref}\\.tmb_fine_hist\\.csv$"), "fineHist", glue("{pref}\\.wgs_fine_hist\\.csv$"), "fineHist", - glue("{pref}\\.exon_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.exon_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.exon_hist\\.csv$"), "hist", + glue("{pref}\\.target_bed_hist\\.csv$"), "hist", + glue("{pref}\\.tmb_hist\\.csv$"), "hist", + glue("{pref}\\.wgs_hist\\.csv$"), "hist", glue("{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", glue("{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", @@ -175,6 +171,26 @@ Wf_dragen <- R6::R6Class( read_mappingMetrics = function(x) { dat <- dragen_mapping_metrics_read(x) tibble::tibble(name = "mapmetrics", data = list(dat)) + }, + #' @description Read `hist.csv` file. + read_hist = function(x) { + subprefix <- dragen_subprefix(x, "_hist") + d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") + dat <- d |> + dplyr::mutate( + var = sub("PCT of bases in .* with coverage ", "", .data$var), + var = gsub("\\[|\\]|\\(|\\)", "", .data$var), + var = gsub("x", "", .data$var), + var = gsub("inf", "Inf", .data$var) + ) |> + tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> + dplyr::mutate( + start = as.numeric(.data$start), + end = as.numeric(.data$end), + pct = round(.data$pct, 2), + cumsum = cumsum(.data$pct) + ) + tibble::tibble(name = glue("hist_{subprefix}"), data = list(dat)) } ) # end public ) # end Wf_dragen From e5ffa3b0a7e62f2054fe3539cd4c5ee86f1ef99c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 17 Oct 2024 00:51:55 +1100 Subject: [PATCH 07/17] dragen: parse vc/time metrics --- R/dragen.R | 93 +++++++++++++++++++ R/tso_dragen.R | 37 +++++++- man/Wf_dragen.Rd | 54 +++++++++++ .../test-roxytest-testexamples-dragen.R | 2 +- 4 files changed, 180 insertions(+), 6 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index 082ca1a..9f66022 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -9,6 +9,99 @@ dragen_subprefix <- function(x, suffix) { sub(suffix, "", s2) } +dragen_vc_metrics_read <- function(x) { + abbrev_nm1 <- tibble::tribble( + ~raw, ~clean, ~region, + "Total", "var_tot_dragen", FALSE, + "Biallelic", "var_biallelic_dragen", FALSE, + "Multiallelic", "var_multiallelic_dragen", FALSE, + "SNPs", "var_snp_dragen", FALSE, + "Insertions (Hom)", "var_ins_hom_dragen", FALSE, + "Insertions (Het)", "var_ins_het_dragen", FALSE, + "Deletions (Hom)", "var_del_hom_dragen", FALSE, + "Deletions (Het)", "var_del_het_dragen", FALSE, + "Indels (Het)", "var_indel_het_dragen", FALSE, + "Chr X number of SNPs over ", "var_snp_x_over_", TRUE, + "Chr Y number of SNPs over ", "var_snp_y_over_", TRUE, + "(Chr X SNPs)/(chr Y SNPs) ratio over ", "var_x_over_y_snp_ratio_over_", TRUE, + "SNP Transitions", "var_snp_transitions_dragen", FALSE, + "SNP Transversions", "var_snp_transversions_dragen", FALSE, + "Ti/Tv ratio", "var_ti_tv_ratio_dragen", FALSE, + "Heterozygous", "var_heterozygous_dragen", FALSE, + "Homozygous", "var_homozygous_dragen", FALSE, + "Het/Hom ratio", "var_het_hom_ratio_dragen", FALSE, + "In dbSNP", "var_in_dbsnp_dragen", FALSE, + "Not in dbSNP", "var_nin_dbsnp_dragen", FALSE, + "Percent Callability", "callability_pct_dragen", FALSE, + "Percent Autosome Callability", "callability_auto_pct_dragen", FALSE, + "Number of samples", "sample_num_dragen", FALSE, + "Reads Processed", "reads_processed_dragen", FALSE, + "Child Sample", "sample_child_dragen", FALSE + ) + raw <- readr::read_lines(x) + assertthat::assert_that(grepl("VARIANT CALLER", raw[1])) + # first detect if this is genome or target region + res <- raw |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + names = c("category", "sample", "var", "count", "pct"), + delim = ",", too_few = "align_start" + ) + reg1 <- NULL + str1 <- NULL + tmp <- res |> + dplyr::filter(grepl("Chr X number of SNPs over ", .data$var)) |> + dplyr::slice_head(n = 1) |> + dplyr::pull("var") + assertthat::assert_that(length(tmp) == 1) + if (grepl("genome", tmp)) { + str1 <- "genome" + reg1 <- "genome" + } else if (grepl("QC coverage region", tmp)) { + str1 <- "QC coverage region" + reg1 <- "qccovreg" + } else if (grepl("target region", tmp)) { + str1 <- "target region" + reg1 <- "targetreg" + } else { + cli::cli_abort("Cannot determine the varcall region from: {x}") + } + abbrev_nm <- abbrev_nm1 |> + dplyr::mutate( + raw = ifelse(.data$region, glue("{.data$raw}{str1}"), .data$raw), + clean = ifelse(.data$region, glue("{.data$clean}{reg1}_dragen"), .data$clean) + ) |> + dplyr::select("raw", "clean") |> + tibble::deframe() + + d <- res |> + dplyr::mutate( + var = dplyr::recode(.data$var, !!!abbrev_nm), + count = dplyr::na_if(.data$count, "NA"), + count = as.numeric(.data$count), + pct = round(as.numeric(.data$pct), 2), + category = dplyr::case_when( + grepl("SUMMARY", .data$category) ~ "summary", + grepl("PREFILTER", .data$category) ~ "prefilter", + grepl("POSTFILTER", .data$category) ~ "postfilter", + TRUE ~ "unknown" + ) + ) |> + dplyr::filter(.data$category != "summary") |> + dplyr::select("category", "sample", "var", "count", "pct") + # pivot + d |> + tidyr::pivot_longer(c("count", "pct")) |> + dplyr::mutate( + name = dplyr::if_else(.data$name == "count", "", "_pct"), + var = glue("{.data$var}{.data$name}") + ) |> + dplyr::select("category", "sample", "var", "value") |> + dplyr::filter(!is.na(.data$value)) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") +} + #' Read DRAGEN Mapping Metrics #' #' Reads the `mapping_metrics.csv` file output from DRAGEN. diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 4785123..22d5768 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -62,15 +62,15 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", glue("{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.gvcf_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.mapping_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.gvcf_metrics\\.csv$"), "vcMetrics", + glue("{pref}\\.mapping_metrics\\.csv$"), "mappingMetrics", glue("{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", glue("{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", glue("{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.time_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.time_metrics\\.csv$"), "timeMetrics", glue("{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY" + glue("{pref}\\.vc_metrics\\.csv$"), "vcMetrics" ) regexes <- reg1 |> dplyr::mutate( @@ -172,7 +172,8 @@ Wf_dragen <- R6::R6Class( dat <- dragen_mapping_metrics_read(x) tibble::tibble(name = "mapmetrics", data = list(dat)) }, - #' @description Read `hist.csv` file. + #' @description Read `hist.csv` (not `fine_hist.csv`!) file. + #' @param x Path to file. read_hist = function(x) { subprefix <- dragen_subprefix(x, "_hist") d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") @@ -191,6 +192,32 @@ Wf_dragen <- R6::R6Class( cumsum = cumsum(.data$pct) ) tibble::tibble(name = glue("hist_{subprefix}"), data = list(dat)) + }, + #' @description Read `time_metrics.csv` file. + #' @param x Path to file. + read_timeMetrics = function(x) { + cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") + ct <- readr::cols(.default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d") + d <- readr::read_csv(x, col_names = cn, col_types = ct) + assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) + assertthat::assert_that(inherits(d$time_hrs, "hms")) + dat <- d |> + dplyr::mutate( + Step = tools::toTitleCase(sub("Time ", "", .data$Step)), + Step = gsub(" |/", "", .data$Step), + Time = substr(.data$time_hrs, 1, 5) + ) |> + dplyr::select("Step", "Time") |> + tidyr::pivot_wider(names_from = "Step", values_from = "Time") |> + dplyr::relocate("TotalRuntime") + tibble::tibble(name = "timemetrics", data = list(dat)) + }, + #' @description Read `vc_metrics.csv`/`gvcf_metrics.csv` file. + #' @param x Path to file. + read_vcMetrics = function(x) { + subprefix <- dragen_subprefix(x, "_metrics") + dat <- dragen_vc_metrics_read(x) + tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index e023ca1..57ff416 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -50,6 +50,9 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_fineHist}{\code{Wf_dragen$read_fineHist()}} \item \href{#method-Wf_dragen-read_fragmentLengthHist}{\code{Wf_dragen$read_fragmentLengthHist()}} \item \href{#method-Wf_dragen-read_mappingMetrics}{\code{Wf_dragen$read_mappingMetrics()}} +\item \href{#method-Wf_dragen-read_hist}{\code{Wf_dragen$read_hist()}} +\item \href{#method-Wf_dragen-read_timeMetrics}{\code{Wf_dragen$read_timeMetrics()}} +\item \href{#method-Wf_dragen-read_vcMetrics}{\code{Wf_dragen$read_vcMetrics()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -197,6 +200,57 @@ Read \code{mapping_metrics.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_mappingMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_hist}{}}} +\subsection{Method \code{read_hist()}}{ +Read \code{hist.csv} (not \code{fine_hist.csv}!) file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_hist(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_timeMetrics}{}}} +\subsection{Method \code{read_timeMetrics()}}{ +Read \code{time_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_timeMetrics(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_vcMetrics}{}}} +\subsection{Method \code{read_vcMetrics()}}{ +Read \code{vc_metrics.csv}/\code{gvcf_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_vcMetrics(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index 1bc8cea..e220df4 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L1004", { +test_that("Function time_metrics_process() @ L1097", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From eb27203547345c70deb2a50c29a6b10a8c438761 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 17 Oct 2024 01:30:58 +1100 Subject: [PATCH 08/17] dragen: parse trimmer/sv metrics --- NAMESPACE | 3 + R/dragen.R | 96 +++++++++++++++++++ R/tso_dragen.R | 16 +++- man/Wf_dragen.Rd | 36 +++++++ man/dragen_sv_metrics_read.Rd | 17 ++++ man/dragen_trimmer_metrics_read.Rd | 17 ++++ man/dragen_vc_metrics_read.Rd | 17 ++++ .../test-roxytest-testexamples-dragen.R | 2 +- 8 files changed, 201 insertions(+), 3 deletions(-) create mode 100644 man/dragen_sv_metrics_read.Rd create mode 100644 man/dragen_trimmer_metrics_read.Rd create mode 100644 man/dragen_vc_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 5c6f744..dcd7516 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,9 @@ export(dr_output_format_valid) export(dr_s3_download) export(dragen_coverage_metrics_read) export(dragen_mapping_metrics_read) +export(dragen_sv_metrics_read) +export(dragen_trimmer_metrics_read) +export(dragen_vc_metrics_read) export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) export(empty_tbl) diff --git a/R/dragen.R b/R/dragen.R index 9f66022..21ca6d7 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -9,6 +9,102 @@ dragen_subprefix <- function(x, suffix) { sub(suffix, "", s2) } +#' Read DRAGEN SV Metrics +#' +#' Reads the `sv_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_sv_metrics_read <- function(x) { + d <- readr::read_lines(x) + assertthat::assert_that(grepl("SV SUMMARY", d[1])) + abbrev_nm <- c( + "Number of deletions (PASS)" = "del", + "Number of insertions (PASS)" = "ins", + "Number of duplications (PASS)" = "dup", + "Number of breakend pairs (PASS)" = "bnd" + ) + d |> + tibble::as_tibble_col(column_name = "value") |> + dplyr::filter(!grepl("Total number of structural variants", .data$value)) |> + tidyr::separate_wider_delim( + "value", + names = c("svsum", "sample", "var", "count", "pct"), delim = ",", + too_few = "align_start" + ) |> + dplyr::mutate( + count = as.numeric(.data$count), + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + dplyr::select("var", "count") |> + tidyr::pivot_wider(names_from = "var", values_from = "count") +} + +#' Read DRAGEN Trimmer Metrics +#' +#' Reads the `trimmer_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_trimmer_metrics_read <- function(x) { + d <- readr::read_lines(x) + assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1])) + abbrev_nm <- c( + "Total input reads" = "reads_tot_input_dragen", + "Total input bases" = "bases_tot_dragen", + "Total input bases R1" = "bases_r1_dragen", + "Total input bases R2" = "bases_r2_dragen", + "Average input read length" = "read_len_avg_dragen", + "Total trimmed reads" = "reads_trimmed_tot_dragen", + "Total trimmed bases" = "bases_trimmed_tot_dragen", + "Average bases trimmed per read" = "bases_trimmed_avg_per_read_dragen", + "Average bases trimmed per trimmed read" = "bases_trimmed_avg_per_trimmedread_dragen", + "Remaining poly-G K-mers R1 3prime" = "polygkmers3r1_remaining_dragen", + "Remaining poly-G K-mers R2 3prime" = "polygkmers3r2_remaining_dragen", + "Poly-G soft trimmed reads unfiltered R1 3prime" = "polyg_soft_trimmed_reads_unfilt_3r1_dragen", + "Poly-G soft trimmed reads unfiltered R2 3prime" = "polyg_soft_trimmed_reads_unfilt_3r2_dragen", + "Poly-G soft trimmed reads filtered R1 3prime" = "polyg_soft_trimmed_reads_filt_3r1_dragen", + "Poly-G soft trimmed reads filtered R2 3prime" = "polyg_soft_trimmed_reads_filt_3r2_dragen", + "Poly-G soft trimmed bases unfiltered R1 3prime" = "polyg_soft_trimmed_bases_unfilt_3r1_dragen", + "Poly-G soft trimmed bases unfiltered R2 3prime" = "polyg_soft_trimmed_bases_unfilt_3r2_dragen", + "Poly-G soft trimmed bases filtered R1 3prime" = "polyg_soft_trimmed_bases_filt_3r1_dragen", + "Poly-G soft trimmed bases filtered R2 3prime" = "polyg_soft_trimmed_bases_filt_3r2_dragen", + "Total filtered reads" = "reads_tot_filt_dragen", + "Reads filtered for minimum read length R1" = "reads_filt_minreadlenr1_dragen", + "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2_dragen" + ) + + d |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim("value", names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start") |> + dplyr::mutate( + count = as.numeric(.data$count), + pct = round(as.numeric(.data$pct), 2), + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + dplyr::select("var", "count", "pct") |> + tidyr::pivot_longer(c("count", "pct")) |> + dplyr::filter(!is.na(.data$value)) |> + dplyr::mutate( + name = dplyr::if_else(.data$name == "count", "", "_pct"), + var = glue("{.data$var}{.data$name}") + ) |> + dplyr::select("var", "value") |> + tidyr::pivot_wider(names_from = "var", values_from = "value") +} + +#' Read DRAGEN VariantCall Metrics +#' +#' Reads the `vc_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export dragen_vc_metrics_read <- function(x) { abbrev_nm1 <- tibble::tribble( ~raw, ~clean, ~region, diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 22d5768..8964779 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -66,9 +66,9 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.mapping_metrics\\.csv$"), "mappingMetrics", glue("{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", glue("{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", - glue("{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.sv_metrics\\.csv$"), "svMetrics", glue("{pref}\\.time_metrics\\.csv$"), "timeMetrics", - glue("{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.trimmer_metrics\\.csv$"), "trimmerMetrics", glue("{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.vc_metrics\\.csv$"), "vcMetrics" ) @@ -218,6 +218,18 @@ Wf_dragen <- R6::R6Class( subprefix <- dragen_subprefix(x, "_metrics") dat <- dragen_vc_metrics_read(x) tibble::tibble(name = glue("vcmetrics_{subprefix}"), data = list(dat[])) + }, + #' @description Read `trimmer_metrics.csv` file. + #' @param x Path to file. + read_trimmerMetrics = function(x) { + dat <- dragen_trimmer_metrics_read(x) + tibble::tibble(name = "trimmermetrics", data = list(dat[])) + }, + #' @description Read `sv_metrics.csv` file. + #' @param x Path to file. + read_svMetrics = function(x) { + dat <- dragen_sv_metrics_read(x) + tibble::tibble(name = "svmetrics", data = list(dat[])) } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 57ff416..5745e87 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -53,6 +53,8 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_hist}{\code{Wf_dragen$read_hist()}} \item \href{#method-Wf_dragen-read_timeMetrics}{\code{Wf_dragen$read_timeMetrics()}} \item \href{#method-Wf_dragen-read_vcMetrics}{\code{Wf_dragen$read_vcMetrics()}} +\item \href{#method-Wf_dragen-read_trimmerMetrics}{\code{Wf_dragen$read_trimmerMetrics()}} +\item \href{#method-Wf_dragen-read_svMetrics}{\code{Wf_dragen$read_svMetrics()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -251,6 +253,40 @@ Read \code{vc_metrics.csv}/\code{gvcf_metrics.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_vcMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_trimmerMetrics}{}}} +\subsection{Method \code{read_trimmerMetrics()}}{ +Read \code{trimmer_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_trimmerMetrics(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_svMetrics}{}}} +\subsection{Method \code{read_svMetrics()}}{ +Read \code{sv_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_svMetrics(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dragen_sv_metrics_read.Rd b/man/dragen_sv_metrics_read.Rd new file mode 100644 index 0000000..50577e0 --- /dev/null +++ b/man/dragen_sv_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_sv_metrics_read} +\alias{dragen_sv_metrics_read} +\title{Read DRAGEN SV Metrics} +\usage{ +dragen_sv_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{sv_metrics.csv} file output from DRAGEN. +} diff --git a/man/dragen_trimmer_metrics_read.Rd b/man/dragen_trimmer_metrics_read.Rd new file mode 100644 index 0000000..806a86f --- /dev/null +++ b/man/dragen_trimmer_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_trimmer_metrics_read} +\alias{dragen_trimmer_metrics_read} +\title{Read DRAGEN Trimmer Metrics} +\usage{ +dragen_trimmer_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{trimmer_metrics.csv} file output from DRAGEN. +} diff --git a/man/dragen_vc_metrics_read.Rd b/man/dragen_vc_metrics_read.Rd new file mode 100644 index 0000000..9bd4dae --- /dev/null +++ b/man/dragen_vc_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_vc_metrics_read} +\alias{dragen_vc_metrics_read} +\title{Read DRAGEN VariantCall Metrics} +\usage{ +dragen_vc_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{vc_metrics.csv} file output from DRAGEN. +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index e220df4..52fdda4 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L1097", { +test_that("Function time_metrics_process() @ L1193", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From 84c3c2c8703cace31d0bab638466752c38c68bd4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 17 Oct 2024 10:08:48 +1100 Subject: [PATCH 09/17] dragen: parse fastqc/cnv metrics --- NAMESPACE | 2 + R/dragen.R | 80 ++++++++++++++++--- R/dragen_fastqc.R | 24 +++++- R/tso_dragen.R | 16 +++- man/Wf_dragen.Rd | 36 +++++++++ man/dragen_cnv_metrics_read.Rd | 17 ++++ man/dragen_fastqc_metrics_read.Rd | 17 ++++ .../test-roxytest-testexamples-dragen.R | 2 +- 8 files changed, 179 insertions(+), 15 deletions(-) create mode 100644 man/dragen_cnv_metrics_read.Rd create mode 100644 man/dragen_fastqc_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index dcd7516..a496196 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,7 +36,9 @@ export(dr_func_eval) export(dr_gds_download) export(dr_output_format_valid) export(dr_s3_download) +export(dragen_cnv_metrics_read) export(dragen_coverage_metrics_read) +export(dragen_fastqc_metrics_read) export(dragen_mapping_metrics_read) export(dragen_sv_metrics_read) export(dragen_trimmer_metrics_read) diff --git a/R/dragen.R b/R/dragen.R index 21ca6d7..bde01f9 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -1,12 +1,63 @@ -dragen_subprefix <- function(x, suffix) { - # L2401290.exon_contig_mean_cov.csv -> exon - # L2401290.target_bed_contig_mean_cov.csv -> target_bed - # L2401290.tmb_contig_mean_cov.csv -> tmb - # L2401290.wgs_contig_mean_cov.csv -> wgs - bname <- basename(x) - s1 <- tools::file_path_sans_ext(bname) - s2 <- sub(".*\\.(.*)", "\\1", s1) - sub(suffix, "", s2) +#' Read DRAGEN CNV Metrics +#' +#' Reads the `cnv_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_cnv_metrics_read <- function(x) { + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("SEX GENOTYPER", d0[1])) + abbrev_nm <- c( + "Bases in reference genome" = "bases_in_ref_genome_dragen", + "Average alignment coverage over genome" = "cov_alignment_avg_over_genome_dragen", + "Number of alignment records" = "n_alignment_records", + "Number of filtered records (total)" = "n_filtered_records_tot", + "Number of filtered records (duplicates)" = "n_filtered_records_dup", + "Number of filtered records (MAPQ)" = "n_filtered_records_mapq", + "Number of filtered records (unmapped)" = "n_filtered_records_unmap", + "Coverage MAD" = "coverage_mad", + "Gene Scaled MAD" = "gene_scaled_mad", + "Median Bin Count" = "median_bin_count", + "Number of target intervals" = "n_target_intervals", + "Number of normal samples" = "n_samp_norm", + "Number of segments" = "n_seg", + "Number of amplifications" = "n_amp", + "Number of deletions" = "n_del", + "Number of passing amplifications" = "n_amp_pass", + "Number of passing deletions" = "n_del_pass" + ) + + d1 <- d0 |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + names = c("category", "extra", "var", "count", "pct"), + delim = ",", too_few = "align_start" + ) + sexgt <- d1 |> + dplyr::filter(.data$category == "SEX GENOTYPER") |> + dplyr::select(sexgt = "count", sexgt_pct = "pct") + + d2 <- d1 |> + dplyr::filter(!.data$category == "SEX GENOTYPER") |> + dplyr::mutate( + count = as.numeric(.data$count), + pct = round(as.numeric(.data$pct), 2), + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) |> + dplyr::select("var", "count", "pct") |> + tidyr::pivot_longer(c("count", "pct")) |> + dplyr::filter(!is.na(.data$value)) |> + dplyr::mutate( + name = dplyr::if_else(.data$name == "count", "", "_pct"), + var = glue("{.data$var}{.data$name}") + ) |> + dplyr::select("var", "value") |> + tidyr::pivot_wider(names_from = "var", values_from = "value") + res <- dplyr::bind_cols(sexgt, d2) + return(res) } #' Read DRAGEN SV Metrics @@ -1514,3 +1565,14 @@ WgsHistFile <- R6::R6Class( } ) ) + +dragen_subprefix <- function(x, suffix) { + # L2401290.exon_contig_mean_cov.csv -> exon + # L2401290.target_bed_contig_mean_cov.csv -> target_bed + # L2401290.tmb_contig_mean_cov.csv -> tmb + # L2401290.wgs_contig_mean_cov.csv -> wgs + bname <- basename(x) + s1 <- tools::file_path_sans_ext(bname) + s2 <- sub(".*\\.(.*)", "\\1", s1) + sub(suffix, "", s2) +} diff --git a/R/dragen_fastqc.R b/R/dragen_fastqc.R index c272e20..3615cd6 100644 --- a/R/dragen_fastqc.R +++ b/R/dragen_fastqc.R @@ -22,7 +22,7 @@ FastqcMetricsFile <- R6::R6Class( #' @return tibble. TODO. read = function() { x <- self$path - res <- read_fastqc_metrics(x) + res <- dragen_fastqc_metrics_read(x) }, #' @description #' Writes a tidy version of the `fastqc_metrics.csv` file output @@ -53,7 +53,15 @@ FastqcMetricsFile <- R6::R6Class( ) ) -read_fastqc_metrics <- function(x) { +#' DRAGEN FASTQC Metrics +#' +#' Read DRAGEN `fastqc_metrics.csv` file. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_fastqc_metrics_read <- function(x) { # 'SEQUENCE POSITIONS' has an extra field for 'Total Sequence Starts' which # can be filtered out since that can be computed by the rest of that section. raw <- readr::read_lines(x) |> @@ -70,6 +78,15 @@ read_fastqc_metrics <- function(x) { value = as.numeric(.data$value) ) + # 1 POSITIONAL BASE CONTENT + # 2 POSITIONAL BASE MEAN QUALITY + # 3 POSITIONAL QUALITY + # 4 READ GC CONTENT + # 5 READ GC CONTENT QUALITY + # 6 READ LENGTHS + # 7 READ MEAN QUALITY + # 8 SEQUENCE POSITIONS + # there are binned pos e.g. "149-150" # the value is an accumulation, so divide by number of grains pos_base_cont <- d |> @@ -166,5 +183,6 @@ read_fastqc_metrics <- function(x) { read_lengths = read_len, read_mean_quality = read_mean_qual, sequence_positions = seq_pos - ) + ) |> + tibble::enframe(name = "fastqc_name", value = "fastqc_value") } diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 8964779..7030263 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -42,7 +42,7 @@ Wf_dragen <- R6::R6Class( reg1 <- tibble::tribble( ~regex, ~fun, glue("{pref}\\-replay\\.json$"), "replay", - glue("{pref}\\.cnv_metrics.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.cnv_metrics.csv$"), "cnvMetrics", glue("{pref}\\.exon_contig_mean_cov\\.csv$"), "contigMeanCov", glue("{pref}\\.target_bed_contig_mean_cov\\.csv$"), "contigMeanCov", glue("{pref}\\.tmb_contig_mean_cov\\.csv$"), "contigMeanCov", @@ -59,7 +59,7 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.target_bed_hist\\.csv$"), "hist", glue("{pref}\\.tmb_hist\\.csv$"), "hist", glue("{pref}\\.wgs_hist\\.csv$"), "hist", - glue("{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.fastqc_metrics\\.csv$"), "fastqcMetrics", glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", glue("{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", glue("{pref}\\.gvcf_metrics\\.csv$"), "vcMetrics", @@ -230,6 +230,18 @@ Wf_dragen <- R6::R6Class( read_svMetrics = function(x) { dat <- dragen_sv_metrics_read(x) tibble::tibble(name = "svmetrics", data = list(dat[])) + }, + #' @description Read `cnv_metrics.csv` file. + #' @param x Path to file. + read_cnvMetrics = function(x) { + dat <- dragen_cnv_metrics_read(x) + tibble::tibble(name = "cnvmetrics", data = list(dat[])) + }, + #' @description Read `fastqc_metrics.csv` file. + #' @param x Path to file. + read_fastqcMetrics = function(x) { + dat <- dragen_fastqc_metrics_read(x) + tibble::tibble(name = "fastqcmetrics", data = list(dat[])) } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 5745e87..6e6a763 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -55,6 +55,8 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_vcMetrics}{\code{Wf_dragen$read_vcMetrics()}} \item \href{#method-Wf_dragen-read_trimmerMetrics}{\code{Wf_dragen$read_trimmerMetrics()}} \item \href{#method-Wf_dragen-read_svMetrics}{\code{Wf_dragen$read_svMetrics()}} +\item \href{#method-Wf_dragen-read_cnvMetrics}{\code{Wf_dragen$read_cnvMetrics()}} +\item \href{#method-Wf_dragen-read_fastqcMetrics}{\code{Wf_dragen$read_fastqcMetrics()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -287,6 +289,40 @@ Read \code{sv_metrics.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_svMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_cnvMetrics}{}}} +\subsection{Method \code{read_cnvMetrics()}}{ +Read \code{cnv_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_cnvMetrics(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_fastqcMetrics}{}}} +\subsection{Method \code{read_fastqcMetrics()}}{ +Read \code{fastqc_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_fastqcMetrics(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dragen_cnv_metrics_read.Rd b/man/dragen_cnv_metrics_read.Rd new file mode 100644 index 0000000..c6e7281 --- /dev/null +++ b/man/dragen_cnv_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_cnv_metrics_read} +\alias{dragen_cnv_metrics_read} +\title{Read DRAGEN CNV Metrics} +\usage{ +dragen_cnv_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{cnv_metrics.csv} file output from DRAGEN. +} diff --git a/man/dragen_fastqc_metrics_read.Rd b/man/dragen_fastqc_metrics_read.Rd new file mode 100644 index 0000000..018a75d --- /dev/null +++ b/man/dragen_fastqc_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen_fastqc.R +\name{dragen_fastqc_metrics_read} +\alias{dragen_fastqc_metrics_read} +\title{DRAGEN FASTQC Metrics} +\usage{ +dragen_fastqc_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Read DRAGEN \code{fastqc_metrics.csv} file. +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index 52fdda4..73866fd 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L1193", { +test_that("Function time_metrics_process() @ L1244", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From c69d26fd6717a2927fc15396841591dd39e6d3d0 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 17 Oct 2024 17:23:39 +1100 Subject: [PATCH 10/17] dragen: parse gc metrics --- NAMESPACE | 1 + R/dragen.R | 70 ++++++++++++++++++- R/tso_dragen.R | 8 ++- man/Wf_dragen.Rd | 18 +++++ man/dragen_gc_metrics_read.Rd | 17 +++++ .../test-roxytest-testexamples-dragen.R | 2 +- 6 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 man/dragen_gc_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index a496196..4a1f49b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,6 +39,7 @@ export(dr_s3_download) export(dragen_cnv_metrics_read) export(dragen_coverage_metrics_read) export(dragen_fastqc_metrics_read) +export(dragen_gc_metrics_read) export(dragen_mapping_metrics_read) export(dragen_sv_metrics_read) export(dragen_trimmer_metrics_read) diff --git a/R/dragen.R b/R/dragen.R index bde01f9..fbebdc6 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -1,3 +1,72 @@ +#' Read DRAGEN GC Metrics +#' +#' Reads the `gc_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_gc_metrics_read <- function(x) { + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("GC BIAS DETAILS", d0[1])) + d1 <- d0 |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + names = c("category", "dummy1", "name", "value", "fraction"), + delim = ",", too_few = "align_start" + ) + abbrev_nm <- c( + "Window size" = "window_size", + "Number of valid windows" = "n_windows_valid", + "Number of discarded windows" = "n_windows_discard", + "Average reference GC" = "gc_avg_reference", + "Mean global coverage" = "cov_avg_global", + "Normalized coverage at GCs 0-19" = "cov_norm_0to19", + "Normalized coverage at GCs 20-39" = "cov_norm_20to39", + "Normalized coverage at GCs 40-59" = "cov_norm_40to59", + "Normalized coverage at GCs 60-79" = "cov_norm_60to79", + "Normalized coverage at GCs 80-100" = "cov_norm_80to100", + "AT Dropout" = "dropout_at", + "GC Dropout" = "dropout_gc" + ) + # GC METRICS SUMMARY + summary <- d1 |> + dplyr::filter(.data$category == "GC METRICS SUMMARY") |> + dplyr::select("name", "value") |> + dplyr::mutate( + value = as.numeric(.data$value), + name = dplyr::recode(.data$name, !!!abbrev_nm) + ) |> + tidyr::pivot_wider(names_from = "name", values_from = "value") + + # GC BIAS DETAILS + details <- d1 |> + dplyr::filter(.data$category == "GC BIAS DETAILS") |> + dplyr::mutate( + gc = sub(".* at GC (.*)", "\\1", .data$name), + name = sub("(.*) at GC .*", "\\1", .data$name), + name = tolower(.data$name), + name = sub(" ", "", .data$name), + value = as.numeric(.data$value), + fraction = as.numeric(.data$fraction) + ) + details_wind <- details |> + dplyr::filter(.data$name == "windows") |> + dplyr::select("gc", "value") + + details_cov <- details |> + dplyr::filter(.data$name == "normalizedcoverage") |> + dplyr::select("gc", "value") + + list( + gcmetrics_summary = summary, + gcmetrics_windows = details_wind, + gcmetrics_coverage = details_cov + ) |> + tibble::enframe(name = "name", value = "data") +} + #' Read DRAGEN CNV Metrics #' #' Reads the `cnv_metrics.csv` file output from DRAGEN. @@ -28,7 +97,6 @@ dragen_cnv_metrics_read <- function(x) { "Number of passing amplifications" = "n_amp_pass", "Number of passing deletions" = "n_del_pass" ) - d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 7030263..03d8845 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -61,7 +61,7 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.wgs_hist\\.csv$"), "hist", glue("{pref}\\.fastqc_metrics\\.csv$"), "fastqcMetrics", glue("{pref}\\.fragment_length_hist\\.csv$"), "fragmentLengthHist", - glue("{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.gc_metrics\\.csv$"), "gcMetrics", glue("{pref}\\.gvcf_metrics\\.csv$"), "vcMetrics", glue("{pref}\\.mapping_metrics\\.csv$"), "mappingMetrics", glue("{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", @@ -242,6 +242,12 @@ Wf_dragen <- R6::R6Class( read_fastqcMetrics = function(x) { dat <- dragen_fastqc_metrics_read(x) tibble::tibble(name = "fastqcmetrics", data = list(dat[])) + }, + #' @description Read `gc_metrics.csv` file. + #' @param x Path to file. + read_gcMetrics = function(x) { + dat <- dragen_gc_metrics_read(x) + dat } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 6e6a763..04b3d0e 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -57,6 +57,7 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_svMetrics}{\code{Wf_dragen$read_svMetrics()}} \item \href{#method-Wf_dragen-read_cnvMetrics}{\code{Wf_dragen$read_cnvMetrics()}} \item \href{#method-Wf_dragen-read_fastqcMetrics}{\code{Wf_dragen$read_fastqcMetrics()}} +\item \href{#method-Wf_dragen-read_gcMetrics}{\code{Wf_dragen$read_gcMetrics()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -323,6 +324,23 @@ Read \code{fastqc_metrics.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_fastqcMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_gcMetrics}{}}} +\subsection{Method \code{read_gcMetrics()}}{ +Read \code{gc_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_gcMetrics(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dragen_gc_metrics_read.Rd b/man/dragen_gc_metrics_read.Rd new file mode 100644 index 0000000..fc53959 --- /dev/null +++ b/man/dragen_gc_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_gc_metrics_read} +\alias{dragen_gc_metrics_read} +\title{Read DRAGEN GC Metrics} +\usage{ +dragen_gc_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{gc_metrics.csv} file output from DRAGEN. +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index 73866fd..6d25116 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L1244", { +test_that("Function time_metrics_process() @ L1312", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From 32d1138830e812e67790e028b270ad811e6821b8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 00:26:14 +1100 Subject: [PATCH 11/17] dragen: parse umi metrics --- NAMESPACE | 1 + R/dragen.R | 655 ++++++++++-------- R/tso_dragen.R | 8 +- man/Wf_dragen.Rd | 18 + man/dragen_umi_metrics_read.Rd | 17 + .../test-roxytest-testexamples-dragen.R | 2 +- 6 files changed, 415 insertions(+), 286 deletions(-) create mode 100644 man/dragen_umi_metrics_read.Rd diff --git a/NAMESPACE b/NAMESPACE index 4a1f49b..8350259 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ export(dragen_gc_metrics_read) export(dragen_mapping_metrics_read) export(dragen_sv_metrics_read) export(dragen_trimmer_metrics_read) +export(dragen_umi_metrics_read) export(dragen_vc_metrics_read) export(dtw_Wf_tso_ctdna_tumor_only) export(dtw_Wf_tso_ctdna_tumor_only_v2) diff --git a/R/dragen.R b/R/dragen.R index fbebdc6..d3587b6 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -1,3 +1,90 @@ +#' Read DRAGEN UMI Metrics +#' +#' Reads the `umi_metrics.csv` file output from DRAGEN. +#' +#' @param x Path to file. +#' +#' @return Tibble with metrics. +#' @export +dragen_umi_metrics_read <- function(x) { + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("UMI STATISTICS", d0[1])) + abbrev_nm <- tibble::tribble( + ~raw, ~clean, ~target, + "number of reads", "reads_tot", TRUE, + "number of reads with valid or correctable umis", "reads_umi_valid_correctable", TRUE, + "number of reads in discarded families", "reads_discarded_families", TRUE, + "reads filtered out", "reads_filtered_out", FALSE, + "reads with all-g umis filtered out", "reads_filtered_out_all_g_umis", FALSE, + "reads with uncorrectable umis", "reads_uncorrectable_umis", FALSE, + "total number of families", "families_tot", FALSE, + "families contextually corrected", "families_contextually_corrected", FALSE, + "families shifted", "families_shifted", FALSE, + "families discarded", "families_discarded_tot", TRUE, + "families discarded by min-support-reads", "families_discarded_minsupportreads", TRUE, + "families discarded by duplex/simplex", "families_discarded_duplexsimplex", TRUE, + "families with ambiguous correction", "families_ambiguous_correction", TRUE, + "duplex families", "duplex_families", TRUE, + "consensus pairs emitted", "consensus_pairs_emitted", FALSE, + "mean family depth", "avg_family_depth", TRUE, + "number of collapsible regions", "collapsible_regions_tot", FALSE, + "min collapsible region size (num reads)", "collapsible_region_size_min", FALSE, + "max collapsible region size (num reads)", "collapsible_region_size_max", FALSE, + "mean collapsible region size (num reads)", "collapsible_region_size_mean", FALSE, + "collapsible region size standard deviation", "collapsible_region_size_sd", FALSE, + "histogram of num supporting fragments", "histo_num_supporting_fragments", TRUE, + "histogram of unique umis per fragment position", "histo_unique_umis", FALSE + ) + abbrev_nm_target <- abbrev_nm |> + dplyr::filter(.data$target) |> + dplyr::mutate( + raw = as.character(glue("on target {.data$raw}")), + clean = as.character(glue("{.data$clean}_ontarget")) + ) + abbrev_nm <- abbrev_nm |> + dplyr::bind_rows(abbrev_nm_target) |> + dplyr::select("raw", "clean") |> + tibble::deframe() + d1 <- d0 |> + tibble::as_tibble_col(column_name = "value") |> + tidyr::separate_wider_delim( + "value", + names = c("category", "dummy1", "var", "count", "pct"), + delim = ",", too_few = "align_start" + ) |> + dplyr::mutate( + var = tolower(.data$var), + var = dplyr::recode(.data$var, !!!abbrev_nm) + ) + hist <- d1 |> + dplyr::filter(grepl("histo", .data$var)) |> + dplyr::select(name = "var", "count") |> + dplyr::mutate( + count = gsub("\\{|\\}", "", .data$count), + count = strsplit(.data$count, "\\|") + ) |> + tidyr::unnest(count) |> + dplyr::mutate(count = as.numeric(.data$count)) |> + tidyr::nest(.by = "name") + d2 <- d1 |> + dplyr::filter(!grepl("histo", .data$var)) |> + dplyr::select("var", "count", "pct") |> + tidyr::pivot_longer(c("count", "pct"), names_to = "name", values_to = "value") |> + dplyr::mutate( + value = as.numeric(.data$value), + name = dplyr::if_else(.data$name == "count", "", "_pct"), + var = glue("{.data$var}{.data$name}") + ) |> + dplyr::select("var", "value") |> + dplyr::filter(!is.na(.data$value)) |> + tidyr::pivot_wider(names_from = "var", values_from = "value") + res <- list(metrics = d2) |> + tibble::enframe(name = "name", value = "data") |> + dplyr::bind_rows(hist) |> + dplyr::mutate(name = glue("umi_{.data$name}")) + res +} + #' Read DRAGEN GC Metrics #' #' Reads the `gc_metrics.csv` file output from DRAGEN. @@ -79,8 +166,8 @@ dragen_cnv_metrics_read <- function(x) { d0 <- readr::read_lines(x) assertthat::assert_that(grepl("SEX GENOTYPER", d0[1])) abbrev_nm <- c( - "Bases in reference genome" = "bases_in_ref_genome_dragen", - "Average alignment coverage over genome" = "cov_alignment_avg_over_genome_dragen", + "Bases in reference genome" = "bases_in_ref_genome", + "Average alignment coverage over genome" = "cov_alignment_avg_over_genome", "Number of alignment records" = "n_alignment_records", "Number of filtered records (total)" = "n_filtered_records_tot", "Number of filtered records (duplicates)" = "n_filtered_records_dup", @@ -173,28 +260,28 @@ dragen_trimmer_metrics_read <- function(x) { d <- readr::read_lines(x) assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1])) abbrev_nm <- c( - "Total input reads" = "reads_tot_input_dragen", - "Total input bases" = "bases_tot_dragen", - "Total input bases R1" = "bases_r1_dragen", - "Total input bases R2" = "bases_r2_dragen", - "Average input read length" = "read_len_avg_dragen", - "Total trimmed reads" = "reads_trimmed_tot_dragen", - "Total trimmed bases" = "bases_trimmed_tot_dragen", - "Average bases trimmed per read" = "bases_trimmed_avg_per_read_dragen", - "Average bases trimmed per trimmed read" = "bases_trimmed_avg_per_trimmedread_dragen", - "Remaining poly-G K-mers R1 3prime" = "polygkmers3r1_remaining_dragen", - "Remaining poly-G K-mers R2 3prime" = "polygkmers3r2_remaining_dragen", - "Poly-G soft trimmed reads unfiltered R1 3prime" = "polyg_soft_trimmed_reads_unfilt_3r1_dragen", - "Poly-G soft trimmed reads unfiltered R2 3prime" = "polyg_soft_trimmed_reads_unfilt_3r2_dragen", - "Poly-G soft trimmed reads filtered R1 3prime" = "polyg_soft_trimmed_reads_filt_3r1_dragen", - "Poly-G soft trimmed reads filtered R2 3prime" = "polyg_soft_trimmed_reads_filt_3r2_dragen", - "Poly-G soft trimmed bases unfiltered R1 3prime" = "polyg_soft_trimmed_bases_unfilt_3r1_dragen", - "Poly-G soft trimmed bases unfiltered R2 3prime" = "polyg_soft_trimmed_bases_unfilt_3r2_dragen", - "Poly-G soft trimmed bases filtered R1 3prime" = "polyg_soft_trimmed_bases_filt_3r1_dragen", - "Poly-G soft trimmed bases filtered R2 3prime" = "polyg_soft_trimmed_bases_filt_3r2_dragen", - "Total filtered reads" = "reads_tot_filt_dragen", - "Reads filtered for minimum read length R1" = "reads_filt_minreadlenr1_dragen", - "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2_dragen" + "Total input reads" = "reads_tot_input", + "Total input bases" = "bases_tot", + "Total input bases R1" = "bases_r1", + "Total input bases R2" = "bases_r2", + "Average input read length" = "read_len_avg", + "Total trimmed reads" = "reads_trimmed_tot", + "Total trimmed bases" = "bases_trimmed_tot", + "Average bases trimmed per read" = "bases_trimmed_avg_per_read", + "Average bases trimmed per trimmed read" = "bases_trimmed_avg_per_trimmedread", + "Remaining poly-G K-mers R1 3prime" = "polygkmers3r1_remaining", + "Remaining poly-G K-mers R2 3prime" = "polygkmers3r2_remaining", + "Poly-G soft trimmed reads unfiltered R1 3prime" = "polyg_soft_trimmed_reads_unfilt_3r1", + "Poly-G soft trimmed reads unfiltered R2 3prime" = "polyg_soft_trimmed_reads_unfilt_3r2", + "Poly-G soft trimmed reads filtered R1 3prime" = "polyg_soft_trimmed_reads_filt_3r1", + "Poly-G soft trimmed reads filtered R2 3prime" = "polyg_soft_trimmed_reads_filt_3r2", + "Poly-G soft trimmed bases unfiltered R1 3prime" = "polyg_soft_trimmed_bases_unfilt_3r1", + "Poly-G soft trimmed bases unfiltered R2 3prime" = "polyg_soft_trimmed_bases_unfilt_3r2", + "Poly-G soft trimmed bases filtered R1 3prime" = "polyg_soft_trimmed_bases_filt_3r1", + "Poly-G soft trimmed bases filtered R2 3prime" = "polyg_soft_trimmed_bases_filt_3r2", + "Total filtered reads" = "reads_tot_filt", + "Reads filtered for minimum read length R1" = "reads_filt_minreadlenr1", + "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2" ) d |> @@ -227,31 +314,31 @@ dragen_trimmer_metrics_read <- function(x) { dragen_vc_metrics_read <- function(x) { abbrev_nm1 <- tibble::tribble( ~raw, ~clean, ~region, - "Total", "var_tot_dragen", FALSE, - "Biallelic", "var_biallelic_dragen", FALSE, - "Multiallelic", "var_multiallelic_dragen", FALSE, - "SNPs", "var_snp_dragen", FALSE, - "Insertions (Hom)", "var_ins_hom_dragen", FALSE, - "Insertions (Het)", "var_ins_het_dragen", FALSE, - "Deletions (Hom)", "var_del_hom_dragen", FALSE, - "Deletions (Het)", "var_del_het_dragen", FALSE, - "Indels (Het)", "var_indel_het_dragen", FALSE, + "Total", "var_tot", FALSE, + "Biallelic", "var_biallelic", FALSE, + "Multiallelic", "var_multiallelic", FALSE, + "SNPs", "var_snp", FALSE, + "Insertions (Hom)", "var_ins_hom", FALSE, + "Insertions (Het)", "var_ins_het", FALSE, + "Deletions (Hom)", "var_del_hom", FALSE, + "Deletions (Het)", "var_del_het", FALSE, + "Indels (Het)", "var_indel_het", FALSE, "Chr X number of SNPs over ", "var_snp_x_over_", TRUE, "Chr Y number of SNPs over ", "var_snp_y_over_", TRUE, "(Chr X SNPs)/(chr Y SNPs) ratio over ", "var_x_over_y_snp_ratio_over_", TRUE, - "SNP Transitions", "var_snp_transitions_dragen", FALSE, - "SNP Transversions", "var_snp_transversions_dragen", FALSE, - "Ti/Tv ratio", "var_ti_tv_ratio_dragen", FALSE, - "Heterozygous", "var_heterozygous_dragen", FALSE, - "Homozygous", "var_homozygous_dragen", FALSE, - "Het/Hom ratio", "var_het_hom_ratio_dragen", FALSE, - "In dbSNP", "var_in_dbsnp_dragen", FALSE, - "Not in dbSNP", "var_nin_dbsnp_dragen", FALSE, - "Percent Callability", "callability_pct_dragen", FALSE, - "Percent Autosome Callability", "callability_auto_pct_dragen", FALSE, - "Number of samples", "sample_num_dragen", FALSE, - "Reads Processed", "reads_processed_dragen", FALSE, - "Child Sample", "sample_child_dragen", FALSE + "SNP Transitions", "var_snp_transitions", FALSE, + "SNP Transversions", "var_snp_transversions", FALSE, + "Ti/Tv ratio", "var_ti_tv_ratio", FALSE, + "Heterozygous", "var_heterozygous", FALSE, + "Homozygous", "var_homozygous", FALSE, + "Het/Hom ratio", "var_het_hom_ratio", FALSE, + "In dbSNP", "var_in_dbsnp", FALSE, + "Not in dbSNP", "var_nin_dbsnp", FALSE, + "Percent Callability", "callability_pct", FALSE, + "Percent Autosome Callability", "callability_auto_pct", FALSE, + "Number of samples", "sample_num", FALSE, + "Reads Processed", "reads_processed", FALSE, + "Child Sample", "sample_child", FALSE ) raw <- readr::read_lines(x) assertthat::assert_that(grepl("VARIANT CALLER", raw[1])) @@ -285,7 +372,7 @@ dragen_vc_metrics_read <- function(x) { abbrev_nm <- abbrev_nm1 |> dplyr::mutate( raw = ifelse(.data$region, glue("{.data$raw}{str1}"), .data$raw), - clean = ifelse(.data$region, glue("{.data$clean}{reg1}_dragen"), .data$clean) + clean = ifelse(.data$region, glue("{.data$clean}{reg1}"), .data$clean) ) |> dplyr::select("raw", "clean") |> tibble::deframe() @@ -327,78 +414,78 @@ dragen_vc_metrics_read <- function(x) { #' @export dragen_mapping_metrics_read <- function(x) { abbrev_nm <- c( - "Total input reads" = "reads_tot_input_dragen", - "Number of duplicate marked reads" = "reads_num_dupmarked_dragen", - "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed_dragen", - "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq_dragen", - "Reads with mate sequenced" = "reads_w_mate_seq_dragen", - "Reads without mate sequenced" = "reads_wo_mate_seq_dragen", - "QC-failed reads" = "reads_qcfail_dragen", - "Mapped reads" = "reads_mapped_dragen", - "Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt_dragen", - "Mapped reads R1" = "reads_mapped_r1_dragen", - "Mapped reads R2" = "reads_mapped_r2_dragen", - "Number of unique & mapped reads (excl. duplicate marked reads)" = "reads_num_uniq_mapped_dragen", - "Unmapped reads" = "reads_unmapped_dragen", - "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt_dragen", - "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj_dragen", - "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton_dragen", - "Paired reads (itself & mate mapped)" = "reads_paired_dragen", - "Properly paired reads" = "reads_paired_proper_dragen", - "Not properly paired reads (discordant)" = "reads_discordant_dragen", - "Paired reads mapped to different chromosomes" = "reads_paired_mapped_diff_chrom_dragen", - "Paired reads mapped to different chromosomes (MAPQ>=10)" = "reads_paired_mapped_diff_chrom_mapq10_dragen", - "Reads with MAPQ [40:inf)" = "reads_mapq_40_inf_dragen", - "Reads with MAPQ [30:40)" = "reads_mapq_30_40_dragen", - "Reads with MAPQ [20:30)" = "reads_mapq_20_30_dragen", - "Reads with MAPQ [10:20)" = "reads_mapq_10_20_dragen", - "Reads with MAPQ [ 0:10)" = "reads_mapq_0_10_dragen", - "Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped_dragen", - "Reads with indel R1" = "reads_indel_r1_dragen", - "Reads with indel R2" = "reads_indel_r2_dragen", - "Total bases" = "bases_tot_dragen", - "Total bases R1" = "bases_tot_r1_dragen", - "Total bases R2" = "bases_tot_r2_dragen", - "Mapped bases" = "bases_mapped_dragen", - "Mapped bases R1" = "bases_mapped_r1_dragen", - "Mapped bases R2" = "bases_mapped_r2_dragen", - "Soft-clipped bases" = "bases_softclip_dragen", - "Soft-clipped bases R1" = "bases_softclip_r1_dragen", - "Soft-clipped bases R2" = "bases_softclip_r2_dragen", - "Hard-clipped bases" = "bases_hardclip_dragen", - "Hard-clipped bases R1" = "bases_hardclip_r1_dragen", - "Hard-clipped bases R2" = "bases_hardclip_r2_dragen", - "Mismatched bases R1" = "bases_mismatched_r1_dragen", - "Mismatched bases R2" = "bases_mismatched_r2_dragen", - "Mismatched bases R1 (excl. indels)" = "bases_mismatched_r1_noindels_dragen", - "Mismatched bases R2 (excl. indels)" = "bases_mismatched_r2_noindels_dragen", - "Q30 bases" = "bases_q30_dragen", - "Q30 bases R1" = "bases_q30_r1_dragen", - "Q30 bases R2" = "bases_q30_r2_dragen", - "Q30 bases (excl. dups & clipped bases)" = "bases_q30_nodups_noclipped_dragen", - "Total alignments" = "alignments_tot_dragen", - "Secondary alignments" = "alignments_secondary_dragen", - "Supplementary (chimeric) alignments" = "alignments_chimeric_dragen", - "Estimated read length" = "read_len_dragen", - "Bases in reference genome" = "bases_in_ref_genome_dragen", - "Bases in target bed [% of genome]" = "bases_in_target_bed_genome_pct_dragen", - "Insert length: mean" = "insert_len_mean_dragen", - "Insert length: median" = "insert_len_median_dragen", - "Insert length: standard deviation" = "insert_len_std_dev_dragen", - "Provided sex chromosome ploidy" = "ploidy_sex_chrom_provided_dragen", - "Estimated sample contamination" = "contamination_est_dragen", - "Estimated sample contamination standard error" = "contamination_stderr_est_dragen", - "DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec_dragen", - "Total reads in RG" = "reads_tot_rg_dragen", - "Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl_dragen", - "Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl_dragen", - "Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl_dragen", - "Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl_dragen", - "Reads mapping to multiple locations" = "reads_map_multiloc_dragen", - "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj_dragen", - "Reads with splice junction" = "reads_splicejunc_dragen", - "Average sequenced coverage over genome" = "cov_avg_seq_over_genome_dragen", - "Filtered rRNA reads" = "reads_rrna_filtered_dragen" + "Total input reads" = "reads_tot_input", + "Number of duplicate marked reads" = "reads_num_dupmarked", + "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed", + "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq", + "Reads with mate sequenced" = "reads_w_mate_seq", + "Reads without mate sequenced" = "reads_wo_mate_seq", + "QC-failed reads" = "reads_qcfail", + "Mapped reads" = "reads_mapped", + "Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt", + "Mapped reads R1" = "reads_mapped_r1", + "Mapped reads R2" = "reads_mapped_r2", + "Number of unique & mapped reads (excl. duplicate marked reads)" = "reads_num_uniq_mapped", + "Unmapped reads" = "reads_unmapped", + "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt", + "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj", + "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton", + "Paired reads (itself & mate mapped)" = "reads_paired", + "Properly paired reads" = "reads_paired_proper", + "Not properly paired reads (discordant)" = "reads_discordant", + "Paired reads mapped to different chromosomes" = "reads_paired_mapped_diff_chrom", + "Paired reads mapped to different chromosomes (MAPQ>=10)" = "reads_paired_mapped_diff_chrom_mapq10", + "Reads with MAPQ [40:inf)" = "reads_mapq_40_inf", + "Reads with MAPQ [30:40)" = "reads_mapq_30_40", + "Reads with MAPQ [20:30)" = "reads_mapq_20_30", + "Reads with MAPQ [10:20)" = "reads_mapq_10_20", + "Reads with MAPQ [ 0:10)" = "reads_mapq_0_10", + "Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped", + "Reads with indel R1" = "reads_indel_r1", + "Reads with indel R2" = "reads_indel_r2", + "Total bases" = "bases_tot", + "Total bases R1" = "bases_tot_r1", + "Total bases R2" = "bases_tot_r2", + "Mapped bases" = "bases_mapped", + "Mapped bases R1" = "bases_mapped_r1", + "Mapped bases R2" = "bases_mapped_r2", + "Soft-clipped bases" = "bases_softclip", + "Soft-clipped bases R1" = "bases_softclip_r1", + "Soft-clipped bases R2" = "bases_softclip_r2", + "Hard-clipped bases" = "bases_hardclip", + "Hard-clipped bases R1" = "bases_hardclip_r1", + "Hard-clipped bases R2" = "bases_hardclip_r2", + "Mismatched bases R1" = "bases_mismatched_r1", + "Mismatched bases R2" = "bases_mismatched_r2", + "Mismatched bases R1 (excl. indels)" = "bases_mismatched_r1_noindels", + "Mismatched bases R2 (excl. indels)" = "bases_mismatched_r2_noindels", + "Q30 bases" = "bases_q30", + "Q30 bases R1" = "bases_q30_r1", + "Q30 bases R2" = "bases_q30_r2", + "Q30 bases (excl. dups & clipped bases)" = "bases_q30_nodups_noclipped", + "Total alignments" = "alignments_tot", + "Secondary alignments" = "alignments_secondary", + "Supplementary (chimeric) alignments" = "alignments_chimeric", + "Estimated read length" = "read_len", + "Bases in reference genome" = "bases_in_ref_genome", + "Bases in target bed [% of genome]" = "bases_in_target_bed_genome_pct", + "Insert length: mean" = "insert_len_mean", + "Insert length: median" = "insert_len_median", + "Insert length: standard deviation" = "insert_len_std_dev", + "Provided sex chromosome ploidy" = "ploidy_sex_chrom_provided", + "Estimated sample contamination" = "contamination_est", + "Estimated sample contamination standard error" = "contamination_stderr_est", + "DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec", + "Total reads in RG" = "reads_tot_rg", + "Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl", + "Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl", + "Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl", + "Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl", + "Reads mapping to multiple locations" = "reads_map_multiloc", + "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj", + "Reads with splice junction" = "reads_splicejunc", + "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", + "Filtered rRNA reads" = "reads_rrna_filtered" ) raw <- readr::read_lines(x) assertthat::assert_that(grepl("MAPPING/ALIGNING", raw[1])) @@ -442,8 +529,8 @@ dragen_coverage_metrics_read <- function(x) { # all rows except 'Aligned bases' and 'Aligned reads' refer to the region abbrev_nm <- tibble::tribble( ~raw, ~clean, ~region, - "Aligned bases", "bases_aligned_tot_dragen", FALSE, - "Aligned reads", "reads_aligned_tot_dragen", FALSE, + "Aligned bases", "bases_aligned_tot", FALSE, + "Aligned reads", "reads_aligned_tot", FALSE, "Aligned bases in ", "bases_aligned_", TRUE, "Average alignment coverage over ", "cov_alignment_avg_over_", TRUE, "Uniformity of coverage (PCT > 0.2*mean) over ", "cov_uniformity_pct_gt02mean_", TRUE, @@ -488,7 +575,7 @@ dragen_coverage_metrics_read <- function(x) { abbrev_nm <- abbrev_nm |> dplyr::mutate( raw = ifelse(.data$region, glue("{.data$raw}{str1}"), .data$raw), - clean = ifelse(.data$region, glue("{.data$clean}{reg1}_dragen"), .data$clean) + clean = ifelse(.data$region, glue("{.data$clean}{reg1}"), .data$clean) ) |> dplyr::select("raw", "clean") |> tibble::deframe() @@ -509,7 +596,7 @@ dragen_coverage_metrics_read <- function(x) { var = gsub("inf", "Inf", .data$var) ) |> tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate(var = as.character(glue("cov_pct_{start}_{end}_{reg1}_dragen"))) |> + dplyr::mutate(var = as.character(glue("cov_pct_{start}_{end}_{reg1}"))) |> dplyr::select("var", "value") res <- dplyr::bind_rows(res1, res2) |> dplyr::mutate( @@ -695,19 +782,19 @@ WgsCoverageMetricsFile <- R6::R6Class( #' @return tibble with one row and metrics spread across individual columns. read = function() { abbrev_nm <- c( - "Aligned bases" = "bases_aligned_dragen", - "Aligned bases in genome" = "bases_aligned_in_genome_dragen", - "Average alignment coverage over genome" = "cov_alignment_avg_over_genome_dragen", - "Uniformity of coverage (PCT > 0.2*mean) over genome" = "cov_uniformity_over_genome_pct_gt02mean_dragen", - "Uniformity of coverage (PCT > 0.4*mean) over genome" = "cov_uniformity_over_genome_pct_gt04mean_dragen", - "Average chr X coverage over genome" = "cov_avg_x_over_genome_dragen", - "Average chr Y coverage over genome" = "cov_avg_y_over_genome_dragen", - "Average mitochondrial coverage over genome" = "cov_avg_mt_over_genome_dragen", - "Average autosomal coverage over genome" = "cov_avg_auto_over_genome_dragen", - "Median autosomal coverage over genome" = "cov_median_auto_over_genome_dragen", - "Mean/Median autosomal coverage ratio over genome" = "cov_mean_median_auto_ratio_over_genome_dragen", - "Aligned reads" = "reads_aligned_dragen", - "Aligned reads in genome" = "reads_aligned_in_genome_dragen" + "Aligned bases" = "bases_aligned", + "Aligned bases in genome" = "bases_aligned_in_genome", + "Average alignment coverage over genome" = "cov_alignment_avg_over_genome", + "Uniformity of coverage (PCT > 0.2*mean) over genome" = "cov_uniformity_over_genome_pct_gt02mean", + "Uniformity of coverage (PCT > 0.4*mean) over genome" = "cov_uniformity_over_genome_pct_gt04mean", + "Average chr X coverage over genome" = "cov_avg_x_over_genome", + "Average chr Y coverage over genome" = "cov_avg_y_over_genome", + "Average mitochondrial coverage over genome" = "cov_avg_mt_over_genome", + "Average autosomal coverage over genome" = "cov_avg_auto_over_genome", + "Median autosomal coverage over genome" = "cov_median_auto_over_genome", + "Mean/Median autosomal coverage ratio over genome" = "cov_mean_median_auto_ratio_over_genome", + "Aligned reads" = "reads_aligned", + "Aligned reads in genome" = "reads_aligned_in_genome" ) x <- self$path @@ -736,7 +823,7 @@ WgsCoverageMetricsFile <- R6::R6Class( var = gsub("inf", "Inf", .data$var) ) |> tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate(var = as.character(glue("cov_genome_pct_{start}_{end}_dragen"))) |> + dplyr::mutate(var = as.character(glue("cov_genome_pct_{start}_{end}"))) |> dplyr::select("var", "value") res <- dplyr::bind_rows(res1, res2) |> dplyr::mutate( @@ -973,77 +1060,77 @@ MappingMetricsFile <- R6::R6Class( #' @return tibble with one row of X metrics per read group. read = function() { abbrev_nm <- c( - "Total input reads" = "reads_tot_input_dragen", - "Number of duplicate marked reads" = "reads_num_dupmarked_dragen", - "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq_dragen", - "Reads with mate sequenced" = "reads_w_mate_seq_dragen", - "Reads without mate sequenced" = "reads_wo_mate_seq_dragen", - "QC-failed reads" = "reads_qcfail_dragen", - "Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl_dragen", - "Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl_dragen", - "Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl_dragen", - "Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl_dragen", - "Reads mapping to multiple locations" = "reads_map_multiloc_dragen", - "Hard-clipped bases R1" = "bases_hardclip_r1_dragen", - "Hard-clipped bases R2" = "bases_hardclip_r2_dragen", - "Soft-clipped bases" = "bases_softclip_dragen", - "Hard-clipped bases" = "bases_hardclip_dragen", - "Mapped reads" = "reads_mapped_dragen", - "Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt_dragen", - "Mapped reads R1" = "reads_mapped_r1_dragen", - "Mapped reads R2" = "reads_mapped_r2_dragen", - "Number of unique & mapped reads (excl. duplicate marked reads)" = "reads_num_uniq_mapped_dragen", - "Unmapped reads" = "reads_unmapped_dragen", - "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt_dragen", - "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj_dragen", - "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj_dragen", - "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton_dragen", - "Paired reads (itself & mate mapped)" = "reads_paired_dragen", - "Properly paired reads" = "reads_paired_proper_dragen", - "Not properly paired reads (discordant)" = "reads_discordant_dragen", - "Paired reads mapped to different chromosomes" = "reads_paired_mapped_diff_chrom_dragen", - "Paired reads mapped to different chromosomes (MAPQ>=10)" = "reads_paired_mapped_diff_chrom_mapq10_dragen", - "Reads with MAPQ [40:inf)" = "reads_mapq_40_inf_dragen", - "Reads with MAPQ [30:40)" = "reads_mapq_30_40_dragen", - "Reads with MAPQ [20:30)" = "reads_mapq_20_30_dragen", - "Reads with MAPQ [10:20)" = "reads_mapq_10_20_dragen", - "Reads with MAPQ [ 0:10)" = "reads_mapq_0_10_dragen", - "Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped_dragen", - "Reads with indel R1" = "reads_indel_r1_dragen", - "Reads with indel R2" = "reads_indel_r2_dragen", - "Reads with splice junction" = "reads_splicejunc_dragen", - "Total bases" = "bases_tot_dragen", - "Total bases R1" = "bases_tot_r1_dragen", - "Total bases R2" = "bases_tot_r2_dragen", - "Mapped bases" = "bases_mapped_dragen", - "Mapped bases R1" = "bases_mapped_r1_dragen", - "Mapped bases R2" = "bases_mapped_r2_dragen", - "Soft-clipped bases R1" = "bases_softclip_r1_dragen", - "Soft-clipped bases R2" = "bases_softclip_r2_dragen", - "Mismatched bases R1" = "bases_mismatched_r1_dragen", - "Mismatched bases R2" = "bases_mismatched_r2_dragen", - "Mismatched bases R1 (excl. indels)" = "bases_mismatched_r1_noindels_dragen", - "Mismatched bases R2 (excl. indels)" = "bases_mismatched_r2_noindels_dragen", - "Q30 bases" = "bases_q30_dragen", - "Q30 bases R1" = "bases_q30_r1_dragen", - "Q30 bases R2" = "bases_q30_r2_dragen", - "Q30 bases (excl. dups & clipped bases)" = "bases_q30_nodups_noclipped_dragen", - "Total alignments" = "alignments_tot_dragen", - "Secondary alignments" = "alignments_secondary_dragen", - "Supplementary (chimeric) alignments" = "alignments_chimeric_dragen", - "Estimated read length" = "read_len_dragen", - "Bases in reference genome" = "bases_in_ref_genome_dragen", - "Bases in target bed [% of genome]" = "bases_in_target_bed_genome_pct_dragen", - "Average sequenced coverage over genome" = "cov_avg_seq_over_genome_dragen", - "Insert length: mean" = "insert_len_mean_dragen", - "Insert length: median" = "insert_len_median_dragen", - "Insert length: standard deviation" = "insert_len_std_dev_dragen", - "Provided sex chromosome ploidy" = "ploidy_sex_chrom_provided_dragen", - "Estimated sample contamination" = "contamination_est_dragen", - "DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec_dragen", - "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed_dragen", - "Total reads in RG" = "reads_tot_rg_dragen", - "Filtered rRNA reads" = "reads_rrna_filtered_dragen" + "Total input reads" = "reads_tot_input", + "Number of duplicate marked reads" = "reads_num_dupmarked", + "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq", + "Reads with mate sequenced" = "reads_w_mate_seq", + "Reads without mate sequenced" = "reads_wo_mate_seq", + "QC-failed reads" = "reads_qcfail", + "Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl", + "Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl", + "Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl", + "Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl", + "Reads mapping to multiple locations" = "reads_map_multiloc", + "Hard-clipped bases R1" = "bases_hardclip_r1", + "Hard-clipped bases R2" = "bases_hardclip_r2", + "Soft-clipped bases" = "bases_softclip", + "Hard-clipped bases" = "bases_hardclip", + "Mapped reads" = "reads_mapped", + "Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt", + "Mapped reads R1" = "reads_mapped_r1", + "Mapped reads R2" = "reads_mapped_r2", + "Number of unique & mapped reads (excl. duplicate marked reads)" = "reads_num_uniq_mapped", + "Unmapped reads" = "reads_unmapped", + "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt", + "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj", + "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj", + "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton", + "Paired reads (itself & mate mapped)" = "reads_paired", + "Properly paired reads" = "reads_paired_proper", + "Not properly paired reads (discordant)" = "reads_discordant", + "Paired reads mapped to different chromosomes" = "reads_paired_mapped_diff_chrom", + "Paired reads mapped to different chromosomes (MAPQ>=10)" = "reads_paired_mapped_diff_chrom_mapq10", + "Reads with MAPQ [40:inf)" = "reads_mapq_40_inf", + "Reads with MAPQ [30:40)" = "reads_mapq_30_40", + "Reads with MAPQ [20:30)" = "reads_mapq_20_30", + "Reads with MAPQ [10:20)" = "reads_mapq_10_20", + "Reads with MAPQ [ 0:10)" = "reads_mapq_0_10", + "Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped", + "Reads with indel R1" = "reads_indel_r1", + "Reads with indel R2" = "reads_indel_r2", + "Reads with splice junction" = "reads_splicejunc", + "Total bases" = "bases_tot", + "Total bases R1" = "bases_tot_r1", + "Total bases R2" = "bases_tot_r2", + "Mapped bases" = "bases_mapped", + "Mapped bases R1" = "bases_mapped_r1", + "Mapped bases R2" = "bases_mapped_r2", + "Soft-clipped bases R1" = "bases_softclip_r1", + "Soft-clipped bases R2" = "bases_softclip_r2", + "Mismatched bases R1" = "bases_mismatched_r1", + "Mismatched bases R2" = "bases_mismatched_r2", + "Mismatched bases R1 (excl. indels)" = "bases_mismatched_r1_noindels", + "Mismatched bases R2 (excl. indels)" = "bases_mismatched_r2_noindels", + "Q30 bases" = "bases_q30", + "Q30 bases R1" = "bases_q30_r1", + "Q30 bases R2" = "bases_q30_r2", + "Q30 bases (excl. dups & clipped bases)" = "bases_q30_nodups_noclipped", + "Total alignments" = "alignments_tot", + "Secondary alignments" = "alignments_secondary", + "Supplementary (chimeric) alignments" = "alignments_chimeric", + "Estimated read length" = "read_len", + "Bases in reference genome" = "bases_in_ref_genome", + "Bases in target bed [% of genome]" = "bases_in_target_bed_genome_pct", + "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", + "Insert length: mean" = "insert_len_mean", + "Insert length: median" = "insert_len_median", + "Insert length: standard deviation" = "insert_len_std_dev", + "Provided sex chromosome ploidy" = "ploidy_sex_chrom_provided", + "Estimated sample contamination" = "contamination_est", + "DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec", + "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed", + "Total reads in RG" = "reads_tot_rg", + "Filtered rRNA reads" = "reads_rrna_filtered" ) x <- self$path raw <- readr::read_lines(x) @@ -1119,34 +1206,34 @@ PloidyEstimationMetricsFile <- R6::R6Class( raw <- readr::read_lines(x) assertthat::assert_that(grepl("PLOIDY ESTIMATION", raw[1])) abbrev_nm <- c( - "Autosomal median coverage" = "cov_auto_median_dragen", - "X median coverage" = "cov_x_median_dragen", - "Y median coverage" = "cov_y_median_dragen", - "1 median / Autosomal median" = "cov_1_div_auto_median_dragen", - "2 median / Autosomal median" = "cov_2_div_auto_median_dragen", - "3 median / Autosomal median" = "cov_3_div_auto_median_dragen", - "4 median / Autosomal median" = "cov_4_div_auto_median_dragen", - "5 median / Autosomal median" = "cov_5_div_auto_median_dragen", - "6 median / Autosomal median" = "cov_6_div_auto_median_dragen", - "7 median / Autosomal median" = "cov_7_div_auto_median_dragen", - "8 median / Autosomal median" = "cov_8_div_auto_median_dragen", - "9 median / Autosomal median" = "cov_9_div_auto_median_dragen", - "10 median / Autosomal median" = "cov_10_div_auto_median_dragen", - "11 median / Autosomal median" = "cov_11_div_auto_median_dragen", - "12 median / Autosomal median" = "cov_12_div_auto_median_dragen", - "13 median / Autosomal median" = "cov_13_div_auto_median_dragen", - "14 median / Autosomal median" = "cov_14_div_auto_median_dragen", - "15 median / Autosomal median" = "cov_15_div_auto_median_dragen", - "16 median / Autosomal median" = "cov_16_div_auto_median_dragen", - "17 median / Autosomal median" = "cov_17_div_auto_median_dragen", - "18 median / Autosomal median" = "cov_18_div_auto_median_dragen", - "19 median / Autosomal median" = "cov_19_div_auto_median_dragen", - "20 median / Autosomal median" = "cov_20_div_auto_median_dragen", - "21 median / Autosomal median" = "cov_21_div_auto_median_dragen", - "22 median / Autosomal median" = "cov_22_div_auto_median_dragen", - "X median / Autosomal median" = "cov_x_div_auto_median_dragen", - "Y median / Autosomal median" = "cov_y_div_auto_median_dragen", - "Ploidy estimation" = "ploidy_est_dragen" + "Autosomal median coverage" = "cov_auto_median", + "X median coverage" = "cov_x_median", + "Y median coverage" = "cov_y_median", + "1 median / Autosomal median" = "cov_1_div_auto_median", + "2 median / Autosomal median" = "cov_2_div_auto_median", + "3 median / Autosomal median" = "cov_3_div_auto_median", + "4 median / Autosomal median" = "cov_4_div_auto_median", + "5 median / Autosomal median" = "cov_5_div_auto_median", + "6 median / Autosomal median" = "cov_6_div_auto_median", + "7 median / Autosomal median" = "cov_7_div_auto_median", + "8 median / Autosomal median" = "cov_8_div_auto_median", + "9 median / Autosomal median" = "cov_9_div_auto_median", + "10 median / Autosomal median" = "cov_10_div_auto_median", + "11 median / Autosomal median" = "cov_11_div_auto_median", + "12 median / Autosomal median" = "cov_12_div_auto_median", + "13 median / Autosomal median" = "cov_13_div_auto_median", + "14 median / Autosomal median" = "cov_14_div_auto_median", + "15 median / Autosomal median" = "cov_15_div_auto_median", + "16 median / Autosomal median" = "cov_16_div_auto_median", + "17 median / Autosomal median" = "cov_17_div_auto_median", + "18 median / Autosomal median" = "cov_18_div_auto_median", + "19 median / Autosomal median" = "cov_19_div_auto_median", + "20 median / Autosomal median" = "cov_20_div_auto_median", + "21 median / Autosomal median" = "cov_21_div_auto_median", + "22 median / Autosomal median" = "cov_22_div_auto_median", + "X median / Autosomal median" = "cov_x_div_auto_median", + "Y median / Autosomal median" = "cov_y_div_auto_median", + "Ploidy estimation" = "ploidy_est" ) d <- raw |> @@ -1158,7 +1245,7 @@ PloidyEstimationMetricsFile <- R6::R6Class( ) |> tidyr::pivot_wider(names_from = "var", values_from = "value") # now convert all except 'Ploidy estimation' to numeric - cols1 <- colnames(d)[colnames(d) != "ploidy_est_dragen"] + cols1 <- colnames(d)[colnames(d) != "ploidy_est"] d |> dplyr::mutate(dplyr::across(dplyr::all_of(cols1), as.numeric)) }, @@ -1339,31 +1426,31 @@ VCMetricsFile <- R6::R6Class( #' @return tibble with one row and metrics spread across individual columns. read = function() { abbrev_nm <- c( - "Total" = "var_tot_dragen", - "Biallelic" = "var_biallelic_dragen", - "Multiallelic" = "var_multiallelic_dragen", - "SNPs" = "var_snp_dragen", - "Insertions (Hom)" = "var_ins_hom_dragen", - "Insertions (Het)" = "var_ins_het_dragen", - "Deletions (Hom)" = "var_del_hom_dragen", - "Deletions (Het)" = "var_del_het_dragen", - "Indels (Het)" = "var_indel_het_dragen", - "Chr X number of SNPs over genome" = "var_snp_x_over_genome_dragen", - "Chr Y number of SNPs over genome" = "var_snp_y_over_genome_dragen", - "(Chr X SNPs)/(chr Y SNPs) ratio over genome" = "var_x_over_y_snp_ratio_over_genome_dragen", - "SNP Transitions" = "var_snp_transitions_dragen", - "SNP Transversions" = "var_snp_transversions_dragen", - "Ti/Tv ratio" = "var_ti_tv_ratio_dragen", - "Heterozygous" = "var_heterozygous_dragen", - "Homozygous" = "var_homozygous_dragen", - "Het/Hom ratio" = "var_het_hom_ratio_dragen", - "In dbSNP" = "var_in_dbsnp_dragen", - "Not in dbSNP" = "var_nin_dbsnp_dragen", - "Percent Callability" = "callability_pct_dragen", - "Percent Autosome Callability" = "callability_auto_pct_dragen", - "Number of samples" = "sample_num_dragen", - "Reads Processed" = "reads_processed_dragen", - "Child Sample" = "sample_child_dragen" + "Total" = "var_tot", + "Biallelic" = "var_biallelic", + "Multiallelic" = "var_multiallelic", + "SNPs" = "var_snp", + "Insertions (Hom)" = "var_ins_hom", + "Insertions (Het)" = "var_ins_het", + "Deletions (Hom)" = "var_del_hom", + "Deletions (Het)" = "var_del_het", + "Indels (Het)" = "var_indel_het", + "Chr X number of SNPs over genome" = "var_snp_x_over_genome", + "Chr Y number of SNPs over genome" = "var_snp_y_over_genome", + "(Chr X SNPs)/(chr Y SNPs) ratio over genome" = "var_x_over_y_snp_ratio_over_genome", + "SNP Transitions" = "var_snp_transitions", + "SNP Transversions" = "var_snp_transversions", + "Ti/Tv ratio" = "var_ti_tv_ratio", + "Heterozygous" = "var_heterozygous", + "Homozygous" = "var_homozygous", + "Het/Hom ratio" = "var_het_hom_ratio", + "In dbSNP" = "var_in_dbsnp", + "Not in dbSNP" = "var_nin_dbsnp", + "Percent Callability" = "callability_pct", + "Percent Autosome Callability" = "callability_auto_pct", + "Number of samples" = "sample_num", + "Reads Processed" = "reads_processed", + "Child Sample" = "sample_child" ) x <- self$path raw <- readr::read_lines(x) @@ -1445,28 +1532,28 @@ TrimmerMetricsFile <- R6::R6Class( d <- readr::read_lines(x) assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1])) abbrev_nm <- c( - "Total input reads" = "reads_tot_input_dragen", - "Total input bases" = "bases_tot_dragen", - "Total input bases R1" = "bases_r1_dragen", - "Total input bases R2" = "bases_r2_dragen", - "Average input read length" = "read_len_avg_dragen", - "Total trimmed reads" = "reads_trimmed_tot_dragen", - "Total trimmed bases" = "bases_trimmed_tot_dragen", - "Average bases trimmed per read" = "bases_trimmed_avg_per_read_dragen", - "Average bases trimmed per trimmed read" = "bases_trimmed_avg_per_trimmedread_dragen", - "Remaining poly-G K-mers R1 3prime" = "polygkmers3r1_remaining_dragen", - "Remaining poly-G K-mers R2 3prime" = "polygkmers3r2_remaining_dragen", - "Poly-G soft trimmed reads unfiltered R1 3prime" = "polyg_soft_trimmed_reads_unfilt_3r1_dragen", - "Poly-G soft trimmed reads unfiltered R2 3prime" = "polyg_soft_trimmed_reads_unfilt_3r2_dragen", - "Poly-G soft trimmed reads filtered R1 3prime" = "polyg_soft_trimmed_reads_filt_3r1_dragen", - "Poly-G soft trimmed reads filtered R2 3prime" = "polyg_soft_trimmed_reads_filt_3r2_dragen", - "Poly-G soft trimmed bases unfiltered R1 3prime" = "polyg_soft_trimmed_bases_unfilt_3r1_dragen", - "Poly-G soft trimmed bases unfiltered R2 3prime" = "polyg_soft_trimmed_bases_unfilt_3r2_dragen", - "Poly-G soft trimmed bases filtered R1 3prime" = "polyg_soft_trimmed_bases_filt_3r1_dragen", - "Poly-G soft trimmed bases filtered R2 3prime" = "polyg_soft_trimmed_bases_filt_3r2_dragen", - "Total filtered reads" = "reads_tot_filt_dragen", - "Reads filtered for minimum read length R1" = "reads_filt_minreadlenr1_dragen", - "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2_dragen" + "Total input reads" = "reads_tot_input", + "Total input bases" = "bases_tot", + "Total input bases R1" = "bases_r1", + "Total input bases R2" = "bases_r2", + "Average input read length" = "read_len_avg", + "Total trimmed reads" = "reads_trimmed_tot", + "Total trimmed bases" = "bases_trimmed_tot", + "Average bases trimmed per read" = "bases_trimmed_avg_per_read", + "Average bases trimmed per trimmed read" = "bases_trimmed_avg_per_trimmedread", + "Remaining poly-G K-mers R1 3prime" = "polygkmers3r1_remaining", + "Remaining poly-G K-mers R2 3prime" = "polygkmers3r2_remaining", + "Poly-G soft trimmed reads unfiltered R1 3prime" = "polyg_soft_trimmed_reads_unfilt_3r1", + "Poly-G soft trimmed reads unfiltered R2 3prime" = "polyg_soft_trimmed_reads_unfilt_3r2", + "Poly-G soft trimmed reads filtered R1 3prime" = "polyg_soft_trimmed_reads_filt_3r1", + "Poly-G soft trimmed reads filtered R2 3prime" = "polyg_soft_trimmed_reads_filt_3r2", + "Poly-G soft trimmed bases unfiltered R1 3prime" = "polyg_soft_trimmed_bases_unfilt_3r1", + "Poly-G soft trimmed bases unfiltered R2 3prime" = "polyg_soft_trimmed_bases_unfilt_3r2", + "Poly-G soft trimmed bases filtered R1 3prime" = "polyg_soft_trimmed_bases_filt_3r1", + "Poly-G soft trimmed bases filtered R2 3prime" = "polyg_soft_trimmed_bases_filt_3r2", + "Total filtered reads" = "reads_tot_filt", + "Reads filtered for minimum read length R1" = "reads_filt_minreadlenr1", + "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2" ) d |> diff --git a/R/tso_dragen.R b/R/tso_dragen.R index 03d8845..a7aa883 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -69,7 +69,7 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.sv_metrics\\.csv$"), "svMetrics", glue("{pref}\\.time_metrics\\.csv$"), "timeMetrics", glue("{pref}\\.trimmer_metrics\\.csv$"), "trimmerMetrics", - glue("{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", + glue("{pref}\\.umi_metrics\\.csv$"), "umiMetrics", glue("{pref}\\.vc_metrics\\.csv$"), "vcMetrics" ) regexes <- reg1 |> @@ -248,6 +248,12 @@ Wf_dragen <- R6::R6Class( read_gcMetrics = function(x) { dat <- dragen_gc_metrics_read(x) dat + }, + #' @description Read `umi_metrics.csv` file. + #' @param x Path to file. + read_umiMetrics = function(x) { + dat <- dragen_umi_metrics_read(x) + dat } ) # end public ) # end Wf_dragen diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 04b3d0e..2387a47 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -58,6 +58,7 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_cnvMetrics}{\code{Wf_dragen$read_cnvMetrics()}} \item \href{#method-Wf_dragen-read_fastqcMetrics}{\code{Wf_dragen$read_fastqcMetrics()}} \item \href{#method-Wf_dragen-read_gcMetrics}{\code{Wf_dragen$read_gcMetrics()}} +\item \href{#method-Wf_dragen-read_umiMetrics}{\code{Wf_dragen$read_umiMetrics()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -341,6 +342,23 @@ Read \code{gc_metrics.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_gcMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_umiMetrics}{}}} +\subsection{Method \code{read_umiMetrics()}}{ +Read \code{umi_metrics.csv} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_umiMetrics(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/man/dragen_umi_metrics_read.Rd b/man/dragen_umi_metrics_read.Rd new file mode 100644 index 0000000..ca80a82 --- /dev/null +++ b/man/dragen_umi_metrics_read.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_umi_metrics_read} +\alias{dragen_umi_metrics_read} +\title{Read DRAGEN UMI Metrics} +\usage{ +dragen_umi_metrics_read(x) +} +\arguments{ +\item{x}{Path to file.} +} +\value{ +Tibble with metrics. +} +\description{ +Reads the \code{umi_metrics.csv} file output from DRAGEN. +} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index 6d25116..bbadc08 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L1312", { +test_that("Function time_metrics_process() @ L1398", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From 1e4cdb43186d7d52a35e44775c046016b6ea034c Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 01:23:13 +1100 Subject: [PATCH 12/17] dragen: parse msi metrics --- R/dragen.R | 2 +- R/dragen_fastqc.R | 7 ---- R/tso.R | 9 ++++- R/tso_dragen.R | 17 +++++++-- man/FastqcMetricsFile.Rd | 8 ----- man/Wf_dragen.Rd | 36 +++++++++++++++++++ .../test-roxytest-testexamples-dragen.R | 2 +- 7 files changed, 61 insertions(+), 20 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index d3587b6..42811f9 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -63,7 +63,7 @@ dragen_umi_metrics_read <- function(x) { count = gsub("\\{|\\}", "", .data$count), count = strsplit(.data$count, "\\|") ) |> - tidyr::unnest(count) |> + tidyr::unnest("count") |> dplyr::mutate(count = as.numeric(.data$count)) |> tidyr::nest(.by = "name") d2 <- d1 |> diff --git a/R/dragen_fastqc.R b/R/dragen_fastqc.R index 3615cd6..4390037 100644 --- a/R/dragen_fastqc.R +++ b/R/dragen_fastqc.R @@ -4,13 +4,6 @@ #' Contains methods for reading and displaying contents of #' the `fastqc_metrics.csv` file output from DRAGEN. #' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.fastqc_metrics.csv.gz", package = "dracarys") -#' fq <- FastqcMetricsFile$new(x) -#' d <- fq$read() -#' fq$write(d, out_dir = tempdir(), prefix = "seqc_fq", out_format = "tsv") -#' # fq$plot(d) -#' #' @export FastqcMetricsFile <- R6::R6Class( "FastqcMetricsFile", diff --git a/R/tso.R b/R/tso.R index a302003..7596449 100644 --- a/R/tso.R +++ b/R/tso.R @@ -471,6 +471,13 @@ tso_msi_read <- function(x) { if (j[["PercentageUnstableSites"]] == "NaN") { j[["PercentageUnstableSites"]] <- NA_real_ } + num_cols <- c( + "TotalMicrosatelliteSitesAssessed", "TotalMicrosatelliteSitesUnstable", + "PercentageUnstableSites", "SumDistance", "SumJsd" + ) tibble::as_tibble_row(j) |> - dplyr::mutate(ResultIsValid = as.character(.data$ResultIsValid)) + dplyr::mutate( + dplyr::across(dplyr::any_of(num_cols), as.numeric), + ResultIsValid = as.character(.data$ResultIsValid), + ) } diff --git a/R/tso_dragen.R b/R/tso_dragen.R index a7aa883..d5175a3 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -64,8 +64,8 @@ Wf_dragen <- R6::R6Class( glue("{pref}\\.gc_metrics\\.csv$"), "gcMetrics", glue("{pref}\\.gvcf_metrics\\.csv$"), "vcMetrics", glue("{pref}\\.mapping_metrics\\.csv$"), "mappingMetrics", - glue("{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", - glue("{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", + glue("{pref}\\.microsat_diffs\\.txt$"), "msiDiffs", + glue("{pref}\\.microsat_output\\.json$"), "msi", glue("{pref}\\.sv_metrics\\.csv$"), "svMetrics", glue("{pref}\\.time_metrics\\.csv$"), "timeMetrics", glue("{pref}\\.trimmer_metrics\\.csv$"), "trimmerMetrics", @@ -254,6 +254,19 @@ Wf_dragen <- R6::R6Class( read_umiMetrics = function(x) { dat <- dragen_umi_metrics_read(x) dat + }, + #' @description Read `microsat_output.json` file. + #' @param x Path to file. + read_msi = function(x) { + dat <- tso_msi_read(x) + tibble::tibble(name = "msi", data = list(dat[])) + }, + #' @description Read `microsat_diffs.txt` file. + #' @param x Path to file. + read_msiDiffs = function(x) { + dat <- readr::read_tsv(x, col_types = "cdccddc") |> + dplyr::rename(Chromosome = "#Chromosome") + tibble::tibble(name = "msidiffs", data = list(dat[])) } ) # end public ) # end Wf_dragen diff --git a/man/FastqcMetricsFile.Rd b/man/FastqcMetricsFile.Rd index ec13b16..301bce3 100644 --- a/man/FastqcMetricsFile.Rd +++ b/man/FastqcMetricsFile.Rd @@ -6,14 +6,6 @@ \description{ Contains methods for reading and displaying contents of the \code{fastqc_metrics.csv} file output from DRAGEN. -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.fastqc_metrics.csv.gz", package = "dracarys") -fq <- FastqcMetricsFile$new(x) -d <- fq$read() -fq$write(d, out_dir = tempdir(), prefix = "seqc_fq", out_format = "tsv") -# fq$plot(d) - } \section{Super class}{ \code{\link[dracarys:File]{dracarys::File}} -> \code{FastqcMetricsFile} diff --git a/man/Wf_dragen.Rd b/man/Wf_dragen.Rd index 2387a47..141e855 100644 --- a/man/Wf_dragen.Rd +++ b/man/Wf_dragen.Rd @@ -59,6 +59,8 @@ d_write <- t1$write( \item \href{#method-Wf_dragen-read_fastqcMetrics}{\code{Wf_dragen$read_fastqcMetrics()}} \item \href{#method-Wf_dragen-read_gcMetrics}{\code{Wf_dragen$read_gcMetrics()}} \item \href{#method-Wf_dragen-read_umiMetrics}{\code{Wf_dragen$read_umiMetrics()}} +\item \href{#method-Wf_dragen-read_msi}{\code{Wf_dragen$read_msi()}} +\item \href{#method-Wf_dragen-read_msiDiffs}{\code{Wf_dragen$read_msiDiffs()}} \item \href{#method-Wf_dragen-clone}{\code{Wf_dragen$clone()}} } } @@ -359,6 +361,40 @@ Read \code{umi_metrics.csv} file. \if{html}{\out{
}}\preformatted{Wf_dragen$read_umiMetrics(x)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_msi}{}}} +\subsection{Method \code{read_msi()}}{ +Read \code{microsat_output.json} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_msi(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_dragen-read_msiDiffs}{}}} +\subsection{Method \code{read_msiDiffs()}}{ +Read \code{microsat_diffs.txt} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_dragen$read_msiDiffs(x)}\if{html}{\out{
}} +} + \subsection{Arguments}{ \if{html}{\out{
}} \describe{ diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R index bbadc08..8db3f7e 100644 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ b/tests/testthat/test-roxytest-testexamples-dragen.R @@ -2,7 +2,7 @@ # File R/dragen.R: @testexamples -test_that("Function time_metrics_process() @ L1398", { +test_that("Function time_metrics_process() @ L1399", { p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") x <- TimeMetricsFile$new(p) From 1d68fd1e56ad1644ee2029cba864488e69d995f9 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 01:48:35 +1100 Subject: [PATCH 13/17] dragen: remove per-file R6 classes (man/) --- man/FastqcMetricsFile.Rd | 93 ----------- man/FragmentLengthHistFile.Rd | 131 ---------------- man/MappingMetricsFile.Rd | 106 ------------- man/ReplayFile.Rd | 94 ----------- man/SvMetricsFile.Rd | 94 ----------- man/TimeMetricsFile.Rd | 100 ------------ man/TrimmerMetricsFile.Rd | 100 ------------ man/VCMetricsFile.Rd | 94 ----------- man/WgsContigMeanCovFile.Rd | 146 ------------------ man/WgsCoverageMetricsFile.Rd | 105 ------------- man/WgsFineHistFile.Rd | 136 ---------------- man/WgsHistFile.Rd | 94 ----------- man/dragen_contig_mean_coverage_plot.Rd | 23 +++ ...ist_plot.Rd => dragen_fraglenhist_plot.Rd} | 10 +- man/time_metrics_process.Rd | 30 ---- .../test-roxytest-testexamples-dragen.R | 13 -- 16 files changed, 28 insertions(+), 1341 deletions(-) delete mode 100644 man/FastqcMetricsFile.Rd delete mode 100644 man/FragmentLengthHistFile.Rd delete mode 100644 man/MappingMetricsFile.Rd delete mode 100644 man/ReplayFile.Rd delete mode 100644 man/SvMetricsFile.Rd delete mode 100644 man/TimeMetricsFile.Rd delete mode 100644 man/TrimmerMetricsFile.Rd delete mode 100644 man/VCMetricsFile.Rd delete mode 100644 man/WgsContigMeanCovFile.Rd delete mode 100644 man/WgsCoverageMetricsFile.Rd delete mode 100644 man/WgsFineHistFile.Rd delete mode 100644 man/WgsHistFile.Rd create mode 100644 man/dragen_contig_mean_coverage_plot.Rd rename man/{tso_fraglenhist_plot.Rd => dragen_fraglenhist_plot.Rd} (66%) delete mode 100644 man/time_metrics_process.Rd delete mode 100644 tests/testthat/test-roxytest-testexamples-dragen.R diff --git a/man/FastqcMetricsFile.Rd b/man/FastqcMetricsFile.Rd deleted file mode 100644 index 301bce3..0000000 --- a/man/FastqcMetricsFile.Rd +++ /dev/null @@ -1,93 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen_fastqc.R -\name{FastqcMetricsFile} -\alias{FastqcMetricsFile} -\title{FastqcMetrics R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \code{fastqc_metrics.csv} file output from DRAGEN. -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{FastqcMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-FastqcMetricsFile-read}{\code{FastqcMetricsFile$read()}} -\item \href{#method-FastqcMetricsFile-write}{\code{FastqcMetricsFile$write()}} -\item \href{#method-FastqcMetricsFile-clone}{\code{FastqcMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FastqcMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{fastqc_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FastqcMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble. TODO. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FastqcMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{fastqc_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FastqcMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FastqcMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FastqcMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/FragmentLengthHistFile.Rd b/man/FragmentLengthHistFile.Rd deleted file mode 100644 index 07c9081..0000000 --- a/man/FragmentLengthHistFile.Rd +++ /dev/null @@ -1,131 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{FragmentLengthHistFile} -\alias{FragmentLengthHistFile} -\title{FragmentLengthHistFile R6 Class} -\description{ -Contains methods for reading and plotting contents of -the \code{fragment_length_hist.csv} file output from DRAGEN. -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.fragment_length_hist.csv.gz", package = "dracarys") -fl <- FragmentLengthHistFile$new(x) -d <- fl$read() # or read(fl) -fl$plot(d) # or plot(fl) -fl$write(d |> dplyr::filter(count > 10), out_dir = tempdir(), prefix = "seqc_fl") -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{FragmentLengthHistFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-FragmentLengthHistFile-read}{\code{FragmentLengthHistFile$read()}} -\item \href{#method-FragmentLengthHistFile-write}{\code{FragmentLengthHistFile$write()}} -\item \href{#method-FragmentLengthHistFile-plot}{\code{FragmentLengthHistFile$plot()}} -\item \href{#method-FragmentLengthHistFile-clone}{\code{FragmentLengthHistFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FragmentLengthHistFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{fragment_length_hist.csv} file, which contains the -fragment length distribution for each sample. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FragmentLengthHistFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -A tibble with the following columns: -\itemize{ -\item sample: name of sample -\item fragmentLength: estimated fragment length -\item count: number of reads with estimated fragment length -} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FragmentLengthHistFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{fragment_length_hist.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FragmentLengthHistFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FragmentLengthHistFile-plot}{}}} -\subsection{Method \code{plot()}}{ -Plots the fragment length distributions as given in the -\code{fragment_length_hist.csv} file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FragmentLengthHistFile$plot(d, min_count = 10)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{min_count}}{Minimum read count to be plotted (Default: 10).} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -A ggplot2 plot containing fragment lengths on X axis and read counts -on Y axis for each sample. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-FragmentLengthHistFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{FragmentLengthHistFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/MappingMetricsFile.Rd b/man/MappingMetricsFile.Rd deleted file mode 100644 index 62d5cfb..0000000 --- a/man/MappingMetricsFile.Rd +++ /dev/null @@ -1,106 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{MappingMetricsFile} -\alias{MappingMetricsFile} -\title{MappingMetricsFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \code{mapping_metrics.csv} file output from DRAGEN. -This file contains mapping and aligning metrics, like the metrics computed by -the Samtools Flagstat command. These metrics are available on an aggregate -level (over all input data), and on a per read group level. NOTE: we are -keeping only the read group level metrics (i.e. removing the aggregate data). -Unless explicitly stated, the metrics units are in reads (i.e., not in -terms of pairs or alignments). -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.mapping_metrics.csv.gz", package = "dracarys") -mm <- MappingMetricsFile$new(x) -d <- mm$read() # or read(mm) -mm$write(d, out_dir = tempdir(), prefix = "seqc_mm", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{MappingMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-MappingMetricsFile-read}{\code{MappingMetricsFile$read()}} -\item \href{#method-MappingMetricsFile-write}{\code{MappingMetricsFile$write()}} -\item \href{#method-MappingMetricsFile-clone}{\code{MappingMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-MappingMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{mapping_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{MappingMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row of X metrics per read group. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-MappingMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{mapping_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{MappingMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-MappingMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{MappingMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/ReplayFile.Rd b/man/ReplayFile.Rd deleted file mode 100644 index c2c34eb..0000000 --- a/man/ReplayFile.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{ReplayFile} -\alias{ReplayFile} -\title{ReplayFile R6 Class} -\description{ -Contains methods for reading contents of -the \code{replay.json} file output from DRAGEN, which contains the DRAGEN command -line, parameters and version for the specific run. -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II-replay.json.gz", package = "dracarys") -r <- ReplayFile$new(x) -d <- r$read() # or read(r) -r$write(d, out_dir = tempdir(), prefix = "seqc_replay", out_format = "tsv") -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{ReplayFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-ReplayFile-read}{\code{ReplayFile$read()}} -\item \href{#method-ReplayFile-write}{\code{ReplayFile$write()}} -\item \href{#method-ReplayFile-clone}{\code{ReplayFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ReplayFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{replay.json} file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ReplayFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ReplayFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{replay.json} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ReplayFile$write(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ReplayFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ReplayFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/SvMetricsFile.Rd b/man/SvMetricsFile.Rd deleted file mode 100644 index e5c775d..0000000 --- a/man/SvMetricsFile.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{SvMetricsFile} -\alias{SvMetricsFile} -\title{SvMetricsFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \code{sv_metrics.csv} file output from DRAGEN -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.sv_metrics.csv.gz", package = "dracarys") -sv <- SvMetricsFile$new(x) -d <- sv$read() -sv$write(d, out_dir = tempdir(), prefix = "seqc_sv", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{SvMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-SvMetricsFile-read}{\code{SvMetricsFile$read()}} -\item \href{#method-SvMetricsFile-write}{\code{SvMetricsFile$write()}} -\item \href{#method-SvMetricsFile-clone}{\code{SvMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SvMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{sv_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SvMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SvMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{sv_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SvMetricsFile$write(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SvMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SvMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/TimeMetricsFile.Rd b/man/TimeMetricsFile.Rd deleted file mode 100644 index 43b463d..0000000 --- a/man/TimeMetricsFile.Rd +++ /dev/null @@ -1,100 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{TimeMetricsFile} -\alias{TimeMetricsFile} -\title{TimeMetricsFile R6 Class} -\description{ -Contains methods for reading contents of -the \code{time_metrics.csv} file output from DRAGEN, which contains -a breakdown of the run duration for each DRAGEN process. -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") -tm <- TimeMetricsFile$new(x) -d <- tm$read() # or read(tm) -tm$write(d, out_dir = tempdir(), prefix = "seqc_time", out_format = "tsv") -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{TimeMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-TimeMetricsFile-read}{\code{TimeMetricsFile$read()}} -\item \href{#method-TimeMetricsFile-write}{\code{TimeMetricsFile$write()}} -\item \href{#method-TimeMetricsFile-clone}{\code{TimeMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-TimeMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{time_metrics.csv} file. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TimeMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-TimeMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{time_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TimeMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-TimeMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TimeMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/TrimmerMetricsFile.Rd b/man/TrimmerMetricsFile.Rd deleted file mode 100644 index a3b4fe8..0000000 --- a/man/TrimmerMetricsFile.Rd +++ /dev/null @@ -1,100 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{TrimmerMetricsFile} -\alias{TrimmerMetricsFile} -\title{TrimmerMetricsFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \code{trimmer_metrics.csv} file output from DRAGEN -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.trimmer_metrics.csv.gz", package = "dracarys") -tm <- TrimmerMetricsFile$new(x) -d <- tm$read() -tm$write(d, out_dir = tempdir(), prefix = "seqc_tm", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{TrimmerMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-TrimmerMetricsFile-read}{\code{TrimmerMetricsFile$read()}} -\item \href{#method-TrimmerMetricsFile-write}{\code{TrimmerMetricsFile$write()}} -\item \href{#method-TrimmerMetricsFile-clone}{\code{TrimmerMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-TrimmerMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{trimmer_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TrimmerMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-TrimmerMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{trimmer_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TrimmerMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-TrimmerMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TrimmerMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/VCMetricsFile.Rd b/man/VCMetricsFile.Rd deleted file mode 100644 index 3011160..0000000 --- a/man/VCMetricsFile.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{VCMetricsFile} -\alias{VCMetricsFile} -\title{VCMetricsFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \code{vc_metrics.csv} file output from DRAGEN, which contains variant calling metrics. -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.vc_metrics.csv.gz", package = "dracarys") -vm <- VCMetricsFile$new(x) -d <- vm$read() # or read(vm) -vm$write(d, out_dir = tempdir(), prefix = "seqc_vc", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{VCMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-VCMetricsFile-read}{\code{VCMetricsFile$read()}} -\item \href{#method-VCMetricsFile-write}{\code{VCMetricsFile$write()}} -\item \href{#method-VCMetricsFile-clone}{\code{VCMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-VCMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{vc_metrics.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{VCMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-VCMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{vc_metrics.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{VCMetricsFile$write(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-VCMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{VCMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/WgsContigMeanCovFile.Rd b/man/WgsContigMeanCovFile.Rd deleted file mode 100644 index 49b974b..0000000 --- a/man/WgsContigMeanCovFile.Rd +++ /dev/null @@ -1,146 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{WgsContigMeanCovFile} -\alias{WgsContigMeanCovFile} -\title{WgsContigMeanCovFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \verb{wgs_contig_mean_cov_.csv} file output from DRAGEN. -This file contains the estimated coverage for all contigs, and an autosomal -estimated coverage. -} -\examples{ -x1 <- system.file("extdata/wgs/SEQC-II.wgs_contig_mean_cov_normal.csv.gz", package = "dracarys") -x2 <- system.file("extdata/wgs/SEQC-II.wgs_contig_mean_cov_tumor.csv.gz", package = "dracarys") -cc1 <- WgsContigMeanCovFile$new(x1) -cc2 <- WgsContigMeanCovFile$new(x2) -d1 <- cc1$read() -d2 <- cc2$read() -cc1$write(d1, out_dir = tempdir(), prefix = "seqc_n", out_format = "tsv") -cc2$write(d2, out_dir = tempdir(), prefix = "seqc_t", out_format = "tsv") - -cc1$plot(d1) -cc2$plot(d2) - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{WgsContigMeanCovFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-WgsContigMeanCovFile-read}{\code{WgsContigMeanCovFile$read()}} -\item \href{#method-WgsContigMeanCovFile-write}{\code{WgsContigMeanCovFile$write()}} -\item \href{#method-WgsContigMeanCovFile-plot}{\code{WgsContigMeanCovFile$plot()}} -\item \href{#method-WgsContigMeanCovFile-clone}{\code{WgsContigMeanCovFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsContigMeanCovFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \verb{wgs_contig_mean_cov_.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsContigMeanCovFile$read(keep_alt = TRUE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{keep_alt}}{Keep the ALT + Mito chromosomes?} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -tibble with the following columns: -\itemize{ -\item label: file label. -\item chrom: contig name. -\item n_bases: number of bases aligned to contig (excludes bases from -duplicate marked reads, reads with MAPQ=0, and clipped bases). -\item coverage: col2 / contig length -} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsContigMeanCovFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \verb{wgs_contig_mean_cov_.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsContigMeanCovFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsContigMeanCovFile-plot}{}}} -\subsection{Method \code{plot()}}{ -Plots the \verb{wgs_contig_mean_cov_.csv} files. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsContigMeanCovFile$plot(d, top_alt_n = 15)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{top_alt_n}}{Number of top covered alt contigs to plot per phenotype.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -A ggplot2 object with chromosomes on X axis, and coverage on Y axis. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsContigMeanCovFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsContigMeanCovFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/WgsCoverageMetricsFile.Rd b/man/WgsCoverageMetricsFile.Rd deleted file mode 100644 index 322ce83..0000000 --- a/man/WgsCoverageMetricsFile.Rd +++ /dev/null @@ -1,105 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{WgsCoverageMetricsFile} -\alias{WgsCoverageMetricsFile} -\title{WgsCoverageMetricsFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \verb{wgs_coverage_metrics_.csv} file output from DRAGEN. -This file contains read depth of coverage metrics. -} -\examples{ -x1 <- system.file("extdata/wgs/SEQC-II.wgs_coverage_metrics_normal.csv.gz", package = "dracarys") -x2 <- system.file("extdata/wgs/SEQC-II.wgs_coverage_metrics_tumor.csv.gz", package = "dracarys") -cm1 <- WgsCoverageMetricsFile$new(x1) -cm2 <- WgsCoverageMetricsFile$new(x2) -d1 <- read(cm1) -d2 <- read(cm2) -cm1$write(d1, out_dir = tempdir(), prefix = "seqc_n", out_format = "tsv") -cm2$write(d2, out_dir = tempdir(), prefix = "seqc_t", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{WgsCoverageMetricsFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-WgsCoverageMetricsFile-read}{\code{WgsCoverageMetricsFile$read()}} -\item \href{#method-WgsCoverageMetricsFile-write}{\code{WgsCoverageMetricsFile$write()}} -\item \href{#method-WgsCoverageMetricsFile-clone}{\code{WgsCoverageMetricsFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsCoverageMetricsFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \verb{wgs_coverage_metrics_.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsCoverageMetricsFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsCoverageMetricsFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \verb{wgs_coverage_metrics_.csv} file output -from DRAGEN -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsCoverageMetricsFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsCoverageMetricsFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsCoverageMetricsFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/WgsFineHistFile.Rd b/man/WgsFineHistFile.Rd deleted file mode 100644 index b3f5407..0000000 --- a/man/WgsFineHistFile.Rd +++ /dev/null @@ -1,136 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{WgsFineHistFile} -\alias{WgsFineHistFile} -\title{WgsFineHistFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \verb{wgs_fine_hist_.csv} file output from DRAGEN. -This file contains two columns: Depth and Overall. -The value in the Depth column ranges from 0 to 1000+ and the Overall -column indicates the number of loci covered at the corresponding depth. -} -\examples{ -x1 <- system.file("extdata/wgs/SEQC-II.wgs_fine_hist_normal.csv.gz", package = "dracarys") -x2 <- system.file("extdata/wgs/SEQC-II.wgs_fine_hist_tumor.csv.gz", package = "dracarys") -ch1 <- WgsFineHistFile$new(x1) -ch2 <- WgsFineHistFile$new(x2) -d1 <- read(ch1) -d2 <- read(ch2) -ch1$plot(d1) -ch2$plot(d2) -ch1$write(d1, out_dir = tempdir(), prefix = "seqc_n", out_format = "tsv") -ch2$write(d2, out_dir = tempdir(), prefix = "seqc_t", out_format = "tsv") -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{WgsFineHistFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-WgsFineHistFile-read}{\code{WgsFineHistFile$read()}} -\item \href{#method-WgsFineHistFile-write}{\code{WgsFineHistFile$write()}} -\item \href{#method-WgsFineHistFile-plot}{\code{WgsFineHistFile$plot()}} -\item \href{#method-WgsFineHistFile-clone}{\code{WgsFineHistFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsFineHistFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \verb{wgs_fine_hist_.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsFineHistFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with following columns: -\itemize{ -\item depth -\item number of loci with given depth -} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsFineHistFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \verb{wgs_fine_hist_.csv} file output -from DRAGEN -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsFineHistFile$write( - d, - out_dir = NULL, - prefix, - out_format = "tsv", - drid = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsFineHistFile-plot}{}}} -\subsection{Method \code{plot()}}{ -Plots the \verb{wgs_fine_hist_.csv} files. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsFineHistFile$plot(d, x_lim = c(0, 300))}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{x_lim}}{X axis range to plot.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -A ggplot2 object with depth of coverage on X axis, -and number of loci with that depth on Y axis. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsFineHistFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsFineHistFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/WgsHistFile.Rd b/man/WgsHistFile.Rd deleted file mode 100644 index 0fecd69..0000000 --- a/man/WgsHistFile.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{WgsHistFile} -\alias{WgsHistFile} -\title{WgsHistFile R6 Class} -\description{ -Contains methods for reading and displaying contents of -the \code{wgs_hist.csv} file output from DRAGEN -} -\examples{ -x <- system.file("extdata/wgs/SEQC-II.wgs_hist.csv.gz", package = "dracarys") -h <- WgsHistFile$new(x) -d <- h$read() -h$write(d, out_dir = tempdir(), prefix = "seqc_sv", out_format = "tsv") - -} -\section{Super class}{ -\code{\link[dracarys:File]{dracarys::File}} -> \code{WgsHistFile} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-WgsHistFile-read}{\code{WgsHistFile$read()}} -\item \href{#method-WgsHistFile-write}{\code{WgsHistFile$write()}} -\item \href{#method-WgsHistFile-clone}{\code{WgsHistFile$clone()}} -} -} -\if{html}{\out{ -
Inherited methods - -
-}} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsHistFile-read}{}}} -\subsection{Method \code{read()}}{ -Reads the \code{wgs_hist.csv} file output from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsHistFile$read()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -tibble with one row and metrics spread across individual columns. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsHistFile-write}{}}} -\subsection{Method \code{write()}}{ -Writes a tidy version of the \code{wgs_hist.csv} file output -from DRAGEN. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsHistFile$write(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{d}}{Parsed object from \code{self$read()}.} - -\item{\code{out_dir}}{Output directory.} - -\item{\code{prefix}}{Prefix of output file(s).} - -\item{\code{out_format}}{Format of output file(s).} - -\item{\code{drid}}{dracarys ID to use for the dataset (e.g. \code{wfrid.123}, \code{prid.456}).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-WgsHistFile-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{WgsHistFile$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/dragen_contig_mean_coverage_plot.Rd b/man/dragen_contig_mean_coverage_plot.Rd new file mode 100644 index 0000000..2a98030 --- /dev/null +++ b/man/dragen_contig_mean_coverage_plot.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dragen.R +\name{dragen_contig_mean_coverage_plot} +\alias{dragen_contig_mean_coverage_plot} +\title{Plot DRAGEN Contig Mean Coverage +TODO +Plots the \verb{wgs_contig_mean_cov_.csv} files.} +\usage{ +dragen_contig_mean_coverage_plot(d, top_alt_n = 15) +} +\arguments{ +\item{d}{Parsed tibble.} + +\item{top_alt_n}{Number of top covered alt contigs to plot per phenotype.} +} +\value{ +A ggplot2 object with chromosomes on X axis, and coverage on Y axis. +} +\description{ +Plot DRAGEN Contig Mean Coverage +TODO +Plots the \verb{wgs_contig_mean_cov_.csv} files. +} diff --git a/man/tso_fraglenhist_plot.Rd b/man/dragen_fraglenhist_plot.Rd similarity index 66% rename from man/tso_fraglenhist_plot.Rd rename to man/dragen_fraglenhist_plot.Rd index 7a6daad..e046df2 100644 --- a/man/tso_fraglenhist_plot.Rd +++ b/man/dragen_fraglenhist_plot.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tso.R -\name{tso_fraglenhist_plot} -\alias{tso_fraglenhist_plot} -\title{Plot Fragment Length Hist} +% Please edit documentation in R/dragen.R +\name{dragen_fraglenhist_plot} +\alias{dragen_fraglenhist_plot} +\title{DRAGEN Fragment Length Hist Plot} \usage{ -tso_fraglenhist_plot(d, min_count = 10) +dragen_fraglenhist_plot(d, min_count = 10) } \arguments{ \item{d}{Parsed tibble.} diff --git a/man/time_metrics_process.Rd b/man/time_metrics_process.Rd deleted file mode 100644 index 91b6b20..0000000 --- a/man/time_metrics_process.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dragen.R -\name{time_metrics_process} -\alias{time_metrics_process} -\title{Process Multiple TimeMetricsFile Objects} -\usage{ -time_metrics_process(x, id = seq_len(length(x))) -} -\arguments{ -\item{x}{Atomic vector with one or more TimeMetricsFile objects.} - -\item{id}{ID for each input, which is used to disambiguate files -generated from same samples. Default: index from 1 to length of \code{x}.} -} -\value{ -tibble with the following columns: -\itemize{ -\item Step: DRAGEN step -\item Time: time in HH:MM -} -} -\description{ -Processes multiple TimeMetricsFile objects. -} -\examples{ -p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") -x <- TimeMetricsFile$new(p) -(tm <- time_metrics_process(c(x, x), id = c("run1", "run2"))) - -} diff --git a/tests/testthat/test-roxytest-testexamples-dragen.R b/tests/testthat/test-roxytest-testexamples-dragen.R deleted file mode 100644 index 8db3f7e..0000000 --- a/tests/testthat/test-roxytest-testexamples-dragen.R +++ /dev/null @@ -1,13 +0,0 @@ -# Generated by roxytest: do not edit by hand! - -# File R/dragen.R: @testexamples - -test_that("Function time_metrics_process() @ L1399", { - - p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") - x <- TimeMetricsFile$new(p) - (tm <- time_metrics_process(c(x, x), id = c("run1", "run2"))) - - expect_equal(nrow(tm), 2) -}) - From 91d30162e90e398104726ce07d0835f6de9efae8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 01:50:41 +1100 Subject: [PATCH 14/17] dragen: remove per-file R6 classes (R/) --- NAMESPACE | 13 - R/dragen.R | 1123 +++++---------------------------------------- R/dragen_fastqc.R | 48 -- R/tso.R | 28 -- 4 files changed, 104 insertions(+), 1108 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 8350259..6239a8d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,19 +4,11 @@ S3method(read,File) export(BcftoolsStatsFile) export(BclconvertReports) export(BclconvertReports375) -export(FastqcMetricsFile) export(File) -export(FragmentLengthHistFile) -export(MappingMetricsFile) export(MultiqcFile) export(PcgrJsonFile) export(PcgrTiersFile) export(PloidyEstimationMetricsFile) -export(ReplayFile) -export(SvMetricsFile) -export(TimeMetricsFile) -export(TrimmerMetricsFile) -export(VCMetricsFile) export(Wf) export(Wf_dragen) export(Wf_sash) @@ -25,10 +17,6 @@ export(Wf_tso_ctdna_tumor_only) export(Wf_tso_ctdna_tumor_only_v2) export(Wf_umccrise) export(Wf_umccrise_download_tidy_write) -export(WgsContigMeanCovFile) -export(WgsCoverageMetricsFile) -export(WgsFineHistFile) -export(WgsHistFile) export(bcftools_parse_vcf) export(bcftools_parse_vcf_regions) export(date_log) @@ -79,7 +67,6 @@ export(s3_list_files_filter_relevant) export(s3_search) export(session_info_kable) export(tidy_files) -export(time_metrics_process) export(tso_acfc_plot) export(tso_acfc_read) export(tso_combinedvaro_smallv_read) diff --git a/R/dragen.R b/R/dragen.R index 42811f9..90cfd8b 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -1,3 +1,32 @@ +#' DRAGEN Fragment Length Hist Plot +#' +#' Plots the fragment length distributions as given in the +#' `fragment_length_hist` file. +#' +#' @param d Parsed tibble. +#' @param min_count Minimum read count to be plotted (def: 10). +#' +#' @return A ggplot2 plot containing fragment lengths on X axis and read counts +#' on Y axis for each sample. +dragen_fraglenhist_plot <- function(d, min_count = 10) { + assertthat::assert_that(is.numeric(min_count), min_count >= 0) + d |> + dplyr::filter(.data$Count >= min_count) |> + ggplot2::ggplot(ggplot2::aes(x = .data$FragmentLength, y = .data$Count)) + + ggplot2::geom_line() + + ggplot2::labs(title = "Fragment Length Distribution") + + ggplot2::xlab("Fragment Length (bp)") + + ggplot2::ylab(glue("Read Count (min: {min_count})")) + + ggplot2::theme_minimal() + + ggplot2::theme( + legend.position = c(0.9, 0.9), + legend.justification = c(1, 1), + panel.grid.minor = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_blank(), + plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold") + ) +} + #' Read DRAGEN UMI Metrics #' #' Reads the `umi_metrics.csv` file output from DRAGEN. @@ -608,577 +637,87 @@ dragen_coverage_metrics_read <- function(x) { return(res) } -#' WgsContigMeanCovFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `wgs_contig_mean_cov_.csv` file output from DRAGEN. -#' This file contains the estimated coverage for all contigs, and an autosomal -#' estimated coverage. -#' -#' @examples -#' x1 <- system.file("extdata/wgs/SEQC-II.wgs_contig_mean_cov_normal.csv.gz", package = "dracarys") -#' x2 <- system.file("extdata/wgs/SEQC-II.wgs_contig_mean_cov_tumor.csv.gz", package = "dracarys") -#' cc1 <- WgsContigMeanCovFile$new(x1) -#' cc2 <- WgsContigMeanCovFile$new(x2) -#' d1 <- cc1$read() -#' d2 <- cc2$read() -#' cc1$write(d1, out_dir = tempdir(), prefix = "seqc_n", out_format = "tsv") -#' cc2$write(d2, out_dir = tempdir(), prefix = "seqc_t", out_format = "tsv") -#' -#' cc1$plot(d1) -#' cc2$plot(d2) -#' -#' @export -WgsContigMeanCovFile <- R6::R6Class( - "WgsContigMeanCovFile", - inherit = File, - public = list( - #' @description - #' Reads the `wgs_contig_mean_cov_.csv` file output from DRAGEN. - #' - #' @param keep_alt Keep the ALT + Mito chromosomes? - #' @return tibble with the following columns: - #' - label: file label. - #' - chrom: contig name. - #' - n_bases: number of bases aligned to contig (excludes bases from - #' duplicate marked reads, reads with MAPQ=0, and clipped bases). - #' - coverage: col2 / contig length - read = function(keep_alt = TRUE) { - x <- self$path - readr::read_csv(x, col_names = c("chrom", "n_bases", "coverage"), col_types = "cdd") |> - dplyr::filter( - if (!keep_alt) { - !grepl("chrM|MT|_|Autosomal|HLA-|EBV", .data$chrom) - } else { - TRUE - } - ) - }, - - #' @description - #' Writes a tidy version of the `wgs_contig_mean_cov_.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - }, - - - #' @description Plots the `wgs_contig_mean_cov_.csv` files. - #' @param d Parsed object from `self$read()`. - #' @param top_alt_n Number of top covered alt contigs to plot per phenotype. - #' @return A ggplot2 object with chromosomes on X axis, and coverage on Y axis. - plot = function(d, top_alt_n = 15) { - assertthat::assert_that(top_alt_n >= 0) - - # Display chr1-22, X, Y at top (M goes to bottom). - # Display top 20 of the rest, plus rest as 'other', at bottom - main_chrom1 <- c(1:22, "X", "Y") - main_chrom2 <- c(paste0("chr", main_chrom1)) - main_chrom <- c(main_chrom1, main_chrom2, "Autosomal regions") - min_cvg <- 0.000001 - - d <- d |> - dplyr::mutate( - panel = dplyr::if_else(.data$chrom %in% main_chrom, "main", "alt"), - panel = factor(.data$panel, levels = c("main", "alt")) - ) |> - dplyr::select("chrom", "coverage", "panel") - - main_panel <- d |> dplyr::filter(.data$panel == "main") - alt_panel <- d |> dplyr::filter(.data$panel == "alt") - top_alt <- alt_panel |> - dplyr::top_n(top_alt_n, wt = .data$coverage) |> - dplyr::arrange(dplyr::desc(.data$coverage)) |> - dplyr::pull(.data$chrom) |> - unique() - - alt_panel2 <- alt_panel |> - dplyr::mutate(alt_group = dplyr::if_else(.data$chrom %in% top_alt, "top", "bottom")) - - alt_panel_final <- alt_panel2 |> - dplyr::group_by(.data$alt_group) |> - dplyr::summarise(mean_cov = mean(.data$coverage)) |> - dplyr::inner_join(alt_panel2, by = c("alt_group")) |> - dplyr::mutate( - chrom = dplyr::if_else(.data$alt_group == "bottom", "OTHER", .data$chrom), - coverage = dplyr::if_else(.data$alt_group == "bottom", .data$mean_cov, .data$coverage) - ) |> - dplyr::distinct() |> - dplyr::filter(coverage > min_cvg) |> - dplyr::ungroup() |> - dplyr::select("chrom", "coverage", "panel") - - chrom_fac_levels <- c(main_chrom, "chrM", "MT", top_alt[!top_alt %in% c("chrM", "MT")], "OTHER") - d <- dplyr::bind_rows(main_panel, alt_panel_final) |> - dplyr::mutate(chrom = factor(.data$chrom, levels = chrom_fac_levels)) - - d |> - dplyr::mutate(label = "sampleA") |> - ggplot2::ggplot( - ggplot2::aes( - x = .data$chrom, y = .data$coverage, group = .data$label, - ) - ) + - ggplot2::geom_point() + - ggplot2::geom_line() + - ggplot2::scale_y_continuous( - limits = c(0, NA), expand = c(0, 0), labels = scales::comma, - breaks = scales::pretty_breaks(n = 8) - ) + - ggplot2::theme_minimal() + - ggplot2::labs(title = "Mean Coverage Per Chromosome", colour = "Label") + - ggplot2::xlab("Chromosome") + - ggplot2::ylab("Coverage") + - ggplot2::theme( - legend.position = "top", - panel.grid.minor = ggplot2::element_blank(), - panel.grid.major.y = ggplot2::element_blank(), - strip.background = ggplot2::element_blank(), - strip.text.x = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust = 1, size = 6), - plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold"), - panel.spacing = ggplot2::unit(2, "lines") - ) + - ggplot2::facet_wrap(ggplot2::vars(.data$panel), nrow = 2, scales = "free") - } - ) -) - -#' WgsCoverageMetricsFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `wgs_coverage_metrics_.csv` file output from DRAGEN. -#' This file contains read depth of coverage metrics. -#' -#' @examples -#' x1 <- system.file("extdata/wgs/SEQC-II.wgs_coverage_metrics_normal.csv.gz", package = "dracarys") -#' x2 <- system.file("extdata/wgs/SEQC-II.wgs_coverage_metrics_tumor.csv.gz", package = "dracarys") -#' cm1 <- WgsCoverageMetricsFile$new(x1) -#' cm2 <- WgsCoverageMetricsFile$new(x2) -#' d1 <- read(cm1) -#' d2 <- read(cm2) -#' cm1$write(d1, out_dir = tempdir(), prefix = "seqc_n", out_format = "tsv") -#' cm2$write(d2, out_dir = tempdir(), prefix = "seqc_t", out_format = "tsv") -#' -#' @export -WgsCoverageMetricsFile <- R6::R6Class( - "WgsCoverageMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `wgs_coverage_metrics_.csv` file output from DRAGEN. - #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - abbrev_nm <- c( - "Aligned bases" = "bases_aligned", - "Aligned bases in genome" = "bases_aligned_in_genome", - "Average alignment coverage over genome" = "cov_alignment_avg_over_genome", - "Uniformity of coverage (PCT > 0.2*mean) over genome" = "cov_uniformity_over_genome_pct_gt02mean", - "Uniformity of coverage (PCT > 0.4*mean) over genome" = "cov_uniformity_over_genome_pct_gt04mean", - "Average chr X coverage over genome" = "cov_avg_x_over_genome", - "Average chr Y coverage over genome" = "cov_avg_y_over_genome", - "Average mitochondrial coverage over genome" = "cov_avg_mt_over_genome", - "Average autosomal coverage over genome" = "cov_avg_auto_over_genome", - "Median autosomal coverage over genome" = "cov_median_auto_over_genome", - "Mean/Median autosomal coverage ratio over genome" = "cov_mean_median_auto_ratio_over_genome", - "Aligned reads" = "reads_aligned", - "Aligned reads in genome" = "reads_aligned_in_genome" - ) - - x <- self$path - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) - - res <- raw |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim( - "value", - delim = ",", too_few = "align_start", - names = c("category", "dummy1", "var", "value", "pct") - ) - # split to rename the - # "PCT of genome with coverage [100x: inf)" values - res1 <- res |> - # pct just shows 100% for a couple rows - dplyr::filter(!grepl("PCT of genome with coverage", .data$var)) |> - dplyr::select("var", "value") - res2 <- res |> - dplyr::filter(grepl("PCT of genome with coverage", .data$var)) |> - dplyr::mutate( - var = sub("PCT of genome with coverage ", "", .data$var), - var = gsub("\\[|\\]|\\(|\\)| ", "", .data$var), - var = gsub("x", "", .data$var), - var = gsub("inf", "Inf", .data$var) - ) |> - tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate(var = as.character(glue("cov_genome_pct_{start}_{end}"))) |> - dplyr::select("var", "value") - res <- dplyr::bind_rows(res1, res2) |> - dplyr::mutate( - value = dplyr::na_if(.data$value, "NA"), - value = as.numeric(.data$value), - var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - }, - #' @description - #' Writes a tidy version of the `wgs_coverage_metrics_.csv` file output - #' from DRAGEN - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - -#' WgsFineHistFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `wgs_fine_hist_.csv` file output from DRAGEN. -#' This file contains two columns: Depth and Overall. -#' The value in the Depth column ranges from 0 to 1000+ and the Overall -#' column indicates the number of loci covered at the corresponding depth. -#' -#' @examples -#' x1 <- system.file("extdata/wgs/SEQC-II.wgs_fine_hist_normal.csv.gz", package = "dracarys") -#' x2 <- system.file("extdata/wgs/SEQC-II.wgs_fine_hist_tumor.csv.gz", package = "dracarys") -#' ch1 <- WgsFineHistFile$new(x1) -#' ch2 <- WgsFineHistFile$new(x2) -#' d1 <- read(ch1) -#' d2 <- read(ch2) -#' ch1$plot(d1) -#' ch2$plot(d2) -#' ch1$write(d1, out_dir = tempdir(), prefix = "seqc_n", out_format = "tsv") -#' ch2$write(d2, out_dir = tempdir(), prefix = "seqc_t", out_format = "tsv") -#' @export -WgsFineHistFile <- R6::R6Class( - "WgsFineHistFile", - inherit = File, - public = list( - #' @description - #' Reads the `wgs_fine_hist_.csv` file output from DRAGEN. - #' @return tibble with following columns: - #' - depth - #' - number of loci with given depth - read = function() { - x <- self$path - d <- readr::read_csv(x, col_types = "cd") - assertthat::assert_that(all(colnames(d) == c("Depth", "Overall"))) - - # there's a max Depth of 2000+, so convert to numeric for easier plotting - d |> - dplyr::mutate( - Depth = ifelse(grepl("+", .data$Depth), sub("(\\d*)\\+", "\\1", .data$Depth), .data$Depth), - Depth = as.integer(.data$Depth) - ) |> - dplyr::select(depth = "Depth", n_loci = "Overall") - }, - #' @description - #' Writes a tidy version of the `wgs_fine_hist_.csv` file output - #' from DRAGEN - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - }, - - #' @description Plots the `wgs_fine_hist_.csv` files. - #' @param d Parsed object from `self$read()`. - #' @param x_lim X axis range to plot. - #' @return A ggplot2 object with depth of coverage on X axis, - #' and number of loci with that depth on Y axis. - plot = function(d, x_lim = c(0, 300)) { - assertthat::assert_that(length(x_lim) == 2) - d |> - ggplot2::ggplot(ggplot2::aes(x = .data$depth, y = .data$n_loci)) + - ggplot2::geom_line() + - ggplot2::coord_cartesian(xlim = x_lim) + - ggplot2::scale_y_continuous(labels = scales::label_comma()) + - ggplot2::scale_x_continuous(n.breaks = 8) + - ggplot2::labs(title = "Coverage Distribution", colour = "Label") + - ggplot2::xlab("Depth of Coverage") + - ggplot2::ylab("Number of Loci with Given Coverage") + - ggplot2::theme_minimal() + - ggplot2::theme( - legend.position = c(0.9, 0.9), - legend.justification = c(1, 1), - panel.grid.minor = ggplot2::element_blank(), - panel.grid.major.y = ggplot2::element_blank(), - plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold") - ) - } - ) -) - -#' FragmentLengthHistFile R6 Class -#' -#' @description -#' Contains methods for reading and plotting contents of -#' the `fragment_length_hist.csv` file output from DRAGEN. +#' Plot DRAGEN Contig Mean Coverage +#' TODO +#' Plots the `wgs_contig_mean_cov_.csv` files. #' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.fragment_length_hist.csv.gz", package = "dracarys") -#' fl <- FragmentLengthHistFile$new(x) -#' d <- fl$read() # or read(fl) -#' fl$plot(d) # or plot(fl) -#' fl$write(d |> dplyr::filter(count > 10), out_dir = tempdir(), prefix = "seqc_fl") -#' @export -FragmentLengthHistFile <- R6::R6Class( - "FragmentLengthHistFile", - inherit = File, - public = list( - #' @description Reads the `fragment_length_hist.csv` file, which contains the - #' fragment length distribution for each sample. - #' @return A tibble with the following columns: - #' - sample: name of sample - #' - fragmentLength: estimated fragment length - #' - count: number of reads with estimated fragment length - read = function() { - x <- self$path - d <- readr::read_lines(x) - assertthat::assert_that(grepl("#Sample", d[1])) - - d |> - tibble::enframe() |> - dplyr::mutate( - sample = dplyr::if_else( - grepl("#Sample", .data$value), - sub("#Sample: (.*)", "\\1", .data$value), - NA_character_ - ) - ) |> - tidyr::fill("sample", .direction = "down") |> - dplyr::filter(!grepl("#Sample: |FragmentLength,Count", .data$value)) |> - tidyr::separate_wider_delim(cols = "value", names = c("fragmentLength", "count"), delim = ",") |> - dplyr::mutate( - count = as.numeric(.data$count), - fragmentLength = as.numeric(.data$fragmentLength) - ) |> - dplyr::select("sample", "fragmentLength", "count") - }, - #' @description - #' Writes a tidy version of the `fragment_length_hist.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - }, +#' @param d Parsed tibble. +#' @param top_alt_n Number of top covered alt contigs to plot per phenotype. +#' @return A ggplot2 object with chromosomes on X axis, and coverage on Y axis. +dragen_contig_mean_coverage_plot <- function(d, top_alt_n = 15) { + assertthat::assert_that(top_alt_n >= 0) + # Display chr1-22, X, Y at top (M goes to bottom). + # Display top 20 of the rest, plus rest as 'other', at bottom + main_chrom1 <- c(1:22, "X", "Y") + main_chrom2 <- c(paste0("chr", main_chrom1)) + main_chrom <- c(main_chrom1, main_chrom2, "Autosomal regions") + min_cvg <- 0.000001 - #' @description Plots the fragment length distributions as given in the - #' `fragment_length_hist.csv` file. - #' - #' @param d Parsed object from `self$read()`. - #' @param min_count Minimum read count to be plotted (Default: 10). - #' @return A ggplot2 plot containing fragment lengths on X axis and read counts - #' on Y axis for each sample. - plot = function(d, min_count = 10) { - assertthat::assert_that(min_count >= 0) - d <- d |> - dplyr::filter(.data$count >= min_count) + d <- d |> + dplyr::mutate( + panel = dplyr::if_else(.data$chrom %in% main_chrom, "main", "alt"), + panel = factor(.data$panel, levels = c("main", "alt")) + ) |> + dplyr::select("chrom", "coverage", "panel") + + main_panel <- d |> dplyr::filter(.data$panel == "main") + alt_panel <- d |> dplyr::filter(.data$panel == "alt") + top_alt <- alt_panel |> + dplyr::top_n(top_alt_n, wt = .data$coverage) |> + dplyr::arrange(dplyr::desc(.data$coverage)) |> + dplyr::pull(.data$chrom) |> + unique() + + alt_panel2 <- alt_panel |> + dplyr::mutate(alt_group = dplyr::if_else(.data$chrom %in% top_alt, "top", "bottom")) + + alt_panel_final <- alt_panel2 |> + dplyr::group_by(.data$alt_group) |> + dplyr::summarise(mean_cov = mean(.data$coverage)) |> + dplyr::inner_join(alt_panel2, by = c("alt_group")) |> + dplyr::mutate( + chrom = dplyr::if_else(.data$alt_group == "bottom", "OTHER", .data$chrom), + coverage = dplyr::if_else(.data$alt_group == "bottom", .data$mean_cov, .data$coverage) + ) |> + dplyr::distinct() |> + dplyr::filter(.data$coverage > min_cvg) |> + dplyr::ungroup() |> + dplyr::select("chrom", "coverage", "panel") - d |> - ggplot2::ggplot(ggplot2::aes(x = .data$fragmentLength, y = .data$count, colour = sample)) + - ggplot2::geom_line() + - ggplot2::labs(title = "Fragment Length Distribution") + - ggplot2::xlab("Fragment Length (bp)") + - ggplot2::ylab(glue("Read Count (min: {min_count})")) + - ggplot2::theme_minimal() + - ggplot2::theme( - legend.position = c(0.9, 0.9), - legend.justification = c(1, 1), - panel.grid.minor = ggplot2::element_blank(), - panel.grid.major = ggplot2::element_blank(), - plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold") - ) - } - ) -) + chrom_fac_levels <- c(main_chrom, "chrM", "MT", top_alt[!top_alt %in% c("chrM", "MT")], "OTHER") + d <- dplyr::bind_rows(main_panel, alt_panel_final) |> + dplyr::mutate(chrom = factor(.data$chrom, levels = chrom_fac_levels)) -#' MappingMetricsFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `mapping_metrics.csv` file output from DRAGEN. -#' This file contains mapping and aligning metrics, like the metrics computed by -#' the Samtools Flagstat command. These metrics are available on an aggregate -#' level (over all input data), and on a per read group level. NOTE: we are -#' keeping only the read group level metrics (i.e. removing the aggregate data). -#' Unless explicitly stated, the metrics units are in reads (i.e., not in -#' terms of pairs or alignments). -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.mapping_metrics.csv.gz", package = "dracarys") -#' mm <- MappingMetricsFile$new(x) -#' d <- mm$read() # or read(mm) -#' mm$write(d, out_dir = tempdir(), prefix = "seqc_mm", out_format = "tsv") -#' -#' @export -MappingMetricsFile <- R6::R6Class( - "MappingMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `mapping_metrics.csv` file output from DRAGEN. - #' - #' @return tibble with one row of X metrics per read group. - read = function() { - abbrev_nm <- c( - "Total input reads" = "reads_tot_input", - "Number of duplicate marked reads" = "reads_num_dupmarked", - "Number of unique reads (excl. duplicate marked reads)" = "reads_num_uniq", - "Reads with mate sequenced" = "reads_w_mate_seq", - "Reads without mate sequenced" = "reads_wo_mate_seq", - "QC-failed reads" = "reads_qcfail", - "Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl", - "Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl", - "Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl", - "Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl", - "Reads mapping to multiple locations" = "reads_map_multiloc", - "Hard-clipped bases R1" = "bases_hardclip_r1", - "Hard-clipped bases R2" = "bases_hardclip_r2", - "Soft-clipped bases" = "bases_softclip", - "Hard-clipped bases" = "bases_hardclip", - "Mapped reads" = "reads_mapped", - "Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt", - "Mapped reads R1" = "reads_mapped_r1", - "Mapped reads R2" = "reads_mapped_r2", - "Number of unique & mapped reads (excl. duplicate marked reads)" = "reads_num_uniq_mapped", - "Unmapped reads" = "reads_unmapped", - "Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt", - "Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj", - "Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj", - "Singleton reads (itself mapped; mate unmapped)" = "reads_singleton", - "Paired reads (itself & mate mapped)" = "reads_paired", - "Properly paired reads" = "reads_paired_proper", - "Not properly paired reads (discordant)" = "reads_discordant", - "Paired reads mapped to different chromosomes" = "reads_paired_mapped_diff_chrom", - "Paired reads mapped to different chromosomes (MAPQ>=10)" = "reads_paired_mapped_diff_chrom_mapq10", - "Reads with MAPQ [40:inf)" = "reads_mapq_40_inf", - "Reads with MAPQ [30:40)" = "reads_mapq_30_40", - "Reads with MAPQ [20:30)" = "reads_mapq_20_30", - "Reads with MAPQ [10:20)" = "reads_mapq_10_20", - "Reads with MAPQ [ 0:10)" = "reads_mapq_0_10", - "Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped", - "Reads with indel R1" = "reads_indel_r1", - "Reads with indel R2" = "reads_indel_r2", - "Reads with splice junction" = "reads_splicejunc", - "Total bases" = "bases_tot", - "Total bases R1" = "bases_tot_r1", - "Total bases R2" = "bases_tot_r2", - "Mapped bases" = "bases_mapped", - "Mapped bases R1" = "bases_mapped_r1", - "Mapped bases R2" = "bases_mapped_r2", - "Soft-clipped bases R1" = "bases_softclip_r1", - "Soft-clipped bases R2" = "bases_softclip_r2", - "Mismatched bases R1" = "bases_mismatched_r1", - "Mismatched bases R2" = "bases_mismatched_r2", - "Mismatched bases R1 (excl. indels)" = "bases_mismatched_r1_noindels", - "Mismatched bases R2 (excl. indels)" = "bases_mismatched_r2_noindels", - "Q30 bases" = "bases_q30", - "Q30 bases R1" = "bases_q30_r1", - "Q30 bases R2" = "bases_q30_r2", - "Q30 bases (excl. dups & clipped bases)" = "bases_q30_nodups_noclipped", - "Total alignments" = "alignments_tot", - "Secondary alignments" = "alignments_secondary", - "Supplementary (chimeric) alignments" = "alignments_chimeric", - "Estimated read length" = "read_len", - "Bases in reference genome" = "bases_in_ref_genome", - "Bases in target bed [% of genome]" = "bases_in_target_bed_genome_pct", - "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", - "Insert length: mean" = "insert_len_mean", - "Insert length: median" = "insert_len_median", - "Insert length: standard deviation" = "insert_len_std_dev", - "Provided sex chromosome ploidy" = "ploidy_sex_chrom_provided", - "Estimated sample contamination" = "contamination_est", - "DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec", - "Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed", - "Total reads in RG" = "reads_tot_rg", - "Filtered rRNA reads" = "reads_rrna_filtered" + d |> + dplyr::mutate(label = "sampleA") |> + ggplot2::ggplot( + ggplot2::aes( + x = .data$chrom, y = .data$coverage, group = .data$label, ) - x <- self$path - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("MAPPING/ALIGNING", raw[1])) - # tidy - d <- raw |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim( - "value", - names = c("category", "RG", "var", "count", "pct"), - delim = ",", too_few = "align_start" - ) |> - dplyr::filter(.data$RG != "") |> - dplyr::mutate( - count = dplyr::na_if(.data$count, "NA"), - count = as.numeric(.data$count), - pct = as.numeric(.data$pct), - var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> - dplyr::select("RG", "var", "count", "pct") - # pivot - d |> - tidyr::pivot_longer(c("count", "pct")) |> - dplyr::mutate( - name = dplyr::if_else(.data$name == "count", "", "_pct"), - var = glue("{.data$var}{.data$name}") - ) |> - dplyr::select("RG", "var", "value") |> - dplyr::filter(!is.na(.data$value)) |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - }, - #' @description - #' Writes a tidy version of the `mapping_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) + ) + + ggplot2::geom_point() + + ggplot2::geom_line() + + ggplot2::scale_y_continuous( + limits = c(0, NA), expand = c(0, 0), labels = scales::comma, + breaks = scales::pretty_breaks(n = 8) + ) + + ggplot2::theme_minimal() + + ggplot2::labs(title = "Mean Coverage Per Chromosome", colour = "Label") + + ggplot2::xlab("Chromosome") + + ggplot2::ylab("Coverage") + + ggplot2::theme( + legend.position = "top", + panel.grid.minor = ggplot2::element_blank(), + panel.grid.major.y = ggplot2::element_blank(), + strip.background = ggplot2::element_blank(), + strip.text.x = ggplot2::element_blank(), + axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust = 1, size = 6), + plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold"), + panel.spacing = ggplot2::unit(2, "lines") + ) + + ggplot2::facet_wrap(ggplot2::vars(.data$panel), nrow = 2, scales = "free") +} #' PloidyEstimationMetricsFile R6 Class #' @@ -1267,460 +806,6 @@ PloidyEstimationMetricsFile <- R6::R6Class( ) ) -#' ReplayFile R6 Class -#' -#' @description -#' Contains methods for reading contents of -#' the `replay.json` file output from DRAGEN, which contains the DRAGEN command -#' line, parameters and version for the specific run. -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II-replay.json.gz", package = "dracarys") -#' r <- ReplayFile$new(x) -#' d <- r$read() # or read(r) -#' r$write(d, out_dir = tempdir(), prefix = "seqc_replay", out_format = "tsv") -#' @export -ReplayFile <- R6::R6Class( - "ReplayFile", - inherit = File, - public = list( - #' @description Reads the `replay.json` file. - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - res <- x |> - jsonlite::read_json(simplifyVector = TRUE) |> - purrr::map_if(is.data.frame, tibble::as_tibble) - - req_elements <- c("command_line", "hash_table_build", "dragen_config", "system") - assertthat::assert_that(all(names(res) %in% req_elements)) - res[["system"]] <- res[["system"]] |> - tibble::as_tibble_row() - res[["hash_table_build"]] <- res[["hash_table_build"]] |> - tibble::as_tibble_row() - # we don't care if the columns are characters, no analysis likely to be done on dragen options - # (though never say never!) - res[["dragen_config"]] <- res[["dragen_config"]] |> - tidyr::pivot_wider(names_from = "name", values_from = "value") - - dplyr::bind_cols(res) - }, - #' @description - #' Writes a tidy version of the `replay.json` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - -#' TimeMetricsFile R6 Class -#' -#' @description -#' Contains methods for reading contents of -#' the `time_metrics.csv` file output from DRAGEN, which contains -#' a breakdown of the run duration for each DRAGEN process. -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") -#' tm <- TimeMetricsFile$new(x) -#' d <- tm$read() # or read(tm) -#' tm$write(d, out_dir = tempdir(), prefix = "seqc_time", out_format = "tsv") -#' @export -TimeMetricsFile <- R6::R6Class( - "TimeMetricsFile", - inherit = File, - public = list( - #' @description Reads the `time_metrics.csv` file. - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - cn <- c("dummy1", "dummy2", "Step", "time_hrs", "time_sec") - ct <- readr::cols(.default = "c", time_hrs = readr::col_time(format = "%T"), time_sec = "d") - d <- readr::read_csv(x, col_names = cn, col_types = ct) - assertthat::assert_that(d$dummy1[1] == "RUN TIME", is.na(d$dummy2[1])) - assertthat::assert_that(inherits(d$time_hrs, "hms")) - d |> - dplyr::mutate( - Step = tools::toTitleCase(sub("Time ", "", .data$Step)), - Time = substr(.data$time_hrs, 1, 5) - ) |> - dplyr::select("Step", "Time") |> - tidyr::pivot_wider(names_from = "Step", values_from = "Time") |> - dplyr::relocate("Total Runtime") - }, - #' @description - #' Writes a tidy version of the `time_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - -#' Process Multiple TimeMetricsFile Objects -#' -#' Processes multiple TimeMetricsFile objects. -#' -#' @param x Atomic vector with one or more TimeMetricsFile objects. -#' @param id ID for each input, which is used to disambiguate files -#' generated from same samples. Default: index from 1 to length of `x`. -#' @return tibble with the following columns: -#' - Step: DRAGEN step -#' - Time: time in HH:MM -#' -#' @examples -#' p <- system.file("extdata/wgs/SEQC-II.time_metrics.csv.gz", package = "dracarys") -#' x <- TimeMetricsFile$new(p) -#' (tm <- time_metrics_process(c(x, x), id = c("run1", "run2"))) -#' -#' @testexamples -#' expect_equal(nrow(tm), 2) -#' -#' @export -time_metrics_process <- function(x, id = seq_len(length(x))) { - assertthat::assert_that(all(purrr::map_lgl(x, ~ inherits(.x, "TimeMetricsFile")))) - x |> - purrr::map(read) |> - purrr::set_names(id) |> - dplyr::bind_rows(.id = "ID") -} - -#' VCMetricsFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `vc_metrics.csv` file output from DRAGEN, which contains variant calling metrics. -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.vc_metrics.csv.gz", package = "dracarys") -#' vm <- VCMetricsFile$new(x) -#' d <- vm$read() # or read(vm) -#' vm$write(d, out_dir = tempdir(), prefix = "seqc_vc", out_format = "tsv") -#' -#' @export -VCMetricsFile <- R6::R6Class( - "VCMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `vc_metrics.csv` file output from DRAGEN. - #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - abbrev_nm <- c( - "Total" = "var_tot", - "Biallelic" = "var_biallelic", - "Multiallelic" = "var_multiallelic", - "SNPs" = "var_snp", - "Insertions (Hom)" = "var_ins_hom", - "Insertions (Het)" = "var_ins_het", - "Deletions (Hom)" = "var_del_hom", - "Deletions (Het)" = "var_del_het", - "Indels (Het)" = "var_indel_het", - "Chr X number of SNPs over genome" = "var_snp_x_over_genome", - "Chr Y number of SNPs over genome" = "var_snp_y_over_genome", - "(Chr X SNPs)/(chr Y SNPs) ratio over genome" = "var_x_over_y_snp_ratio_over_genome", - "SNP Transitions" = "var_snp_transitions", - "SNP Transversions" = "var_snp_transversions", - "Ti/Tv ratio" = "var_ti_tv_ratio", - "Heterozygous" = "var_heterozygous", - "Homozygous" = "var_homozygous", - "Het/Hom ratio" = "var_het_hom_ratio", - "In dbSNP" = "var_in_dbsnp", - "Not in dbSNP" = "var_nin_dbsnp", - "Percent Callability" = "callability_pct", - "Percent Autosome Callability" = "callability_auto_pct", - "Number of samples" = "sample_num", - "Reads Processed" = "reads_processed", - "Child Sample" = "sample_child" - ) - x <- self$path - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("VARIANT CALLER", raw[1])) - # tidy - d <- raw |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim( - "value", - names = c("category", "sample", "var", "count", "pct"), - delim = ",", too_few = "align_start" - ) |> - dplyr::mutate( - var = dplyr::recode(.data$var, !!!abbrev_nm), - count = dplyr::na_if(.data$count, "NA"), - count = as.numeric(.data$count), - pct = round(as.numeric(.data$pct), 2), - category = dplyr::case_when( - grepl("SUMMARY", .data$category) ~ "summary", - grepl("PREFILTER", .data$category) ~ "prefilter", - grepl("POSTFILTER", .data$category) ~ "postfilter", - TRUE ~ "unknown" - ) - ) |> - dplyr::filter(.data$category != "summary") |> - dplyr::select("category", "sample", "var", "count", "pct") - # pivot - d |> - tidyr::pivot_longer(c("count", "pct")) |> - dplyr::mutate( - name = dplyr::if_else(.data$name == "count", "", "_pct"), - var = glue("{.data$var}{.data$name}") - ) |> - dplyr::select("category", "sample", "var", "value") |> - dplyr::filter(!is.na(.data$value)) |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - }, - #' @description - #' Writes a tidy version of the `vc_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - -#' TrimmerMetricsFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `trimmer_metrics.csv` file output from DRAGEN -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.trimmer_metrics.csv.gz", package = "dracarys") -#' tm <- TrimmerMetricsFile$new(x) -#' d <- tm$read() -#' tm$write(d, out_dir = tempdir(), prefix = "seqc_tm", out_format = "tsv") -#' -#' @export -TrimmerMetricsFile <- R6::R6Class( - "TrimmerMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `trimmer_metrics.csv` file output from DRAGEN. - #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - d <- readr::read_lines(x) - assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1])) - abbrev_nm <- c( - "Total input reads" = "reads_tot_input", - "Total input bases" = "bases_tot", - "Total input bases R1" = "bases_r1", - "Total input bases R2" = "bases_r2", - "Average input read length" = "read_len_avg", - "Total trimmed reads" = "reads_trimmed_tot", - "Total trimmed bases" = "bases_trimmed_tot", - "Average bases trimmed per read" = "bases_trimmed_avg_per_read", - "Average bases trimmed per trimmed read" = "bases_trimmed_avg_per_trimmedread", - "Remaining poly-G K-mers R1 3prime" = "polygkmers3r1_remaining", - "Remaining poly-G K-mers R2 3prime" = "polygkmers3r2_remaining", - "Poly-G soft trimmed reads unfiltered R1 3prime" = "polyg_soft_trimmed_reads_unfilt_3r1", - "Poly-G soft trimmed reads unfiltered R2 3prime" = "polyg_soft_trimmed_reads_unfilt_3r2", - "Poly-G soft trimmed reads filtered R1 3prime" = "polyg_soft_trimmed_reads_filt_3r1", - "Poly-G soft trimmed reads filtered R2 3prime" = "polyg_soft_trimmed_reads_filt_3r2", - "Poly-G soft trimmed bases unfiltered R1 3prime" = "polyg_soft_trimmed_bases_unfilt_3r1", - "Poly-G soft trimmed bases unfiltered R2 3prime" = "polyg_soft_trimmed_bases_unfilt_3r2", - "Poly-G soft trimmed bases filtered R1 3prime" = "polyg_soft_trimmed_bases_filt_3r1", - "Poly-G soft trimmed bases filtered R2 3prime" = "polyg_soft_trimmed_bases_filt_3r2", - "Total filtered reads" = "reads_tot_filt", - "Reads filtered for minimum read length R1" = "reads_filt_minreadlenr1", - "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2" - ) - - d |> - tibble::as_tibble_col(column_name = "value") |> - tidyr::separate_wider_delim("value", names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start") |> - dplyr::mutate( - count = as.numeric(.data$count), - pct = round(as.numeric(.data$pct), 2), - var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> - dplyr::select("var", "count", "pct") |> - tidyr::pivot_longer(c("count", "pct")) |> - dplyr::filter(!is.na(.data$value)) |> - dplyr::mutate( - name = dplyr::if_else(.data$name == "count", "", "_pct"), - var = glue("{.data$var}{.data$name}") - ) |> - dplyr::select("var", "value") |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - }, - #' @description - #' Writes a tidy version of the `trimmer_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - -#' SvMetricsFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `sv_metrics.csv` file output from DRAGEN -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.sv_metrics.csv.gz", package = "dracarys") -#' sv <- SvMetricsFile$new(x) -#' d <- sv$read() -#' sv$write(d, out_dir = tempdir(), prefix = "seqc_sv", out_format = "tsv") -#' -#' @export -SvMetricsFile <- R6::R6Class( - "SvMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `sv_metrics.csv` file output from DRAGEN. - #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - d <- readr::read_lines(x) - assertthat::assert_that(grepl("SV SUMMARY", d[1])) - abbrev_nm <- c( - "Number of deletions (PASS)" = "n_del", - "Number of insertions (PASS)" = "n_ins", - "Number of duplications (PASS)" = "n_dup", - "Number of breakend pairs (PASS)" = "n_bnd" - ) - d |> - tibble::as_tibble_col(column_name = "value") |> - dplyr::filter(!grepl("Total number of structural variants", .data$value)) |> - tidyr::separate_wider_delim( - "value", - names = c("svsum", "sample", "var", "count", "pct"), delim = ",", - too_few = "align_start" - ) |> - dplyr::mutate( - count = as.numeric(.data$count), - pct = round(as.numeric(.data$pct), 2), - var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> - dplyr::select("var", "count", "pct") |> - tidyr::pivot_longer(c("count", "pct")) |> - dplyr::mutate( - name = dplyr::if_else(.data$name == "count", "", "_pct"), - var = glue("{.data$var}{.data$name}") - ) |> - dplyr::arrange(.data$name, .data$var) |> - dplyr::select("var", "value") |> - tidyr::pivot_wider(names_from = "var", values_from = "value") - }, - #' @description - #' Writes a tidy version of the `sv_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - -#' WgsHistFile R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `wgs_hist.csv` file output from DRAGEN -#' -#' @examples -#' x <- system.file("extdata/wgs/SEQC-II.wgs_hist.csv.gz", package = "dracarys") -#' h <- WgsHistFile$new(x) -#' d <- h$read() -#' h$write(d, out_dir = tempdir(), prefix = "seqc_sv", out_format = "tsv") -#' -#' @export -WgsHistFile <- R6::R6Class( - "WgsHistFile", - inherit = File, - public = list( - #' @description - #' Reads the `wgs_hist.csv` file output from DRAGEN. - #' - #' @return tibble with one row and metrics spread across individual columns. - read = function() { - x <- self$path - d <- readr::read_csv(x, col_names = c("var", "pct"), col_types = "cd") - d |> - dplyr::mutate( - var = sub("PCT of bases in wgs with coverage ", "", .data$var), - var = gsub("\\[|\\]|\\(|\\)", "", .data$var), - var = gsub("x", "", .data$var), - var = gsub("inf", "Inf", .data$var) - ) |> - tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |> - dplyr::mutate( - start = as.numeric(.data$start), - end = as.numeric(.data$end), - pct = round(.data$pct, 2), - cumsum = cumsum(.data$pct) - ) - }, - #' @description - #' Writes a tidy version of the `wgs_hist.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid) - } - ) -) - dragen_subprefix <- function(x, suffix) { # L2401290.exon_contig_mean_cov.csv -> exon # L2401290.target_bed_contig_mean_cov.csv -> target_bed diff --git a/R/dragen_fastqc.R b/R/dragen_fastqc.R index 4390037..0ac31bc 100644 --- a/R/dragen_fastqc.R +++ b/R/dragen_fastqc.R @@ -1,51 +1,3 @@ -#' FastqcMetrics R6 Class -#' -#' @description -#' Contains methods for reading and displaying contents of -#' the `fastqc_metrics.csv` file output from DRAGEN. -#' -#' @export -FastqcMetricsFile <- R6::R6Class( - "FastqcMetricsFile", - inherit = File, - public = list( - #' @description - #' Reads the `fastqc_metrics.csv` file output from DRAGEN. - #' - #' @return tibble. TODO. - read = function() { - x <- self$path - res <- dragen_fastqc_metrics_read(x) - }, - #' @description - #' Writes a tidy version of the `fastqc_metrics.csv` file output - #' from DRAGEN. - #' - #' @param d Parsed object from `self$read()`. - #' @param prefix Prefix of output file(s). - #' @param out_dir Output directory. - #' @param out_format Format of output file(s). - #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). - write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) { - if (!is.null(out_dir)) { - prefix <- file.path(out_dir, prefix) - } - d_write <- d |> - tibble::enframe(name = "section") |> - dplyr::rowwise() |> - dplyr::mutate( - section_low = tolower(.data$section), - p = glue("{prefix}_{.data$section_low}"), - out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid)) - ) |> - dplyr::ungroup() |> - dplyr::select("section", "value") |> - tibble::deframe() - invisible(d_write) - } - ) -) - #' DRAGEN FASTQC Metrics #' #' Read DRAGEN `fastqc_metrics.csv` file. diff --git a/R/tso.R b/R/tso.R index 7596449..6df0cde 100644 --- a/R/tso.R +++ b/R/tso.R @@ -325,34 +325,6 @@ tso_tmbt_read <- function(x) { d[] } -#' Plot Fragment Length Hist -#' -#' Plots the fragment length distributions as given in the -#' `fragment_length_hist` file. -#' -#' @param d Parsed tibble. -#' @param min_count Minimum read count to be plotted (def: 10). -#' -#' @return A ggplot2 plot containing fragment lengths on X axis and read counts -#' on Y axis for each sample. -tso_fraglenhist_plot <- function(d, min_count = 10) { - assertthat::assert_that(is.numeric(min_count), min_count >= 0) - d |> - dplyr::filter(.data$Count >= min_count) |> - ggplot2::ggplot(ggplot2::aes(x = .data$FragmentLength, y = .data$Count)) + - ggplot2::geom_line() + - ggplot2::labs(title = "Fragment Length Distribution") + - ggplot2::xlab("Fragment Length (bp)") + - ggplot2::ylab(glue("Read Count (min: {min_count})")) + - ggplot2::theme_minimal() + - ggplot2::theme( - legend.position = c(0.9, 0.9), - legend.justification = c(1, 1), - panel.grid.minor = ggplot2::element_blank(), - panel.grid.major = ggplot2::element_blank(), - plot.title = ggplot2::element_text(colour = "#2c3e50", size = 14, face = "bold") - ) -} #' Read TSO TargetRegionCoverage File #' From 9c32347f478220f2ae220c23f03e43f2197a5eb9 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 21:30:15 +1100 Subject: [PATCH 15/17] dragen: add dirty name checker --- R/dragen.R | 89 ++++++++++++++++++++++------- R/dragen_fastqc.R | 18 +++--- R/tso_dragen.R | 2 +- R/utils.R | 15 +++++ man/dragen_cnv_metrics_read.Rd | 6 ++ man/dragen_coverage_metrics_read.Rd | 5 ++ man/dragen_gc_metrics_read.Rd | 6 ++ man/dragen_mapping_metrics_read.Rd | 5 ++ man/dragen_sv_metrics_read.Rd | 5 ++ man/dragen_trimmer_metrics_read.Rd | 5 ++ man/dragen_umi_metrics_read.Rd | 5 ++ man/dragen_vc_metrics_read.Rd | 5 ++ 12 files changed, 135 insertions(+), 31 deletions(-) diff --git a/R/dragen.R b/R/dragen.R index 90cfd8b..2e6cbbd 100644 --- a/R/dragen.R +++ b/R/dragen.R @@ -34,6 +34,10 @@ dragen_fraglenhist_plot <- function(d, min_count = 10) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_umi_metrics_read(x) +#' } #' @export dragen_umi_metrics_read <- function(x) { d0 <- readr::read_lines(x) @@ -85,6 +89,7 @@ dragen_umi_metrics_read <- function(x) { var = tolower(.data$var), var = dplyr::recode(.data$var, !!!abbrev_nm) ) + dirty_names_cleaned(d1$var, abbrev_nm, x) hist <- d1 |> dplyr::filter(grepl("histo", .data$var)) |> dplyr::select(name = "var", "count") |> @@ -121,6 +126,11 @@ dragen_umi_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_gc_metrics_read(x) +#' } +#' #' @export dragen_gc_metrics_read <- function(x) { d0 <- readr::read_lines(x) @@ -155,6 +165,7 @@ dragen_gc_metrics_read <- function(x) { name = dplyr::recode(.data$name, !!!abbrev_nm) ) |> tidyr::pivot_wider(names_from = "name", values_from = "value") + dirty_names_cleaned(colnames(summary), abbrev_nm, x) # GC BIAS DETAILS details <- d1 |> @@ -190,6 +201,11 @@ dragen_gc_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_cnv_metrics_read(x) +#' } +#' #' @export dragen_cnv_metrics_read <- function(x) { d0 <- readr::read_lines(x) @@ -230,7 +246,9 @@ dragen_cnv_metrics_read <- function(x) { count = as.numeric(.data$count), pct = round(as.numeric(.data$pct), 2), var = dplyr::recode(.data$var, !!!abbrev_nm) - ) |> + ) + dirty_names_cleaned(d2$var, abbrev_nm, x) + d2 <- d2 |> dplyr::select("var", "count", "pct") |> tidyr::pivot_longer(c("count", "pct")) |> dplyr::filter(!is.na(.data$value)) |> @@ -251,6 +269,10 @@ dragen_cnv_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_sv_metrics_read(x) +#' } #' @export dragen_sv_metrics_read <- function(x) { d <- readr::read_lines(x) @@ -261,7 +283,7 @@ dragen_sv_metrics_read <- function(x) { "Number of duplications (PASS)" = "dup", "Number of breakend pairs (PASS)" = "bnd" ) - d |> + res <- d |> tibble::as_tibble_col(column_name = "value") |> dplyr::filter(!grepl("Total number of structural variants", .data$value)) |> tidyr::separate_wider_delim( @@ -275,6 +297,8 @@ dragen_sv_metrics_read <- function(x) { ) |> dplyr::select("var", "count") |> tidyr::pivot_wider(names_from = "var", values_from = "count") + dirty_names_cleaned(colnames(res), abbrev_nm, x) + res } #' Read DRAGEN Trimmer Metrics @@ -284,10 +308,14 @@ dragen_sv_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_trimmer_metrics_read(x) +#' } #' @export dragen_trimmer_metrics_read <- function(x) { - d <- readr::read_lines(x) - assertthat::assert_that(grepl("TRIMMER STATISTICS", d[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("TRIMMER STATISTICS", d0[1])) abbrev_nm <- c( "Total input reads" = "reads_tot_input", "Total input bases" = "bases_tot", @@ -313,7 +341,7 @@ dragen_trimmer_metrics_read <- function(x) { "Reads filtered for minimum read length R2" = "reads_filt_minreadlenr2" ) - d |> + d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim("value", names = c("category", "extra", "var", "count", "pct"), delim = ",", too_few = "align_start") |> dplyr::mutate( @@ -321,7 +349,9 @@ dragen_trimmer_metrics_read <- function(x) { pct = round(as.numeric(.data$pct), 2), var = dplyr::recode(.data$var, !!!abbrev_nm) ) |> - dplyr::select("var", "count", "pct") |> + dplyr::select("var", "count", "pct") + dirty_names_cleaned(d1$var, abbrev_nm, x) + d1 |> tidyr::pivot_longer(c("count", "pct")) |> dplyr::filter(!is.na(.data$value)) |> dplyr::mutate( @@ -339,6 +369,10 @@ dragen_trimmer_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_vc_metrics_read(x) +#' } #' @export dragen_vc_metrics_read <- function(x) { abbrev_nm1 <- tibble::tribble( @@ -367,12 +401,14 @@ dragen_vc_metrics_read <- function(x) { "Percent Autosome Callability", "callability_auto_pct", FALSE, "Number of samples", "sample_num", FALSE, "Reads Processed", "reads_processed", FALSE, - "Child Sample", "sample_child", FALSE + "Child Sample", "sample_child", FALSE, + "Percent QC Region Callability in Region 1", "qc_region_callability_pct_region1", FALSE, + "Percent QC Region Callability in Region 2", "qc_region_callability_pct_region2", FALSE ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("VARIANT CALLER", raw[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("VARIANT CALLER", d0[1])) # first detect if this is genome or target region - res <- raw |> + d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", @@ -381,7 +417,7 @@ dragen_vc_metrics_read <- function(x) { ) reg1 <- NULL str1 <- NULL - tmp <- res |> + tmp <- d1 |> dplyr::filter(grepl("Chr X number of SNPs over ", .data$var)) |> dplyr::slice_head(n = 1) |> dplyr::pull("var") @@ -406,7 +442,7 @@ dragen_vc_metrics_read <- function(x) { dplyr::select("raw", "clean") |> tibble::deframe() - d <- res |> + d <- d1 |> dplyr::mutate( var = dplyr::recode(.data$var, !!!abbrev_nm), count = dplyr::na_if(.data$count, "NA"), @@ -421,6 +457,7 @@ dragen_vc_metrics_read <- function(x) { ) |> dplyr::filter(.data$category != "summary") |> dplyr::select("category", "sample", "var", "count", "pct") + dirty_names_cleaned(d$var, abbrev_nm, x) # pivot d |> tidyr::pivot_longer(c("count", "pct")) |> @@ -440,6 +477,10 @@ dragen_vc_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_mapping_metrics_read(x) +#' } #' @export dragen_mapping_metrics_read <- function(x) { abbrev_nm <- c( @@ -516,11 +557,11 @@ dragen_mapping_metrics_read <- function(x) { "Average sequenced coverage over genome" = "cov_avg_seq_over_genome", "Filtered rRNA reads" = "reads_rrna_filtered" ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("MAPPING/ALIGNING", raw[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("MAPPING/ALIGNING", d0[1])) # split by RG and non-RG # tidy - d <- raw |> + d <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", @@ -535,6 +576,7 @@ dragen_mapping_metrics_read <- function(x) { RG = dplyr::if_else(.data$RG == "", "Total", .data$RG) ) |> dplyr::select("RG", "var", "count", "pct") + dirty_names_cleaned(unique(d$var), abbrev_nm, x) # pivot d |> tidyr::pivot_longer(c("count", "pct")) |> @@ -553,6 +595,10 @@ dragen_mapping_metrics_read <- function(x) { #' @param x Path to file. #' #' @return Tibble with metrics. +#' @examples +#' \dontrun{ +#' dragen_coverage_metrics_read(x) +#' } #' @export dragen_coverage_metrics_read <- function(x) { # all rows except 'Aligned bases' and 'Aligned reads' refer to the region @@ -572,10 +618,10 @@ dragen_coverage_metrics_read <- function(x) { "Mean/Median autosomal coverage ratio over ", "cov_mean_median_auto_ratio_over_", TRUE, "Aligned reads in ", "reads_aligned_in_", TRUE ) - raw <- readr::read_lines(x) - assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1])) + d0 <- readr::read_lines(x) + assertthat::assert_that(grepl("COVERAGE SUMMARY", d0[1])) # first detect if this is genome, QC coverage region, or target region - res <- raw |> + d1 <- d0 |> tibble::as_tibble_col(column_name = "value") |> tidyr::separate_wider_delim( "value", @@ -584,7 +630,7 @@ dragen_coverage_metrics_read <- function(x) { ) reg1 <- NULL str1 <- NULL - tmp <- res |> + tmp <- d1 |> dplyr::filter(grepl("PCT of .* with coverage ", .data$var)) |> dplyr::slice_head(n = 1) |> dplyr::pull("var") @@ -611,12 +657,13 @@ dragen_coverage_metrics_read <- function(x) { # split to rename the # "PCT of genome with coverage [100x: inf)" values pat <- glue("PCT of {str1} with coverage ") - res1 <- res |> + res1 <- d1 |> # pct just shows % for a couple rows which can be # calculated from their above values dplyr::filter(!grepl(pat, .data$var)) |> dplyr::select("var", "value") - res2 <- res |> + dirty_names_cleaned(res1$var, names(abbrev_nm), x) + res2 <- d1 |> dplyr::filter(grepl(pat, .data$var)) |> dplyr::mutate( var = sub(pat, "", .data$var), diff --git a/R/dragen_fastqc.R b/R/dragen_fastqc.R index 0ac31bc..ceb9ce3 100644 --- a/R/dragen_fastqc.R +++ b/R/dragen_fastqc.R @@ -120,14 +120,14 @@ dragen_fastqc_metrics_read <- function(x) { dplyr::select("mate", "seq", "bp", "value") list( - positional_base_content = pos_base_cont, - positional_base_mean_quality = pos_base_mean_qual, - positional_quality = pos_qual, - read_gc_content = gc_cont, - read_gc_content_quality = gc_cont_qual, - read_lengths = read_len, - read_mean_quality = read_mean_qual, - sequence_positions = seq_pos + fqc_positionalBaseContent = pos_base_cont, + fqc_positionalBaseMeanQuality = pos_base_mean_qual, + fqc_positionalQuality = pos_qual, + fqc_readGCContent = gc_cont, + fqc_readGCContentQuality = gc_cont_qual, + fqc_readLengths = read_len, + fqc_readMeanQuality = read_mean_qual, + fqc_sequencePositions = seq_pos ) |> - tibble::enframe(name = "fastqc_name", value = "fastqc_value") + tibble::enframe(name = "name", value = "data") } diff --git a/R/tso_dragen.R b/R/tso_dragen.R index d5175a3..dc35091 100644 --- a/R/tso_dragen.R +++ b/R/tso_dragen.R @@ -241,7 +241,7 @@ Wf_dragen <- R6::R6Class( #' @param x Path to file. read_fastqcMetrics = function(x) { dat <- dragen_fastqc_metrics_read(x) - tibble::tibble(name = "fastqcmetrics", data = list(dat[])) + dat }, #' @description Read `gc_metrics.csv` file. #' @param x Path to file. diff --git a/R/utils.R b/R/utils.R index 8691826..c2bc44c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,18 @@ +dirty_names_cleaned <- function(dirty, clean, fname = NULL) { + assertthat::assert_that( + rlang::is_atomic(dirty), + rlang::is_atomic(clean) + ) + are_dirty <- dirty[!dirty %in% clean] + msg <- paste( + "Following columns are dirty:", + paste(are_dirty, collapse = ", "), + ifelse(!is.null(fname), paste("Filename:", fname), ""), + sep = "\n" + ) + assertthat::assert_that(length(are_dirty) == 0, msg = msg) +} + #' Print current timestamp for logging #' #' @return Current timestamp as character. diff --git a/man/dragen_cnv_metrics_read.Rd b/man/dragen_cnv_metrics_read.Rd index c6e7281..c347067 100644 --- a/man/dragen_cnv_metrics_read.Rd +++ b/man/dragen_cnv_metrics_read.Rd @@ -15,3 +15,9 @@ Tibble with metrics. \description{ Reads the \code{cnv_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_cnv_metrics_read(x) +} + +} diff --git a/man/dragen_coverage_metrics_read.Rd b/man/dragen_coverage_metrics_read.Rd index 96fd3ef..dc88de2 100644 --- a/man/dragen_coverage_metrics_read.Rd +++ b/man/dragen_coverage_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{coverage_metrics.csv} file generated by DRAGEN. } +\examples{ +\dontrun{ +dragen_coverage_metrics_read(x) +} +} diff --git a/man/dragen_gc_metrics_read.Rd b/man/dragen_gc_metrics_read.Rd index fc53959..f68f4a0 100644 --- a/man/dragen_gc_metrics_read.Rd +++ b/man/dragen_gc_metrics_read.Rd @@ -15,3 +15,9 @@ Tibble with metrics. \description{ Reads the \code{gc_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_gc_metrics_read(x) +} + +} diff --git a/man/dragen_mapping_metrics_read.Rd b/man/dragen_mapping_metrics_read.Rd index 223f0a7..a2ff466 100644 --- a/man/dragen_mapping_metrics_read.Rd +++ b/man/dragen_mapping_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{mapping_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_mapping_metrics_read(x) +} +} diff --git a/man/dragen_sv_metrics_read.Rd b/man/dragen_sv_metrics_read.Rd index 50577e0..0150f5d 100644 --- a/man/dragen_sv_metrics_read.Rd +++ b/man/dragen_sv_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{sv_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_sv_metrics_read(x) +} +} diff --git a/man/dragen_trimmer_metrics_read.Rd b/man/dragen_trimmer_metrics_read.Rd index 806a86f..8a58398 100644 --- a/man/dragen_trimmer_metrics_read.Rd +++ b/man/dragen_trimmer_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{trimmer_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_trimmer_metrics_read(x) +} +} diff --git a/man/dragen_umi_metrics_read.Rd b/man/dragen_umi_metrics_read.Rd index ca80a82..16f57af 100644 --- a/man/dragen_umi_metrics_read.Rd +++ b/man/dragen_umi_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{umi_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_umi_metrics_read(x) +} +} diff --git a/man/dragen_vc_metrics_read.Rd b/man/dragen_vc_metrics_read.Rd index 9bd4dae..e0e904e 100644 --- a/man/dragen_vc_metrics_read.Rd +++ b/man/dragen_vc_metrics_read.Rd @@ -15,3 +15,8 @@ Tibble with metrics. \description{ Reads the \code{vc_metrics.csv} file output from DRAGEN. } +\examples{ +\dontrun{ +dragen_vc_metrics_read(x) +} +} From 94a48578adc3d86c0e6b68a3435dcf83818feca4 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 23:56:58 +1100 Subject: [PATCH 16/17] Wf writer: output prefix --- R/Wf.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/Wf.R b/R/Wf.R index 4f23fc6..9d35956 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -209,6 +209,7 @@ Wf <- R6::R6Class( #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). write = function(x, outdir = NULL, prefix = NULL, format = "tsv", drid = NULL) { assertthat::assert_that(!is.null(prefix)) + assertthat::assert_that(all(c("name", "data") %in% colnames(x))) if (!is.null(outdir)) { prefix <- file.path(outdir, prefix) } @@ -219,7 +220,7 @@ Wf <- R6::R6Class( out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid)) ) |> dplyr::ungroup() |> - dplyr::select("name", "data") + dplyr::select("name", "data", prefix = "p") invisible(d_write) } ) # end public From d3af9e84a6c5a3d6ec944d91b37ddf99dfbc26b8 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Fri, 18 Oct 2024 23:58:04 +1100 Subject: [PATCH 17/17] tsov2: incorporate dragenCaller R6 object --- R/tsov2.R | 99 +++++++++++++-------------- man/Wf_tso_ctdna_tumor_only_v2.Rd | 20 ++++-- man/dtw_Wf_tso_ctdna_tumor_only_v2.Rd | 13 ++++ 3 files changed, 77 insertions(+), 55 deletions(-) diff --git a/R/tsov2.R b/R/tsov2.R index 8ec5175..1a08dd9 100644 --- a/R/tsov2.R +++ b/R/tsov2.R @@ -14,11 +14,21 @@ #' prefix <- "L2401290" #' t1 <- Wf_tso_ctdna_tumor_only_v2$new(path = p, prefix = prefix) #' t1$list_files(max_files = 100) +#' t1$dragenObj$list_files(max_files = 100) #' t1$list_files_filter_relevant(max_files = 300) -#' d <- t1$download_files(max_files = 100, dryrun = F) -#' d_tidy <- t1$tidy_files(d) -#' d_write <- t1$write( -#' d_tidy, +#' t1$dragenObj$list_files_filter_relevant(max_files = 300) +#' d1 <- t1$download_files(max_files = 100, dryrun = F) +#' d2 <- t1$dragenObj$download_files(max_files = 100, dryrun = F) +#' d1_tidy <- t1$tidy_files(d1) +#' d2_tidy <- t1$dragenObj$tidy_files(d2) +#' d_write1 <- t1$write( +#' d1_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = prefix, +#' format = "tsv" +#' ) +#' d_write2 <- t1$dragenObj$write( +#' d2_tidy, #' outdir = file.path(p, "dracarys_tidy"), #' prefix = prefix, #' format = "tsv" @@ -54,6 +64,8 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( public = list( #' @field prefix The LibraryID prefix of the tumor sample (needed for path lookup). prefix = NULL, + #' @field dragenObj dragen object. + dragenObj = NULL, #' @description Create a new Wf_tso_ctdna_tumor_only_v2 object. #' @param path Path to directory with raw workflow results (from S3 or #' local filesystem). @@ -64,6 +76,7 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( res <- glue("Results/{pref}") li <- "Logs_Intermediates" dc <- glue("{li}/DragenCaller/{pref}") + self$dragenObj <- Wf_dragen$new(path = file.path(path, dc), prefix = glue("{dc}/{prefix}")) # Results reg1 <- tibble::tribble( ~regex, ~fun, @@ -73,7 +86,7 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( glue("{res}/{pref}\\.gene_cov_report\\.tsv$"), "cvgrepg", glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz$"), "hardfilt", glue("{res}/{pref}\\.hard-filtered\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY", - glue("{res}/{pref}\\.microsat_output\\.json$"), "msi", + # glue("{res}/{pref}\\.microsat_output\\.json$"), "msi", # in DragenCaller glue("{res}/{pref}\\.tmb.trace\\.tsv$"), "tmbt", glue("{res}/{pref}_CombinedVariantOutput\\.tsv$"), "cvo", glue("{res}/{pref}_Fusions\\.csv$"), "fus", @@ -81,50 +94,11 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( # glue("{res}/{pref}_SmallVariants_Annotated\\.json\\.gz$"), "DOWNLOAD_ONLY", glue("{li}/SampleAnalysisResults/{pref}_SampleAnalysisResults\\.json$"), "sar" ) - # DragenCaller - reg2 <- tibble::tribble( - ~regex, ~fun, - glue("{dc}/{pref}\\-replay\\.json$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.cnv_metrics.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_contig_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.exon_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.fastqc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.fragment_length_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.gc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.gvcf_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.mapping_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.microsat_diffs\\.txt$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.microsat_output\\.json$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.sv_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_contig_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.target_bed_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.time_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_contig_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.tmb_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.trimmer_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.umi_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.vc_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_contig_mean_cov\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_coverage_metrics\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_fine_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_hist\\.csv$"), "DOWNLOAD_ONLY", - glue("{dc}/{pref}\\.wgs_overall_mean_cov\\.csv$"), "DOWNLOAD_ONLY" - ) - regexes <- dplyr::bind_rows(reg1, reg2) |> + regexes <- reg1 |> dplyr::mutate( fun = paste0("read_", .data$fun), fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun) ) - super$initialize(path = path, wname = wname, regexes = regexes) self$prefix <- prefix }, @@ -225,6 +199,19 @@ Wf_tso_ctdna_tumor_only_v2 <- R6::R6Class( #' #' @examples #' \dontrun{ +#' #---- Local ----# +#' p <- file.path( +#' "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", +#' "analysis/cttsov2/20240915ff0295ed" +#' ) +#' prefix <- "L2401290" +#' outdir <- file.path(p, "dracarys_tidy") +#' d <- dtw_Wf_tso_ctdna_tumor_only_v2( +#' path = p, prefix = prefix, outdir = outdir, +#' format = "tsv", +#' dryrun = F +#' ) +#' #' p <- file.path( #' "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", #' "analysis/cttsov2/20240915ff0295ed" @@ -244,18 +231,28 @@ dtw_Wf_tso_ctdna_tumor_only_v2 <- function(path, prefix, outdir, max_files = 1000, dryrun = FALSE) { obj <- Wf_tso_ctdna_tumor_only_v2$new(path = path, prefix = prefix) - d_dl <- obj$download_files( + d_dl1 <- obj$download_files( + outdir = outdir, max_files = max_files, dryrun = dryrun + ) + d_dl2 <- obj$dragenObj$download_files( outdir = outdir, max_files = max_files, dryrun = dryrun ) if (!dryrun) { - d_tidy <- obj$tidy_files(d_dl) - d_write <- obj$write( - d_tidy, + d_tidy1 <- obj$tidy_files(d_dl1) + d_tidy2 <- obj$dragenObj$tidy_files(d_dl2) + d_write1 <- obj$write( + d_tidy1, + outdir = outdir_tidy, + prefix = prefix, + format = format + ) + d_write2 <- obj$dragenObj$write( + d_tidy2, outdir = outdir_tidy, prefix = prefix, format = format ) - return(d_write) + return(dplyr::bind_rows(d_write1, d_write2)) } - return(d_dl) + return(dplyr::bind_rows(d_dl1, d_dl2)) } diff --git a/man/Wf_tso_ctdna_tumor_only_v2.Rd b/man/Wf_tso_ctdna_tumor_only_v2.Rd index 55e998d..fb84a29 100644 --- a/man/Wf_tso_ctdna_tumor_only_v2.Rd +++ b/man/Wf_tso_ctdna_tumor_only_v2.Rd @@ -17,11 +17,21 @@ p <- file.path( prefix <- "L2401290" t1 <- Wf_tso_ctdna_tumor_only_v2$new(path = p, prefix = prefix) t1$list_files(max_files = 100) +t1$dragenObj$list_files(max_files = 100) t1$list_files_filter_relevant(max_files = 300) -d <- t1$download_files(max_files = 100, dryrun = F) -d_tidy <- t1$tidy_files(d) -d_write <- t1$write( - d_tidy, +t1$dragenObj$list_files_filter_relevant(max_files = 300) +d1 <- t1$download_files(max_files = 100, dryrun = F) +d2 <- t1$dragenObj$download_files(max_files = 100, dryrun = F) +d1_tidy <- t1$tidy_files(d1) +d2_tidy <- t1$dragenObj$tidy_files(d2) +d_write1 <- t1$write( + d1_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = prefix, + format = "tsv" +) +d_write2 <- t1$dragenObj$write( + d2_tidy, outdir = file.path(p, "dracarys_tidy"), prefix = prefix, format = "tsv" @@ -58,6 +68,8 @@ d_write <- t2$write( \if{html}{\out{
}} \describe{ \item{\code{prefix}}{The LibraryID prefix of the tumor sample (needed for path lookup).} + +\item{\code{dragenObj}}{dragen object.} } \if{html}{\out{
}} } diff --git a/man/dtw_Wf_tso_ctdna_tumor_only_v2.Rd b/man/dtw_Wf_tso_ctdna_tumor_only_v2.Rd index 2a8ebf7..5d9423f 100644 --- a/man/dtw_Wf_tso_ctdna_tumor_only_v2.Rd +++ b/man/dtw_Wf_tso_ctdna_tumor_only_v2.Rd @@ -39,6 +39,19 @@ Downloads files from the \code{tso_ctdna_tumor_only_v2} workflow and writes them } \examples{ \dontrun{ +#---- Local ----# +p <- file.path( + "~/s3/pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", + "analysis/cttsov2/20240915ff0295ed" +) +prefix <- "L2401290" +outdir <- file.path(p, "dracarys_tidy") +d <- dtw_Wf_tso_ctdna_tumor_only_v2( + path = p, prefix = prefix, outdir = outdir, + format = "tsv", + dryrun = F +) + p <- file.path( "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production", "analysis/cttsov2/20240915ff0295ed"