diff --git a/.Rbuildignore b/.Rbuildignore index f48efd1..f24fedd 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,7 @@ ^scripts$ ^doc$ ^Meta$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ +^tests/testthat.R$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1f8f6db..722d601 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,19 +4,18 @@ name: R-CMD-check jobs: R-CMD-check: - runs-on: macOS-latest + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@master - - uses: r-lib/actions/setup-pandoc@master - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck", "knitr")) - deps <- remotes::dev_package_deps(dependencies = TRUE) - install.packages(deps$package[!is.na(deps$available)]) - if (!requireNamespace("BiocManager", quietly = TRUE)) {install.packages("BiocManager")} - BiocManager::install(deps$package[is.na(deps$available)]) - shell: Rscript {0} - - name: Check - run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") - shell: Rscript {0} + - uses: actions/checkout@v3 + - uses: r-lib/actions/setup-r@v2 + - uses: r-lib/actions/setup-pandoc@v2 + with: + pandoc-version: '2.17.1' + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck devtools + needs: check + - uses: r-lib/actions/check-r-package@v2 + with: + args: 'c("--no-manual")' + error-on: '"error"' diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..ed7650c --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..3e6c483 --- /dev/null +++ b/.lintr @@ -0,0 +1,2 @@ +linters: linters_with_defaults( + indentation_linter(hanging_indent_style="tidy")) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..0c8bc3c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,52 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-merge-conflict + - id: debug-statements + - id: mixed-line-ending + - id: detect-private-key + - id: check-case-conflict + - id: check-yaml + - id: trailing-whitespace + - repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.11.0 + hooks: + - id: markdownlint-cli2 + files: \.(md|qmd)$ + types: [file] + exclude: LICENSE.md + - id: markdownlint-cli2-fix + files: \.(md|qmd)$ + types: [file] + exclude: LICENSE.md + - repo: https://github.com/lorenzwalthert/precommit + rev: v0.3.2.9027 + hooks: + - id: style-files + name: style-files + description: style files with {styler} + entry: Rscript inst/hooks/exported/style-files.R + language: r + files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$' + exclude: | + (?x)^( + renv/activate\.R| + )$ + minimum_pre_commit_version: "2.13.0" + - id: parsable-R + name: parsable-R + description: check if a .R file is parsable + entry: Rscript inst/hooks/exported/parsable-R.R + language: r + files: '\.[rR](md)?$' + minimum_pre_commit_version: "2.13.0" + - id: lintr + args: [--warn_only] + name: lintr + description: check if a `.R` file is lint free (using {lintr}) + entry: Rscript inst/hooks/exported/lintr.R + language: r + files: '(\.[rR]profile|\.R|\.Rmd|\.Rnw|\.r|\.rmd|\.rnw)$' + exclude: 'renv/activate\.R' + minimum_pre_commit_version: "2.13.0" diff --git a/DESCRIPTION b/DESCRIPTION index 4450307..af2e748 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: genecovr Title: Gene body coverage analysis to evaluate genome assemblies -Version: 0.0.0.9013 -Authors@R: +Version: 0.1.0 +Authors@R: person(given = "Per", family = "Unneberg", role = c("aut", "cre"), @@ -14,9 +14,7 @@ License: GPL-3 Encoding: UTF-8 LazyData: true Imports: - BiocGenerics, BiocParallel, - Biostrings, GenomeInfoDb, GenomicRanges (>= 1.32.0), IRanges, @@ -36,4 +34,5 @@ Suggests: VignetteBuilder: knitr Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.3 +URL: https://nbisweden.github.io/genecovr/ diff --git a/README.Rmd b/README.Rmd index d6dd96d..c030cb7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,8 +19,15 @@ knitr::opts_chunk$set( [![R build status](https://github.com/NBISweden/genecovr/workflows/R-CMD-check/badge.svg)](https://github.com/NBISweden/genecovr/actions) -Perform gene body coverage analyses in R to evaluate genome assembly -quality. +`genecovr` is an `R` package that provides plotting functions that +summarize gene transcript to genome alignments. The main purpose is to +assess the effect of polishing and scaffolding operations has on the +quality of a genome assembly. The gene transcript set is a large +sequence set consisting of assembled transcripts from RNA-seq data +generated in relation to a genome assembly project. Therefore, +`genecovr` serves as a complement to software such as +[BUSCO](https://busco.ezlab.org/), which evaluates genome assembly +quality using a smaller set of well-defined single-copy orthologs. ## Installation @@ -28,12 +35,14 @@ You can install the released version of genecovr from [NBIS GitHub](https://github.com/nbis) with: ``` r +# If necessary, uncomment to install devtools # install.packages("devtools") devtools::install_github("NBISweden/genecovr") ``` +## Usage -## Quick usage +### genecovr script quick start There is a helper script for generating basic plots located in PACKAGE_DIR/bin/genecovr. Create a data input csv-delimited file with @@ -52,8 +61,67 @@ script to generate plots: PACKAGE_DIR/bin/genecovr indata.csv ``` -## Vignette +#### Example -Alternatively, import the library as usual in an R script and use the -package functions. See the vignette for a minimum working example. +There are example files located in PACKAGE_DIR/inst/extdata consisting +of two psl alignment files containing gmap alignments and fasta +indices for the transcript sequences and two for different assembly +versions: +- nonpolished.fai - fasta index for raw assembly +- polished.fai - fasta index for polished assembly +- transcripts.fai - fasta index for transcript sequences +- transcripts2nonpolished.psl - gmap alignments, transcripts to raw assembly +- transcripts2polished.psl - gmap alignments, transcripts to polished + assembly + +Using these files and the labels `non` and `pol` for the different +assemblies, a `genecovr` input file (called e.g., `assemblies.csv`) +would look as follows: + +``` +nonpol,transcripts2nonpolished.psl,nonpolished.fai,transcripts.fai +pol,transcripts2polished.psl,polished.fai,transcripts.fai +``` + +and the command to run would be: + +``` +genecovr assemblies.csv +``` + +#### genecovr options + +To list genecovr script options, type 'genecovr -h`: + +``` +usage: genecovr [-h] [-v] [-p number] + [-d OUTPUT_DIRECTORY] [--height HEIGHT] + [--width WIDTH] + csvfile + +positional arguments: + csvfile csv-delimited file with columns + 1. data label + 2. mapping file (supported formats: psl) + 3. assembly file (fasta or fasta index) + 4. transcript file (fasta or fasta index) + +optional arguments: + -h, --help show this help message and exit + -v, --verbose print extra output + -p number, --cpus number + number of cpus [default 1] + -d OUTPUT_DIRECTORY, --output-directory OUTPUT_DIRECTORY + output directory + --height HEIGHT figure height in inches [default 6.0] + --width WIDTH figure width in inches [default 6.0] +``` + + + +### R package vignette + +Alternatively, import the library in an R script and use the package +functions. See [Get started](articles/genecovr.html) or run +`vignette("genecovr")` for a minimum working example. diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..ed7e8b5 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,9 @@ +url: https://nbisweden.github.io/genecovr/ +template: + bootstrap: 5 + +reference: + - title: genecovr + - contents: + - matches(".*") + diff --git a/inst/bin/genecovr b/inst/bin/genecovr index ecb20f5..8abe49e 100755 --- a/inst/bin/genecovr +++ b/inst/bin/genecovr @@ -131,6 +131,7 @@ apl <- AlignmentPairsList( seqinfo.query=transcripts.sinfo[[x]]) }, BPPARAM=bpparam) ) + names(apl) <- names(psl.fn) ##------------------------------ @@ -183,8 +184,9 @@ save_plot(p, outfile) ## FIXME: number of levels should be parametrized via option +suppressPackageStartupMessages(library(dplyr)) outfile <- file.path(outdir, "qnuminsert") -x <- insertionSummary(apl, reduce=FALSE) +x <- dplyr::tibble(insertionSummary(apl, reduce=FALSE)) p <- ggplot(x, aes(id)) + geom_bar(aes(fill=cuts)) + scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) @@ -199,9 +201,8 @@ message("saving ", outfile) write.csv(x, file=gzfile(outfile), row.names=FALSE) ## Also make plot based on gbc -suppressPackageStartupMessages(library(dplyr)) outfile <- file.path(outdir, "qnuminsert.gbc") -x <- insertionSummary(apl) +x <- dplyr::tibble(insertionSummary(apl)) p <- ggplot(x, aes(id)) + geom_bar(aes(fill=cuts)) + scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) @@ -211,6 +212,10 @@ save_plot(p, outfile) ##-------------------- ## Save gbc data ##-------------------- +x$revmap <- as.character(x$revmap) +x$hitCoverage <- as.character(x$hitCoverage) +x$hitStart <- as.character(x$hitStart) +x$hitEnd <- as.character(x$hitEnd) outfile <- file.path(outdir, "gbcdata.tsv.gz") message("saving ", outfile) write.table(x, file=gzfile(outfile), row.names=FALSE, sep="\t") @@ -313,11 +318,13 @@ p <- ggplot(data=subset(data, n.subjects>1), outfile <- file.path(outdir, paste0("depth_breadth_seqlengths.mm", mm)) save_plot(p, outfile) - +data$revmap <- as.character(data$revmap) +data$hitCoverage <- as.character(data$hitCoverage) +data$hitStart <- as.character(data$hitStart) +data$hitEnd <- as.character(data$hitEnd) outfile <- file.path(outdir, "gene_body_coverage.csv.gz") message("saving ", outfile) -write.csv(data, gzfile(outfile), row.names=FALSE) - +write.csv(dplyr::tibble(data), gzfile(outfile), row.names=FALSE) ############################## ## Save Rdata of analysis diff --git a/man/geneBodyCoverage.Rd b/man/geneBodyCoverage.Rd index 063aa6d..0d39d41 100644 --- a/man/geneBodyCoverage.Rd +++ b/man/geneBodyCoverage.Rd @@ -37,13 +37,15 @@ breadthOfCoverage indicates the multiplicity of the query. For queries that map multiple subjects, a value close to 1 indicates the query has been split in several subjects, whereas a higher value indicates sequence duplication at the -subject level.\preformatted{The revmap column maps the output ranges to the input ranges +subject level. + +\if{html}{\out{
}}\preformatted{The revmap column maps the output ranges to the input ranges as lists of numerical ids. These ids can be used to retrieve the corresponding AlignmentPairs ranges providing a link to the subjects. The hitCoverage lists the width of each reduced hit, and hitStart and hitEnd provide the transcript coordinates of these hits. -} +}\if{html}{\out{
}} } \description{ \code{geneBodyCoverage(obj, ...)}: calculate gene body coverage from input @@ -60,7 +62,9 @@ being that if there are >1 overlaps for a seqname, the sequence is associated with multiple distinct regions in the subject sequence. Thus, the unit of interest here is the query, and the function seeks to identify reduced disjoint -regions of a query sequence that map to a subject.\preformatted{The function returns a GRanges object with reduced regions, +regions of a query sequence that map to a subject. + +\if{html}{\out{
}}\preformatted{The function returns a GRanges object with reduced regions, i.e. overlaps are merged. Information about matches and mismatches is currently dropped as it usually is not possible to infer where mismatches occur. Instead, the data column @@ -74,5 +78,5 @@ matches to the width of the region. Since the association between query and subject regions is removed, the return value is a GRanges object consisting of the reduced query ranges with a revmap and coverage attribute. -} +}\if{html}{\out{
}} } diff --git a/man/genecovr-package.Rd b/man/genecovr-package.Rd index 89c0c0c..d87372b 100644 --- a/man/genecovr-package.Rd +++ b/man/genecovr-package.Rd @@ -19,6 +19,13 @@ measurement unit is a gene or transcript model. autoplot: plot GRanges objects with gene body coverage metrics } +\seealso{ +Useful links: +\itemize{ + \item \url{https://nbisweden.github.io/genecovr/} +} + +} \author{ \strong{Maintainer}: Per Unneberg \email{per.unneberg@nbis.se} (\href{https://orcid.org/0000-0001-5735-3315}{ORCID}) diff --git a/man/reduceHitCoverage.Rd b/man/reduceHitCoverage.Rd index 1636aa1..d4f693b 100644 --- a/man/reduceHitCoverage.Rd +++ b/man/reduceHitCoverage.Rd @@ -27,7 +27,9 @@ being that if there are >1 overlaps for a seqname, the sequence is associated with multiple distinct regions in the subject sequence. Thus, the unit of interest here is the query, and the function seeks to identify reduced disjoint -regions of a query sequence that map to a subject.\preformatted{The function returns a GRanges object with reduced regions, +regions of a query sequence that map to a subject. + +\if{html}{\out{
}}\preformatted{The function returns a GRanges object with reduced regions, i.e. overlaps are merged. Information about matches and mismatches is currently dropped as it usually is not possible to infer where mismatches occur. Instead, the data column @@ -41,7 +43,7 @@ matches to the width of the region. Since the association between query and subject regions is removed, the return value is a GRanges object consisting of the reduced query ranges with a revmap and coverage attribute. -} +}\if{html}{\out{
}} } \examples{ ranges <- IRanges::IRanges( diff --git a/tests/testthat/test-genecovr.R b/tests/testthat/test-genecovr.R index 1114979..19241d6 100644 --- a/tests/testthat/test-genecovr.R +++ b/tests/testthat/test-genecovr.R @@ -1,29 +1,37 @@ -skip("Skip until resolve issue of testing package binary") +skip("Skip on github until resolve issue of testing package binary") pkg_path <- getwd() tmp <- tempdir(check = TRUE) withr::local_dir(tmp) withr::local_temp_libpaths() withr::local_path(file.path(.libPaths()[1], "genecovr", "bin")) -devtools::install(pkg=pkg_path, quick = TRUE, upgrade = "never") +devtools::install(pkg = pkg_path, quick = TRUE, upgrade = "never") create_file <- function(fn) system.file("extdata", fn, package = "genecovr") data <- as.data.frame(rbind( - c("nonpol", create_file("transcripts2nonpolished.psl"), - create_file("nonpolished.fai"), create_file("transcripts.fai")), - c("pol", create_file("transcripts2polished.psl"), - create_file("polished.fai"), create_file("transcripts.fai")) + c( + "nonpol", create_file("transcripts2nonpolished.psl"), + create_file("nonpolished.fai"), create_file("transcripts.fai") + ), + c( + "pol", create_file("transcripts2polished.psl"), + create_file("polished.fai"), create_file("transcripts.fai") + ) )) colnames(data) <- NULL withr::local_file(write.csv(data, file = "assemblies.csv", row.names = FALSE)) test_that("genecovr runs as expected", { - system(paste(paste0("R_LIBS_USER=", .libPaths()[1]), - "genecovr", "assemblies.csv")) - expect_equal(length(list.files(pattern = "*.png")), 13) - expect_equal(length(grep("Rplots", list.files(pattern = "*.pdf"), - invert = TRUE)), 13) - expect_equal(length(list.files(pattern = "*.csv.gz")), 4) + system(paste( + paste0("R_LIBS_USER=", .libPaths()[1]), + "genecovr", "assemblies.csv" + )) + expect_equal(length(list.files(pattern = "*.png")), 15) + expect_equal(length(grep("Rplots", list.files(pattern = "*.pdf"), + invert = TRUE + )), 15) + expect_equal(length(list.files(pattern = "*.csv.gz")), 4) + expect_equal(length(list.files(pattern = "*.tsv.gz")), 1) }) -withr::defer(unlink(dir, recursive = TRUE)) +withr::defer(unlink(tmp, recursive = TRUE)) diff --git a/vignettes/genecovr.Rmd b/vignettes/genecovr.Rmd index 19d4404..5966487 100644 --- a/vignettes/genecovr.Rmd +++ b/vignettes/genecovr.Rmd @@ -4,7 +4,7 @@ author: "Per Unneberg" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Vignette Title} + %\VignetteIndexEntry{Gene body coverage analysis in R} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} biblio-style: plain @@ -15,21 +15,24 @@ bibliography: bibliography.bib This vignette describes analyses of gene body coverage and other genome assembly evaluation metrics with in R using the `genecovr` -package. `ripr` contains functionality for parsing alignment files, -calculating gene body coverages, and generating simple QC metrics to -assess assembly quality output. Before we start with the example -analysis, we describe how `genecovr` represents pairwise alignments. +package. `genecovr` contains functionality for parsing alignment +files, calculating gene body coverages, and generating simple QC +metrics to assess assembly quality output. Before we start with the +example analysis, we describe how `genecovr` represents pairwise +alignments. ## R setup ```{r knitr-setup, echo=FALSE, cache=FALSE} library(knitr) -knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE, - fig.width=8, fig.height=6, autodep=TRUE, - cache=FALSE, include=TRUE, eval=TRUE, tidy=FALSE, - dev=c('png')) +knitr::opts_chunk$set( + echo = TRUE, warning = FALSE, message = FALSE, + fig.width = 8, fig.height = 6, autodep = TRUE, + cache = FALSE, include = TRUE, eval = TRUE, tidy = FALSE, + dev = c("png") +) knitr::knit_hooks$set(inline = function(x) { - prettyNum(x, big.mark=" ") + prettyNum(x, big.mark = " ") }) ``` @@ -45,9 +48,10 @@ library(rlang) ```{r ggplot-theme, cache=FALSE} library(viridis) library(RColorBrewer) -bw <- theme_bw(base_size=18) %+replace% theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +bw <- theme_bw(base_size = 18) %+replace% + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) theme_set(bw) -color.pal.4 <- brewer.pal(name = "Paired", n = 4) +color_pal_4 <- brewer.pal(name = "Paired", n = 4) psize <- 3 ``` @@ -77,29 +81,44 @@ the psl files with `readPsl`, storing the results in an `AlignmentPairsList` for convenience. ``` {r gbc-load-data} -assembly.fai.fn <- list( - nonpol = system.file("extdata", "nonpolished.fai", package="genecovr"), - pol = system.file("extdata", "polished.fai", package="genecovr") +assembly_fai_fn <- list( + nonpol = system.file("extdata", "nonpolished.fai", + package = "genecovr" + ), + pol = system.file("extdata", "polished.fai", + package = "genecovr" + ) ) -transcripts.fai.fn <- list( - nonpol = system.file("extdata", "transcripts.fai", package="genecovr"), - pol = system.file("extdata", "transcripts.fai", package="genecovr") +transcripts_fai_fn <- list( + nonpol = system.file("extdata", "transcripts.fai", + package = "genecovr" + ), + pol = system.file("extdata", "transcripts.fai", + package = "genecovr" + ) ) -assembly.sinfo <- endoapply(assembly.fai.fn, readFastaIndex) -transcripts.sinfo <- endoapply(transcripts.fai.fn, readFastaIndex) +assembly_sinfo <- endoapply(assembly_fai_fn, readFastaIndex) +transcripts_sinfo <- endoapply(transcripts_fai_fn, readFastaIndex) -psl.fn <- list( - nonpol = system.file("extdata", "transcripts2nonpolished.psl", package="genecovr"), - pol = system.file("extdata", "transcripts2polished.psl", package="genecovr") +psl_fn <- list( + nonpol = system.file("extdata", "transcripts2nonpolished.psl", + package = "genecovr" + ), + pol = system.file("extdata", "transcripts2polished.psl", + package = "genecovr" + ) ) apl <- AlignmentPairsList( - lapply(names(psl.fn), function(x) { - readPsl(psl.fn[[x]], seqinfo.sbjct=assembly.sinfo[[x]], seqinfo.query=transcripts.sinfo[[x]]) - }) + lapply(names(psl_fn), function(x) { + readPsl(psl_fn[[x]], + seqinfo.sbjct = assembly_sinfo[[x]], + seqinfo.query = transcripts_sinfo[[x]] + ) + }) ) -names(apl) <- names(psl.fn) +names(apl) <- names(psl_fn) ``` ## Plot ratio matches to width of alignment regions @@ -108,7 +127,8 @@ We first plot the ratio of matches to width of alignments with respect to the transcripts. ``` {r gbc-plot-alignment-width} -plot(apl, aes(x=id, y=matches/query.width, fill=id), which="violin") + ylim(0.8, 1) + scale_fill_viridis_d() +plot(apl, aes(x = id, y = matches / query.width, fill = id), which = "violin") + + ylim(0.8, 1) + scale_fill_viridis_d() ``` There is a clear shift to higher percentage matches in the polished @@ -121,15 +141,18 @@ We can also select multiple columns to plot in the ```{r gbc-plot-match-indel} cnames <- c("misMatches", "query.NumInsert", "query.BaseInsert") -plot(apl, aes(x=id, y=get_expr(enquo(cnames))), which="violin") + facet_wrap(. ~ name, scales="free") +plot(apl, aes(x = id, y = get_expr(enquo(cnames))), which = "violin") + + facet_wrap(. ~ name, scales = "free") ``` ```{r gbc-plot-match-indel-boxplot} -plot(apl, aes(x=id, y=get_expr(enquo(cnames))), which="boxplot") + facet_wrap(. ~ name, scales="free") +plot(apl, aes(x = id, y = get_expr(enquo(cnames))), which = "boxplot") + + facet_wrap(. ~ name, scales = "free") ``` ```{r gbc-plot-match-indel-log10-boxplot} -plot(apl, aes(x=id, y=get_expr(enquo(cnames))), which="boxplot") + facet_wrap(. ~ name, scales="free") + scale_y_continuous(trans="log10") +plot(apl, aes(x = id, y = get_expr(enquo(cnames))), which = "boxplot") + + facet_wrap(. ~ name, scales = "free") + scale_y_continuous(trans = "log10") ``` ## Plot number of indels @@ -144,10 +167,10 @@ consequence of this is that as a transcript may be split in multiple alignments, the bars are of unequal height. ```{r gbc-plot-qnuminsert} -x <- insertionSummary(apl, reduce=FALSE) +x <- insertionSummary(apl, reduce = FALSE) ggplot(x, aes(id)) + - geom_bar(aes(fill=cuts)) + - scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) + geom_bar(aes(fill = cuts)) + + scale_fill_viridis_d(name = "qNumInsert", begin = 1, end = 0) ``` An alternative is to summarize the number of insertions over a @@ -159,52 +182,55 @@ the fewest number of insertions. ```{r gbc-plot-qnuminsert-gbc} x <- insertionSummary(apl) ggplot(x, aes(id)) + - geom_bar(aes(fill=cuts)) + - scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) + geom_bar(aes(fill = cuts)) + + scale_fill_viridis_d(name = "qNumInsert", begin = 1, end = 0) ``` - - ## Gene body coverage - The function `geneBodyCoverage` takes an `AlignmentPairs` object and summarizes breadth of coverage and number of subject hits per transcript. ``` {r gbc-make-gene-body-coverage} -gbc <- lapply(apl, geneBodyCoverage, min.match=0.1) +gbc <- lapply(apl, geneBodyCoverage, min.match = 0.1) ``` A summary can be obtained with the `summarizeGeneBodyCoverage` function. We define a range of minimum match hit cutoffs to filter out -hits with too few matches in the aligned region. +hits with too few matches in the aligned region. ``` {r gbc-summarize-gene-body-coverage} min.match <- c(0.25, 0.5, 0.75, 0.9) names(min.match) <- min.match -gbc.summary <- lapply(apl, function(x) { - y <- do.call("rbind", lapply(min.match, function(mm) { - summarizeGeneBodyCoverage(x, min.match=mm) - })) - }) +gbc_summary <- lapply(apl, function(x) { + y <- do.call("rbind", lapply(min.match, function(mm) { + summarizeGeneBodyCoverage(x, min.match = mm) + })) +}) ``` We combine the data ``` {r gbc-plot-gene-body-coverage-combine-data} library(dplyr) -data <- dplyr::bind_rows(lapply(gbc.summary, data.frame), .id="dataset") +data <- dplyr::bind_rows(lapply(gbc_summary, data.frame), .id = "dataset") ``` + and plot the resulting coverages ```{r gbc-plot-gene-body-coverage} h <- max(data$total) -hmax <- ceiling(h/100) * 100 -ggplot(subset(data, min.match==0.25), aes(x=min.coverage, y=count, group=dataset, color=dataset)) + - geom_abline(slope=0, intercept=h) + - geom_point(aes(shape=dataset, color=dataset), size=psize) + geom_line() + - scale_color_viridis_d() + scale_y_continuous(breaks=c(pretty(data$count)), limits=c(0, hmax)) +hmax <- ceiling(h / 100) * 100 +ggplot( + subset(data, min.match == 0.25), + aes(x = min.coverage, y = count, group = dataset, color = dataset) +) + + geom_abline(slope = 0, intercept = h) + + geom_point(aes(shape = dataset, color = dataset), size = psize) + + geom_line() + + scale_color_viridis_d() + + scale_y_continuous(breaks = c(pretty(data$count)), limits = c(0, hmax)) ``` ## Number of contigs per transcript @@ -214,17 +240,25 @@ contigs. We calculate the number of subjects by coverage cutoff with the function `countSubjectsByCoverage` ```{r gbc-ncontigs-per-transcript} -data <- dplyr::bind_rows(lapply( - lapply(apl, countSubjectsByCoverage), - data.frame), - .id="dataset") - +data <- dplyr::bind_rows( + lapply( + lapply(apl, countSubjectsByCoverage), + data.frame + ), + .id = "dataset" +) ``` and plot the results ```{r gbc-plot-ncontigs-per-transcript} -ggplot(data=data, aes(x=factor(min.coverage), y=Freq, fill=n.subjects)) + geom_bar(stat='identity', position=position_stack()) + scale_fill_viridis_d(begin=1, end=0) + facet_wrap(. ~ dataset, nrow=1, labeller=label_both) +ggplot(data = data, aes( + x = factor(min.coverage), + y = Freq, fill = n.subjects +)) + + geom_bar(stat = "identity", position = position_stack()) + + scale_fill_viridis_d(begin = 1, end = 0) + + facet_wrap(. ~ dataset, nrow = 1, labeller = label_both) ``` ## Duplicated versus split transcripts @@ -238,33 +272,61 @@ of coverage against length-normalized coverage. First combine the data: ```{r gbc-depth-by-breadth-data} -data <- dplyr::bind_rows(lapply( - lapply(apl, geneBodyCoverage), - data.frame), - .id="dataset") +data <- dplyr::bind_rows( + lapply( + lapply(apl, geneBodyCoverage), + data.frame + ), + .id = "dataset" +) ``` -and plot the ratio depthOfCoverage / breadthOfCoverage against length-normalized coverage +and plot the ratio depthOfCoverage / breadthOfCoverage against +length-normalized coverage + ```{r gbc-plot-depth-by-breadth-normalized} -ggplot(data=data, aes(x=coverage, y=depthOfCoverage / breadthOfCoverage, color=factor(n.subjects))) + geom_point(size=psize) + scale_color_viridis_d(alpha=.8) + xlim(0, 1) + ylim(0, 5) + facet_wrap(. ~ dataset) +ggplot(data = data, aes( + x = coverage, y = depthOfCoverage / breadthOfCoverage, + color = factor(n.subjects) +)) + + geom_point(size = psize) + + scale_color_viridis_d(alpha = .8) + + xlim(0, 1) + + ylim(0, 5) + + facet_wrap(. ~ dataset) ``` Alternatively we can make a jitter plot of the depthOfCoverage by breadthOfCoverage against number of subjects. ```{r gbc-plot-depth-by-breadth-by-contigs} -ggplot(data=subset(data, n.subjects>1), aes(x=factor(dataset), y=depthOfCoverage / breadthOfCoverage)) + geom_jitter(size=psize, alpha=.6) + facet_wrap(. ~ n.subjects) +ggplot( + data = subset(data, n.subjects > 1), + aes(x = factor(dataset), y = depthOfCoverage / breadthOfCoverage) +) + + geom_jitter(size = psize, alpha = .6) + + facet_wrap(. ~ n.subjects) ``` A similar picture is obtained via a histogram plot. ```{r gbc-plot-depth-by-breadth-by-contigs-hist} -ggplot(data=subset(data, n.subjects>1), aes(depthOfCoverage/breadthOfCoverage)) + geom_histogram() + facet_grid(vars(dataset)) +ggplot( + data = subset(data, n.subjects > 1), + aes(depthOfCoverage / breadthOfCoverage) +) + + geom_histogram() + + facet_grid(vars(dataset)) ``` Finally, we assess whether there is a length bias in the ratio condition on the number of subjects per transcript. ```{r gbc-plot-seqlengths-by-contigs-hist} -ggplot(data=subset(data, n.subjects>1), aes(x=factor(dataset), y=seqlengths)) + geom_boxplot() + facet_grid(. ~ n.subjects) +ggplot( + data = subset(data, n.subjects > 1), + aes(x = factor(dataset), y = seqlengths) +) + + geom_boxplot() + + facet_grid(. ~ n.subjects) ```