diff --git a/.travis.scripts/builddox.sh b/.travis.scripts/builddox.sh deleted file mode 100644 index a9bf588..0000000 --- a/.travis.scripts/builddox.sh +++ /dev/null @@ -1 +0,0 @@ -#!/bin/bash diff --git a/.travis.scripts/coveralls.sh b/.travis.scripts/coveralls.sh deleted file mode 100755 index fcac1b5..0000000 --- a/.travis.scripts/coveralls.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# Note that this only works if the tests were built using --coverage for -# compile and link flags! -#if [ "$CXX" == "g++" ]; -#then - ##sudo pip install cpp-coveralls - cd test - ./configure --with-boost=${BOOST_ROOT} ###LDFLAGS=--coverage CXXFLAGS=--coverage - make - ./variant_test - #cpp-coveralls -r ../ -e examples -e doxy -e R -e rtdocs --verbose -t ${COVERALLS_TOKEN} -#fi diff --git a/.travis.scripts/gcc.sh b/.travis.scripts/gcc.sh deleted file mode 100644 index b52ae26..0000000 --- a/.travis.scripts/gcc.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/travis/gcc_4.9.1-1_amd64.deb -sudo apt-get remove cpp libffi-dev -sudo dpkg --install gcc_4.9.1-1_amd64.deb - -echo "BEGIN Eliminating old libstdc++" -sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++.a -sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++.la -sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++_s.a -sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++_sjlj_6.dll -sudo rm /usr/lib/gcc/x86_64-linux-gnu/4.6/libstdc++.a -sudo rm /usr/lib/gcc/x86_64-linux-gnu/4.6/libstdc++.so -sudo rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6 -sudo rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.16 -echo "END Eliminating old libstdc++" - -export LD_LIBRARY_PATH=/usr/lib64 -sudo ln -s /usr/lib64/libstd* /usr/lib/x86_64-linux-gnu/ diff --git a/.travis.scripts/travis-before-install.sh b/.travis.scripts/travis-before-install.sh deleted file mode 100755 index fe01094..0000000 --- a/.travis.scripts/travis-before-install.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#set -x -set -e -set -o pipefail - -if [ "${TRAVIS_OS_NAME}" = "osx" ]; then - brew update -fi - -if [ -n "${BOOST_VERSION}" ]; then - mkdir -p $BOOST_ROOT - wget --no-verbose --output-document=- \ - http://sourceforge.net/projects/boost/files/boost/${BOOST_VERSION}/boost_${BOOST_VERSION//./_}.tar.bz2/download \ - | tar jxf - --strip-components=1 -C "${BOOST_ROOT}" -fi diff --git a/.travis.scripts/travis-install.sh b/.travis.scripts/travis-install.sh deleted file mode 100755 index 1e1fcb0..0000000 --- a/.travis.scripts/travis-install.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -set -e -set -o pipefail - -#if [ -d "${BOOST_ROOT}" ]; then -# (cd "${BOOST_ROOT}" -# ./bootstrap.sh --with-libraries="${BOOST_LIBS}" -# ./b2 threading=multi --prefix="${BOOST_ROOT}" -d0 install -# ) -fi diff --git a/.travis.yml b/.travis.yml index 0456e14..0156bce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,6 @@ before_install: - sudo update-alternatives --config g++ - if [ "${CXX}" = "g++" ]; then export CXX="$(which g++-4.9)"; export CC="$(which gcc-4.9)"; fi - if [ "${TRAVIS_OS_NAME}" = "osx" -a "${CXX}" = "clang++" ]; then export CXX="$(which c++)"; export CC="$(which cc)"; fi -- .travis.scripts/travis-before-install.sh script: - ./configure && make diff --git a/README.md b/README.md index 5691ddd..dcaf906 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Quick Start =========== ``` ## using the included test BAM (HCC1143) -VariantBam/src/variant test/small.bam -g 'X:1,000,000-1,100,000' -r 
mapq[10,100] -c counts.tsv -o mini.bam -v
+VariantBam/src/variant test/small.bam -g 'X:1,000,000-1,100,000' --min-mapq 10 -c counts.tsv -o mini.bam -v
 
 ## get help
 VariantBam/src/variant --help
@@ -62,6 +62,9 @@ variant -L $rfile -o mini.bam -v
 ## extract high-quality clipped reads (where clip length account for low quality bases)
 variant --min-phred 4 --min-clip 5 -o mini.bam -v
 
+## extract reads with high mapq that also contain a large insertion or deletion
+variant --min-mapq 20 --min-ins 10 --min-del 10 -v -o mini.bam
+
 ## subsample to max-coverage. BAM must be sorted
 variant -m 100 -o mini.bam -v
 ```
@@ -72,20 +75,21 @@ Description
 VariantBam is a tool to extract/count specific sets of sequencing reads from next-generational sequencing
 files. To save money, disk space and I/O, one may not want to store an entire BAM on disk. In many cases, it
 would be more efficient to store only those read-pairs or reads who intersect some region around the variant locations.
 Alternatively, if your scientific question is focused on only one aspect of the data (e.g. breakpoints), many
-reads can be removed without losing the information relevant to the problem.
+reads can be removed without losing the information relevant to the problem, while enriching for the signal you are interested in.
 
 ##### Tool comparison
 
-VariantBam packages into a single executable a number of filtering features not easily found using ``samtools`` + ``awk``::
+VariantBam packages into a single executable a number of filtering features not easily found using ``samtools`` + ``awk``:
 
-> 1. Filter specifically on read clipping, orientation and insert size (all important for structural variation), while taking into account the per-base phred quality
-> 2. [Interval tree][ekg] to efficiently determine if a read or read mate overlaps a region
-> 3. Provide different rules for different arbitrarily-sized regions, and to provide these regions as common variant files (VCF, MAF, BED)
-> 4. Select reads by matching motifs against a large dictionary using [Aho-Corasick implementation][aho]
-> 5. Count reads that satisfy any number of user-defined properties
-> 6. Read and write CRAM files
-> 7. Selectively strip alignment tags
-> 8. Support for sub-sampling to obtain a BAM file with a coverage limit
+> 1. Filter specifically on read clipping, orientation and insert size (all important for structural variation)
+> 2. Support for considering only high-quality bases when determining read length or clip count
+> 3. [Interval tree][ekg] to efficiently determine if a read overlaps a region
+> 4. Ability to link reads to a genomic region if their mate intersects that region
+> 5. Provide different rules for different arbitrarily-sized regions, supplied as common variant files (VCF, MAF, BED)
+> 6. Select reads by matching motifs against a large dictionary using an [Aho-Corasick implementation][aho]
+> 7. Count reads that satisfy any number of user-defined properties
+> 8. Selectively strip alignment tags
+> 9. Support for sub-sampling to obtain a BAM file with a coverage limit
 
 VariantBam is implemented in C++ and uses [HTSlib][hlib], a highly optimized C library used as the core of [Samtools][samtools] and [BCFtools][bcf].
 
@@ -96,7 +100,7 @@ Examples
 ##### Example Use 1
 Whole-genome analysis has been conducted on a BAM, generating VCF and MAF files. Ideally, these regions could be manually inspected
-or reanalyzed without having to keep the entire BAM. Running VariantBam to extract only reads that overlap these events will allow
+or reanalyzed without having to keep the entire BAM. Running VariantBam to extract only read-pairs that overlap these events will allow
 these regions to be rapidly queried, without having to keep the full BAM record.
 ```
 ### Extract all read PAIRS that interset with a variant from a VCF
 variant $bam -l myvcf.vcf -o mini.bam
 ```
 ##### Example Use 2
 In situations where the sequencing or library preparation quality is low, it may be advantageous to remove poor
 quality reads before starting the analysis train. VariantBam handles this by optionally taking into
-account Phred base-qualities when making a decision whether to keep a sequencing read. For instance, one might
+account base-qualities when deciding whether to keep a sequencing read. For instance, one might
 only be interested in high quality MAPQ 0 or clipped reads. VariantBam can be
-setup to apply unique Phred filters to different regions or across the entire genome, all with one-pass.
+set up to apply unique base-quality filters to different regions or across the entire genome, all in one pass.
 ```
 ### Extract only high quality reads with >= 50 bases of phred >=4 and MAPQ >= 1 and not duplicated/hardclip/qcfail
 ### json
@@ -147,12 +151,12 @@ variant $bam -r example3.json
 ##### Example Use 4
 A user wants to profile a BAM for quality. They would like to count the number of clipped reads in a BAM
 file, so long as those reads have sufficient optical quality and mapping quality. VariantBam run with the -x flag for "counting only"
-will accomplish this. Let's try an example of this, just for part of chromsome 22
+will accomplish this. Let's try an example of this, just for chromosome 22
 ```
 ## example4.json
 {
    "example4": {
-     "region" : "22:50,000,000-51,304,566"
+     "region" : "22",
      "rules": [{"clip": [5,1000], "phred": [4, 1000], "length": [20, 1000]}]
@@ -161,8 +165,8 @@ will accomplish this. Let's try an example of this, just for part of chromsome 2
 ##
 ###
-variant $bam -g 22:50,000,000-51,304,566 --min-clip 5 --min-phred 4 --min-mapq 10 -x counts.txv
-variant $bam -r example4.json -x counts.tsv ## using JSON
+variant $bam -g 22 --min-clip 5 --min-phred 4 --min-mapq 10 -c counts.tsv
+variant $bam -r example4.json -c counts.tsv ## using JSON
 ```
 
 ##### Example Use 5
 A team is only interested in variants in known cancer genes, and would like to analyze thousands of exomes and genomes. Running
@@ -170,7 +174,7 @@ VariantBam to extract reads from only these genes, and sending the BAM files to
 to allow all of the relevant data to be stored on disk.
 ```
 ### Grab only reads from predefined regions. Strip unneccessary tags and convert to CRAM for maximum compression
-variant $bam -g mygenes.bed -r all -C -o mini.cram -s BI,OQ
+variant $bam -l mygenes.bed -C -o mini.cram -s BI,OQ
 ```
 ##### Example Use 6
 A research team would like to extract only reads matching a certain motifs, but only if they have high optical quality.
@@ -189,14 +193,23 @@ the length of a read)
 ###
 variant $bam -r example6.json ## input as a JSON
+variant $bam --min-phred 4 --min-length 20 --motif mymotifs.txt ## using command line shortcuts
 ```
 
 ##### Example Use 7
 To reduce the size of the BAM, reads can be removed from centromeric and satellite repeat regions. These reads are rarely helpful for variant calling.
-To remove reads that intersect a region, set the region as an inverse-region. In a VariantBam script, use ``!region`` or ``!mlregion``. 
For +To remove reads that intersect a region, set the region as an inverse-region. In a VariantBam script, use ``"exclude" : true```. For quick use on the command line, use ``-L`` or ``-G`` (opposites of ``-l`` and ``-g``). ``` -### +### json +{ + "" : { + "region" : "bad.bed", + "exclude" : true, + "matelink" : true + } +} +### variant $bam -L bad.bed -o mini.bam -v ``` diff --git a/SnowTools b/SnowTools index e79d403..7e8b4a5 160000 --- a/SnowTools +++ b/SnowTools @@ -1 +1 @@ -Subproject commit e79d4031f64fb8b96d427a2406763b8794a1eb2a +Subproject commit 7e8b4a541c816cf1d8aaea6dede4ca17dee2b27d diff --git a/rtdocs/Makefile b/rtdocs/Makefile deleted file mode 100644 index 9ae020c..0000000 --- a/rtdocs/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
- -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/VariantBam.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/VariantBam.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/VariantBam" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/VariantBam" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." 
- -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/rtdocs/conf.py b/rtdocs/conf.py deleted file mode 100644 index 79b9989..0000000 --- a/rtdocs/conf.py +++ /dev/null @@ -1,258 +0,0 @@ -# -*- coding: utf-8 -*- -# -# VariantBam documentation build configuration file, created by -# sphinx-quickstart on Thu May 21 14:21:38 2015. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'VariantBam' -copyright = u'2015, Jeremiah Wala' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '1.0.0' -# The full version, including alpha/beta/rc tags. -release = '1.0.0' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. 
-pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'VariantBamdoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. 
-#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'VariantBam.tex', u'VariantBam Documentation', - u'Jeremiah Wala', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'variantbam', u'VariantBam Documentation', - [u'Jeremiah Wala'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'VariantBam', u'VariantBam Documentation', - u'Jeremiah Wala', 'VariantBam', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False diff --git a/rtdocs/index.rst b/rtdocs/index.rst deleted file mode 100644 index 8d3ce37..0000000 --- a/rtdocs/index.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. VariantBam documentation master file, created by - sphinx-quickstart on Thu May 21 14:21:38 2015. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -VariantBam - One-pass extraction of sequencing reads from a BAM file using cascading rules -========================================================================================== - -Contents: - -.. toctree:: - :maxdepth: 2 - - introduction - installation - syntax - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/rtdocs/installation.rst b/rtdocs/installation.rst deleted file mode 100644 index 6976216..0000000 --- a/rtdocs/installation.rst +++ /dev/null @@ -1,46 +0,0 @@ -Installation ------------- - -**On Linux 64-bit, you may use the provided executable** - -This runs on kernel 2.6.18 and newer: -https://github.com/broadinstitute/variant-bam/src/variant - -**Otherwise, you must build the executable from source** - -The source code is available: https://github.com/broadinstitute/variant-bam/src - -Install the dependencies: - -.. 
code:: bash - - # Broad Institute - use GCC-4.9 - use BamTools - path_to_bamtools=/broad/software/free/Linux/redhat_5_x86_64/pkgs/pezmaster31_bamtools-6708a21 - -Download and compile the code: - -.. code:: bash - - # Clone with git; easily get updates with 'git pull': - git clone https://github.com/broadinstitute/variant-bam.git - cd variant-bam - - cd src; - ./configure --with-bamtools=${path_to_bamtools} - make # Compile. - cp ../src/variant ~/bin/ # Copy the executables wherever you like. - -C++ Libraries -~~~~~~~~~~~~ - -To compile VariantBam, you will need a modern C++ compiler that supports -`c++0x `__ and the dependencies -listed below. I compiled successfully with gcc version 4.9.0 - -`GCC, the GNU Compiler `__ - - The GNU Compiler Collection is a compiler system produced by the GNU - Project supporting various programming languages. - diff --git a/rtdocs/introduction.rst b/rtdocs/introduction.rst deleted file mode 100644 index 861133c..0000000 --- a/rtdocs/introduction.rst +++ /dev/null @@ -1,31 +0,0 @@ -Introduction ------------- - -VariantBam is a tool to extract interesting sequencing reads from a BAM file. VariantBam -was developed to be a one-pass solution for the various needs of different NGS tools. For instance, -an indel tool might be interested in MAPQ > 0 reads in 30,000 candidate regions of interest, -a SNP tool might find a different 20,000, and an SV tool might be interested in only discordant or high-quality -clipped reads across the BAM (where high-quality means they are not clipped to do low Phred quality). Alternatively, -to save money/space one may not want to store an entire BAM on disk after all the relevant VCF, MAFs, etc have been created. -Instead it would be more efficient to store only those read-pairs who intersect some region around the variant locations. -VariantBam is designed to handle all of these situations with a single pass through the BAM. - -VariantBam is implemented in C++ and relies on the BamTools API (Derek Barnett, (c) 2009) for BAM I/O. -It is worth mentioning the capabilities of the BamTools command line ``bamtools filter`` here, -which may provide a solution more to your needs than VariantBam. ``bamtools filter`` allows you to -specify a set of filters in JSON format to be applied to a BAM. See the Bamtools documentation_ for more detail. -Under what situations would you use ``bamtools filter``, and when would you use VariantBam? - -1. Extract all MAPQ 0 reads from a BAM - Either tool (prefer ``bamtools filter``) -2. Extract all reads in read group A - ``bamtools filter`` -3. Extract all reads with NM tag >= 4 - Either tool (prefer ``bamtools filter``) -4. Extract all reads with NM tag >= 4 in exons - VariantBam. -5. Remove all reads with insert sizes between 100 and 600 bp - VariantBam -6. Extract all reads and mate within 1000bp of a variant or set of genes - VariantBam -7. Extract only high-quality reads with N bases beyong phred score X - VariantBam -8. Reduce a BAM to only high quality reads around your MAFs, VCFs and BED files - VariantBam - -A manuscript for VariantBam is currently under preparation. - -.. 
_documentation https://raw.githubusercontent.com/wiki/pezmaster31/bamtools/Tutorial_Toolkit_BamTools-1.0.pdf - diff --git a/rtdocs/make.bat b/rtdocs/make.bat deleted file mode 100644 index 4441094..0000000 --- a/rtdocs/make.bat +++ /dev/null @@ -1,242 +0,0 @@ -@ECHO OFF - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set BUILDDIR=_build -set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . -set I18NSPHINXOPTS=%SPHINXOPTS% . -if NOT "%PAPER%" == "" ( - set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% - set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% -) - -if "%1" == "" goto help - -if "%1" == "help" ( - :help - echo.Please use `make ^` where ^ is one of - echo. html to make standalone HTML files - echo. dirhtml to make HTML files named index.html in directories - echo. singlehtml to make a single large HTML file - echo. pickle to make pickle files - echo. json to make JSON files - echo. htmlhelp to make HTML files and a HTML help project - echo. qthelp to make HTML files and a qthelp project - echo. devhelp to make HTML files and a Devhelp project - echo. epub to make an epub - echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter - echo. text to make text files - echo. man to make manual pages - echo. texinfo to make Texinfo files - echo. gettext to make PO message catalogs - echo. changes to make an overview over all changed/added/deprecated items - echo. xml to make Docutils-native XML files - echo. pseudoxml to make pseudoxml-XML files for display purposes - echo. linkcheck to check all external links for integrity - echo. doctest to run all doctests embedded in the documentation if enabled - goto end -) - -if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* - goto end -) - - -%SPHINXBUILD% 2> nul -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "html" ( - %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/html. - goto end -) - -if "%1" == "dirhtml" ( - %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. - goto end -) - -if "%1" == "singlehtml" ( - %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. - goto end -) - -if "%1" == "pickle" ( - %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the pickle files. - goto end -) - -if "%1" == "json" ( - %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the JSON files. - goto end -) - -if "%1" == "htmlhelp" ( - %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run HTML Help Workshop with the ^ -.hhp project file in %BUILDDIR%/htmlhelp. 
- goto end -) - -if "%1" == "qthelp" ( - %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run "qcollectiongenerator" with the ^ -.qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\VariantBam.qhcp - echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\VariantBam.ghc - goto end -) - -if "%1" == "devhelp" ( - %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. - goto end -) - -if "%1" == "epub" ( - %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The epub file is in %BUILDDIR%/epub. - goto end -) - -if "%1" == "latex" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdf" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdfja" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf-ja - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "text" ( - %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The text files are in %BUILDDIR%/text. - goto end -) - -if "%1" == "man" ( - %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The manual pages are in %BUILDDIR%/man. - goto end -) - -if "%1" == "texinfo" ( - %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. - goto end -) - -if "%1" == "gettext" ( - %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The message catalogs are in %BUILDDIR%/locale. - goto end -) - -if "%1" == "changes" ( - %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes - if errorlevel 1 exit /b 1 - echo. - echo.The overview file is in %BUILDDIR%/changes. - goto end -) - -if "%1" == "linkcheck" ( - %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck - if errorlevel 1 exit /b 1 - echo. - echo.Link check complete; look for any errors in the above output ^ -or in %BUILDDIR%/linkcheck/output.txt. - goto end -) - -if "%1" == "doctest" ( - %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest - if errorlevel 1 exit /b 1 - echo. - echo.Testing of doctests in the sources finished, look at the ^ -results in %BUILDDIR%/doctest/output.txt. - goto end -) - -if "%1" == "xml" ( - %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The XML files are in %BUILDDIR%/xml. - goto end -) - -if "%1" == "pseudoxml" ( - %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
- goto end -) - -:end diff --git a/rtdocs/syntax.rst b/rtdocs/syntax.rst deleted file mode 100644 index e7ed808..0000000 --- a/rtdocs/syntax.rst +++ /dev/null @@ -1,118 +0,0 @@ -Syntax ------- - -This section will describe the syntax used by VariantBam to specify the cascades of rules and regions -that are applied to the BAM[*]_. Below is an example of a valid VariantBam script: - -.. code:: bash - - ### this is a comment. The line code below defines filters to be applied to each region/rule - region@WG - rule@!hardclip;!unmapped;!unmapped_mate;isize:[0,600];!mapq:[10,100] - rule@!hardclip;!unmapped;!unmapped_mate;clip:[10,101] - -Region -~~~~~~ - -Let's look first at the ``region`` tag. The region@ keyword marks that what follows is a region, -which is either the keyword ``WG`` for whole genome, or a VCF, MAF, Callstats or BED file. Optionally, -you can specify that a region is a bit bigger than is actually in the file. You can do this by "padding" -the regions around the sites. For example: - -``region@myvcf.vcf,pad:1000`` - -You can also state that the region applies to reads who don't necessarily overlap the region, but their pair-mate does. - -``region@myvcf,pad:1000,mate`` - -Note that the syntax is such that you must specify the file immediately after the @, following by other options -in any order. - -Rules -~~~~~ - -The next two lines specify a pair of rules, marked with the ``rule@`` tag. -The default rule is to include every read, and the conditions are meant to be -thought of as exclusion criteria. You can take the "opposite" of a condition by prefixing -with a ``!``. For example, the first rule in the above example states: - -Keep all reads EXCEPT any read that satisfies the following: Hardclipped, is unmapped, has unmapped mate, -has insert size greater than 600, does NOT have mapping quality between 10 and 100. Thus, we are going to get low mapping -quality discordant reads from this query. And equivalent specification would be: - -``rule@!hardclip;!unmapped;!unmapped_mate;isize:[0,600];mapq:[0,9]`` - -VariantBam handles multiple rules in the following way. For each read, VariantBam -will cycle through the rules within a region until the read satisfies a rule. When it -does, it includes the reads and stops checking. The logic for the entire collectoin is then as follows: - -On a given rule line, the read must satisfy ALL conditions (logical AND) - -Across different rules, the read nead only satisfy ONE rule (logical OR) - -To illustrate this, note that there is a small discrepancy in the first rule of the above. In the BAM format, -unmapped reads and reads with unmapped mates are given an insert size of 0. However, in the same rule -a condition is described to keep all reads with insert sizes 0-600 inclusive. Recalling the AND logic -within a rule, VariantBam will exclude the read, because it fails the ``!unmapped`` criteria. - -Below is another example which uses the ability of VariantBam to interpret VCFs and BED files, -and apply rules separately to them. - -.. code:: bash - - ### declare that my region is a VCF file with pads of 1000 on either side of the variant. - ### use the "mate" keyword to specify that pairs whose mate falls in the region belong to this rule - region@/home/unix/jwala/myvcf.vcf,mate,pad:1000 - #### I want to keep all the reads (this the default). Ill be explicit with the "every" keyword - rule@every - #### I might also have a BED file which gives a list of exons. 
In here, I just want to keep "variant" reads - #### so I can specify something like: - region@/home/unix/jwala/myexonlist.bed - rule@y!isize:[100,600];!unmapped;!unmapped_mate - -Global -~~~~~~ - -To make things more clear and reduce redundancy, you can also type a ``global@`` rule anywhere in the stack, -and it will append that rule to everything below. For example, to exclude hardclipped, duplicate, qcfail and -supplementary reads in every region, you would do: - -.. code:: bash - - global@!hardclip;!duplicate;!qcfail;!supplementary - region@WG - rule@!isize:[0,600] - rule@clip:[10,101];mapq:[1,60] - region@myvcf.vcf - -is equivalent to - -.. code:: bash - - region@WG - rule@!isize:[0,600];!hardclip;!duplicate;!qcfail;!supplementary - rule@clip:[10,101];mapq:[1,60];!hardclip;!duplicate;!qcfail;!supplementary - region@myvcf.vcf - rule@!hardclip;!duplicate;!qcfail;!supplementary - -The global tag will apply through all of the regions. If you want to reset it for everything, just add ``global@every`` -back onto the stack. - -To make things run a little faster, you can set the order so that the more inclusive regions / rules are first. This only -applies if there is an overlap among regions. This is because VariantBam will move down the list of regions -that apply to this read and stop as soon as it meets an inclusion criteria. I prefer to start with a whole-genome region / rule -set, and then add more fine-mapped regions later. - -Command Line Script -~~~~~~~~~~~~~~~~~~~ - -The usual method of inputing rules is with a VariantBam script as a text file (passed to -VariantBam with the ``-r`` flag). However, sometimes it is useful to not have to write an intermediate -file and just feed rules directly in. In that case, just pass a string literal to the -r flag, and VariantBam -will parse directly. Just separate lines with a ``%``. For instance, you might run something like the following: - -``variant -i big.bam -o small.bam -r 'global@!hardclip%region@WG%rule@!isize:[0,600];%rule@clip:[10,101];mapq:[1,60]%region@myvcf.vcf'`` - -Note the single quotes so that it is interpreted as a string literal in BASH. - -.. [*] A standard format like JSON would be better and may be implemented in the future. diff --git a/src/VariantBamWalker.cpp b/src/VariantBamWalker.cpp index 794f697..a3c0073 100644 --- a/src/VariantBamWalker.cpp +++ b/src/VariantBamWalker.cpp @@ -75,7 +75,10 @@ void VariantBamWalker::writeVariantBam() buffer.clear(); } } + } else if (!fop) { // we are not outputting anything + ++rc_main.keep; } + } if (++rc_main.total % 1000000 == 0 && m_verbose) diff --git a/src/variant.cpp b/src/variant.cpp index 643f07d..a241e3b 100644 --- a/src/variant.cpp +++ b/src/variant.cpp @@ -266,13 +266,9 @@ int main(int argc, char** argv) { SnowTools::GRC rules_rg = walk.GetMiniRulesCollection().getAllRegions(); - // for (auto& i : rules_rg) - //std::cerr << i << std::endl; - rules_rg.createTreeMap(); if (grv_proc_regions.size() && rules_rg.size()) { // intersect rules regions with mask regions. 
-  // dont incorporate rules regions if there are any mate-linked regions
   rules_rg = rules_rg.intersection(grv_proc_regions, true); // true -> ignore_strand
 
   if (opt::verbose)
@@ -283,14 +279,14 @@ int main(int argc, char** argv) {
   if (grv_proc_regions.size() > 0 && (rules_rg.size() || has_ml_region )) // explicitly gave regions
     walk.setBamWalkerRegions(grv_proc_regions.asGenomicRegionVector());
-  else if (rules_rg.size() && !has_ml_region && grv_proc_regions.size() == 0) {
+  /* else if (rules_rg.size() && !has_ml_region && grv_proc_regions.size() == 0) {
     walk.setBamWalkerRegions(rules_rg.asGenomicRegionVector());
     if (opt::verbose)
       std::cerr << "...from rules, will run on " << rules_rg.size() << " regions" << std::endl;
   } else if (!rules_rg.size() && grv_proc_regions.size() > 0) {
     std::cerr << "No regions with possibility of reads. This error occurs if no regions in -g are in -k." << std::endl;
     return 1;
-  }
+  }*/
 
   // should we count all rules (slower)
   if (opt::counts_file.length())
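
For quick reference, the command-line shortcuts documented in the README changes above can be combined in a single pass. The sketch below is illustrative only: `$bam` is a placeholder input BAM (as in the README examples), the output names are arbitrary, and the flags are assumed to compose the same way they do in the Quick Start and Example 4 snippets.

```
## one-pass filter: chromosome 22 only, clipped reads with decent base and mapping quality;
## per-rule counts go to counts.tsv, the passing reads to mini.bam
variant $bam -g 22 --min-mapq 10 --min-clip 5 --min-phred 4 -c counts.tsv -o mini.bam -v
```

The same filter can also be written as a JSON rules file (in the style of example4.json above) and passed with `-r`, which scales better when different regions need different rules.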