diff --git a/.travis.yml b/.travis.yml index 9d41a3e..33401a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,9 @@ language: python python: - "2.7" + - "3.4" + - "3.5" + - "3.6" dist: trusty sudo: required @@ -38,4 +41,4 @@ deploy: target_branch: gh-pages on: branch: master - python: "2.7" + python: "3.6" diff --git a/README.rst b/README.rst index 160773a..71e3f2d 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,9 @@ Tombo Summary ============= -.. image:: https://travis-ci.org/nanoporetech/tombo.svg?branch=master +|travis_badge| + +.. |travis_badge| image:: https://travis-ci.org/nanoporetech/tombo.svg?branch=master :target: https://travis-ci.org/nanoporetech/tombo Tombo is a suite of tools primarily for the identification of modified nucleotides from nanopore sequencing data. @@ -19,9 +21,9 @@ Installation :target: http://bioconda.github.io/recipes/ont-tombo/README.html .. |pypi_badge| image:: https://badge.fury.io/py/ont-tombo.svg - :target: https://badge.fury.io/py/ont-tombo + :target: https://pypi.org/project/ont-tombo/ -Basic tombo installation (python2.7 support only) +Basic tombo installation (python 2.7 and 3.4+ support) :: # install via bioconda environment conda install -c bioconda ont-tombo # or install pip package (numpy install required before tombo for cython optimization) pip install numpy - pip install ont-tombo + pip install ont-tombo[full] .. @@ -51,32 +53,32 @@ Re-squiggle (Raw Data Alignment) :: - tombo resquiggle path/to/amplified/dna/fast5s/ genome.fasta --minimap2-executable ./minimap2 --processes 4 + tombo resquiggle path/to/amplified/dna/fast5s/ genome.fasta --processes 4 .. Only R9.4/5 data is supported at this time. - DNA or RNA is automatically determined from FAST5s (set explicitly with `--dna` or `--rna`). + DNA or RNA is automatically determined from FAST5s (set explicitly with ``--dna`` or ``--rna``). - FAST5 files need not contain Events data, but must contain Fastq slot. See `annotate_raw_with_fastqs` for pre-processing of raw FAST5s. + FAST5 files need not contain Events data, but must contain a Fastq slot. See ``annotate_raw_with_fastqs`` for pre-processing of raw FAST5s. Identify Modified Bases ^^^^^^^^^^^^^^^^^^^^^^^ :: - # comparing to an alternative 5mC model (recommended method) + # comparing to alternative 5mC and 6mA models (recommended method) tombo test_significance --fast5-basedirs path/to/native/dna/fast5s/ \ - --alternate-bases 5mC --statistics-file-basename sample_compare + --alternate-bases 5mC 6mA --statistics-file-basename sample # comparing to a control sample (e.g. PCR) tombo test_significance --fast5-basedirs path/to/native/dna/fast5s/ \ - --control-fast5-basedirs path/to/amplified/dna/fast5s/ --statistics-file-basename sample_compare + --control-fast5-basedirs path/to/amplified/dna/fast5s/ --statistics-file-basename sample_compare # compare to the canonical base model tombo test_significance --fast5-basedirs path/to/native/dna/fast5s/ \ - --statistics-file-basename sample --processes 4 + --statistics-file-basename sample_de_novo --processes 4 ..
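+The ``annotate_raw_with_fastqs`` pre-processing step referenced above can be run as in the following hedged sketch (the directory and FASTQ names are placeholders; the command and options are those shown in the Tombo examples documentation):
+
+::
+
+    # add basecalls from a FASTQ to raw FAST5s before re-squiggling
+    tombo annotate_raw_with_fastqs --fast5-basedir path/to/raw/dna/fast5s/ \
+        --fastq-filenames reads.fastq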
@@ -100,7 +102,7 @@ Extract Sequences Surrounding Modified Positions :: - tombo write_most_significant_fasta --statistics-filename sample_compare.5mC.tombo.stats \ + tombo write_most_significant_fasta --statistics-filename sample.6mA.tombo.stats \ --genome-fasta genome.fasta Plotting Examples @@ -117,11 +119,11 @@ Plotting Examples # plot raw signal at genome locations with the most significantly/consistently modified bases tombo plot_most_significant --fast5-basedirs path/to/native/rna/fast5s/ \ - --statistics-filename sample_compare.5mC.tombo.stats --plot-alternate-model 5mC + --statistics-filename sample.5mC.tombo.stats --plot-alternate-model 5mC - # plot per-read test statistics using the 5mC alternative model testing method + # plot per-read test statistics using the 6mA alternative model testing method tombo plot_per_read --fast5-basedirs path/to/native/rna/fast5s/ \ - --genome-locations chromosome:1000 chromosome:2000:- --plot-alternate-model 5mC + --genome-locations chromosome:1000 chromosome:2000:- --plot-alternate-model 6mA =============== Common Commands @@ -190,7 +192,7 @@ Read Filtering: Note on Tombo Models ==================== -Tombo is currently provided with two standard models (DNA and RNA) and one alternative model (DNA::5mC). These models are applicable only to R9.4/5 flowcells with 1D or 1D^2 kits (not 2D). +Tombo is currently provided with two standard models (DNA and RNA) and two alternative models (DNA::5mC, DNA::6mA). These models are applicable only to R9.4/5 flowcells with 1D or 1D^2 kits (not 2D). These models are used by default for the re-squiggle and testing commands. The correct model is automatically selected for DNA or RNA based on the contents of each FAST5 file and processed accordingly. Additional models will be added in future releases. @@ -198,58 +200,39 @@ These models are used by default for the re-squiggle and testing commands. 
The c Requirements ============ -At least one supported mapper: -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- minimap2 (https://github.com/lh3/minimap2) -- BWA-MEM (http://bio-bwa.sourceforge.net/) -- graphmap (https://github.com/isovic/graphmap) - -- HDF5 (http://micro.stanford.edu/wiki/Install_HDF5#Install) +python Requirements (handled by conda or pip): +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -python Requirements (handled by pip): -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- numpy (must be installed before installing tombo) +- numpy - scipy - h5py - cython +- mappy -Optional packages for plotting (install R packages with ``install.packages([package_name])`` from an R prompt): -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- rpy2 (along with an R installation) -- ggplot2 (required for any plotting subcommands) -- cowplot (required for plot_motif_with_stats subcommand) +Optional packages (handled by conda, but not pip): +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Optional packages for alternative model estimation: -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Plotting Packages + + + R + + rpy2 + + ggplot2 + + gridExtra (required for ``plot_motif_with_stats`` and ``plot_kmer`` subcommands) -- sklearn +- On-disk Random Fasta Access + + + pyfaidx Advanced Installation Instructions ---------------------------------- -Install tombo with all optional dependencies (for plotting and model estimation) - -:: - - pip install ont-tombo[full] - -Install tombo with plotting dependencies (requires separate installation -of R packages ggplot2 and cowplot) - -:: - - pip install ont-tombo[plot] - -Install tombo with alternative model estimation dependencies +Minimal tombo installation without optional dependencies (enables re-squiggle, all modified base testing methods and text output) :: - pip install ont-tombo[alt_est] + pip install ont-tombo -Install github version of tombo (most versions on pypi should be up-to-date) +Install github version of tombo (versions on conda/pypi should be up-to-date) :: @@ -267,4 +250,10 @@ http://biorxiv.org/content/early/2017/04/10/094672 Gotchas ======= -- If plotting commands fail referencing rpy2 images, shared object files, etc., this may be an issue with the version of libraries installed by conda. In order to resolve this issue, remove the conda-forge channel and re-install ont-tombo. +- The Tombo conda environment (especially with python 2.7) may have installation issues. + + + The first troubleshooting step would be to install in a python 3.4+ environment. + + The R ``cowplot`` package has also caused several installation issues. As of Tombo version 1.2 the ``cowplot`` dependency has been replaced by the ``gridExtra`` package which should resolve this inter-dependency issue. + + If python2 is a requirement, uninstalling and re-installing the offending package may help. + + Moving ``conda-forge`` to the end of the conda channel list (or removing it altogether) may help: ``conda config --append channels conda-forge``. + + In python 2.7 there is an issue with the conda scipy.stats package. Downgrading to version 0.17 fixes this issue (consolidated with the other fixes in the sketch below).
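+A consolidated troubleshooting sketch for the conda issues above (the environment name and python version pin are assumptions; the channel and scipy commands restate the bullets):
+
+::
+
+    # prefer a python 3 environment (name and version are placeholders)
+    conda create -n tombo python=3.6
+    # de-prioritize the conda-forge channel as described above
+    conda config --append channels conda-forge
+    conda install -c bioconda ont-tombo
+    # python 2.7 environments only: downgrade scipy
+    conda install scipy=0.17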
diff --git a/docs/_images/dampened_fraction.png b/docs/_images/dampened_fraction.png new file mode 100644 index 0000000..f61d379 Binary files /dev/null and b/docs/_images/dampened_fraction.png differ diff --git a/docs/_images/roc.png b/docs/_images/roc.png new file mode 100644 index 0000000..32ead23 Binary files /dev/null and b/docs/_images/roc.png differ diff --git a/docs/_images/single_samp.png b/docs/_images/single_samp.png index cce60aa..4ad69e6 100644 Binary files a/docs/_images/single_samp.png and b/docs/_images/single_samp.png differ diff --git a/docs/conf.py b/docs/conf.py index c277023..c0366aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,7 +54,7 @@ # # Get the version number from __init__.py -verstrline = open(os.path.join('..', __pkg_name__, '_version.py'), 'r').read() +verstrline = open(os.path.join('..', __pkg_name__, '_version.py'), 'r').readlines()[-1] vsre = r"^TOMBO_VERSION = ['\"]([^'\"]*)['\"]" mo = re.search(vsre, verstrline) if mo: diff --git a/docs/examples.rst b/docs/examples.rst index 188040a..cf86071 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -4,17 +4,17 @@ Tombo Examples Below are minimal use case examples. For more detail on each command's options and further algorithm details, please see the corresponding documentation sections. ----------------------------------- -Re-squiggle (Raw Signal Alignment) ----------------------------------- +------------------------------------------ +Re-squiggle (Raw Signal Genomic Alignment) +------------------------------------------ -The re-squiggle algorithm aligns raw signal to genomic sequence based on a genomic mapping. +The re-squiggle algorithm aligns raw signal (electric current nanopore measurements) to genomic sequence based on a genomic mapping. -This command will add infomation including the mapped genomic location and the raw signal to sequence assignment into the FAST5 files provided as well as producing an index file for more efficient file access in downstream commands. +This command will add information including the mapped genomic location and the raw signal to sequence assignment to the read files (in FAST5 format) provided, as well as producing an index file for more efficient file access in downstream commands. -The ``resquiggle`` command must be run on any set of FAST5s before any further processing by Tombo. +The ``resquiggle`` command must be run before any further processing by Tombo commands. -**Important note**: Currently, only a models for R9.4/5 sequencing DNA or RNA are included with Tombo. Analysis of other nanopore data types is not supported at this time. +**Important note**: Currently, only models for R9.4/5 (1D or 1D^2) DNA or RNA sequencing are included with Tombo. Analysis of other nanopore data types is not supported at this time. If DNA or RNA sample type is not explicitly specified (via ``--dna`` or ``--rna`` options) the sample type will be detected automatically for the set of reads. For more details see the :doc:`re-squiggle documentation `. @@ -23,7 +23,7 @@ For more details see the :doc:`re-squiggle documentation `. # optionally annotate raw FAST5s with FASTQ files produced from the same reads tombo annotate_raw_with_fastqs --fast5-basedir --fastq-filenames reads.fastq - tombo resquiggle [-h] --minimap2-executable ./minimap2 + tombo resquiggle --processes 4 ----------------------- Modified Base Detection ----------------------- Tombo provides three methods for the investigation of modified bases.
Each method has different advantages and requirements. -* The specific alternative base method is preferred, but is currently only available for 5mC in DNA (more modifications coming soon). -* The canonical (control) sample comparison method would be preferred next, but requires the producetion of a second set of reads containing only the 4 canonical bases. -* The de novo method is recommended only a research tool and may produce high false positive rates. -* Additionally, both the control sample and de novo methods may not identify the exact modified base location and give no information as to the identity of a modified base. +* The specific alternative base method is preferred. Alternative DNA models are currently available for 5-methylcytosine (5mC) and N6-methyladenosine (6mA) in all sequence contexts. + + - More modifications will continue to be added. +* The canonical (control) sample comparison method requires the production of a second set of reads containing only the 4 canonical bases (e.g. PCR). +* The de novo method compares signal to the included canonical bases model. -The result of all ``test_significance`` calls will be a binary statistics file, which can be passed to other Tombo sub-commands. + - This method is recommended only as a research tool and may produce high false positive rates. +* Both the control sample comparison and the de novo methods may not identify the exact modified base location and give no information as to the identity of a modified base. + +---- + +.. figure:: _images/testing_method_comparison.png + :align: center + :scale: 30% + + Tombo modified base testing methods. + +---- + +The result of all ``test_significance`` calls will be one or more binary statistics files, which can be passed to other Tombo sub-commands. For more details see the :doc:`modified base detection documentation `. Specific Alternative Base Method ================================ -In order to specifically detect 5-methyl cytosine (and other alternative bases in the future), use the ``test_significance`` command with the ``--alternate-bases 5mC`` option. +In order to specifically detect 5mC and 6mA, use the ``test_significance`` command with the ``--alternate-bases 5mC 6mA`` option. -This will perform a log likelihood ratio test using the default canonical and 5mC alternative models provided with Tombo. +This will perform a log likelihood ratio test using the default canonical model and the 5mC and 6mA alternative models provided with Tombo. -New alternative base models will be added as they are trained. This is the perferred method for modified base detection if a model is available for your biological sample of interest. +New alternative base models will be added as they are trained. This is the preferred method for modified base detection if a model is available for your biological modification of interest as it identifies the exact location of the modified base and reduces false positives for spurious shifts in signal. ..
code-block:: bash tombo test_significance --fast5-basedirs \ - --alternate-bases 5mC --statistics-file-basename sample_5mC_detection + --alternate-bases 5mC 6mA --statistics-file-basename sample_alt_model -Canonical Sample Comparison Detection -===================================== +Canonical Sample Comparison Method +================================== In order to perform canonical-sample-comparison modified base detection, use the ``test_significance`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR) using the ``--control-fast5-basedirs`` option. -This will perform a hypothesis test against the signal level observed from the control sample at each genomic position. This method provides the highest accuracy, but does not always identify the exact modification position or the identity of the modified base. +This will perform a hypothesis test against the signal level observed from the control sample at each genomic position. In some cases this method provides the highest accuracy, but does not always identify the exact modified base position. .. code-block:: bash @@ -67,12 +81,12 @@ This will perform a hypothesis test against the signal level observed from the c --control-fast5-basedirs \ --statistics-file-basename sample_canonical_compare -De novo Non-canonical Base Detection -==================================== +De novo Non-canonical Base Method +================================= In order to perform de novo non-canonical base detection, use the ``test_significance`` command without any other options (aside from the set of reads to test). -This will perform a hypothesis test against the default canonical base model provided with Tombo. Note that this method is quite error prone and will likely result in a high number of false positives, but may be of use in a research and development setting. This method also has the lowest requirement of only a set of reads and a genome. +This will perform a hypothesis test against the default canonical base model provided with Tombo. Note that this method is quite error prone and may result in a high false positive rate, but may be of use in a research and development setting. This method also has the lowest requirements, needing only a set of reads and a genome. .. code-block:: bash @@ -86,21 +100,27 @@ Text Output Wiggle Format Output ==================== -In order to output the results of re-squiggling and statistical testing in a genome browser compatible format (`wiggle format `_), the ``write_wiggles`` subcommand is provided. +In order to output the results of re-squiggling and statistical testing in a genome browser compatible format (`wiggle format `_), the ``write_wiggles`` sub-command is provided. .. code-block:: bash - tombo write_wiggles --fast5-basedirs --wiggle-basename sample_5mC_detection \ - --statistics-filename sample_5mC_detection.5mC.tombo.stats --wiggle-types fraction coverage + tombo write_wiggles --fast5-basedirs --wiggle-basename sample_alt_model \ + --statistics-filename sample_alt_model.5mC.tombo.stats --wiggle-types dampened_fraction coverage + +.. hint:: + + Other ``--wiggle-types`` available are ``fraction``, ``signal``, ``signal_sd``, ``dwell`` and ``difference``. + + The ``dampened_fraction`` option adds pseudo-counts to the detected number of unmodified and modified reads at each tested location (as specified by the ``--coverage-dampen-counts`` option), while the ``fraction`` option returns the raw fraction of modified reads at any genomic site.
The ``dampened_fraction`` output is intended to allow the inclusion of low coverage regions in downstream analysis without causing potentially false sites at the top of rank lists. Visualize different values of the ``--coverage-dampen-counts`` option with the included ``scripts/test_beta_priors.R`` script. Genome Sequence Output ====================== -For modified base analysis pipelines (e.g. motif detection), it may be of use to output the genomic sequence surrounding locations with the largest fraction of modified reads. The ``write_most_significant_fasta`` sub-command is provided for this purpose. +For modified base analysis pipelines (e.g. motif detection), it may be useful to output the genomic sequence surrounding locations with the largest fraction of modified reads. The ``write_most_significant_fasta`` sub-command is provided for this purpose. .. code-block:: bash - tombo write_most_significant_fasta --statistics-filename sample_5mC_detection.5mC.tombo.stats \ + tombo write_most_significant_fasta --statistics-filename sample_alt_model.6mA.tombo.stats \ --genome-fasta Example `meme `_ command line modified base motif detection command. @@ -115,9 +135,9 @@ For more details see the :doc:`text output documentation `. Plotting Examples ----------------- -Tombo provides many plotting functions for the visualization of potentially modified bases and the raw nanopore signal in general. +Tombo provides many plotting functions for the visualization of modified bases and raw nanopore signal in general. -Most plotting commands are genome-anchored. That is the raw signal is plotted as the re-squiggle algorithm has assigned it to the genome. Thus each read contain a different number of raw observations per genomic base. For summary distributions (not raw signal) the distributions are taken over each reads average signal level at the genomic position. +Most plotting commands are genome-anchored. That is, the raw signal is plotted as the re-squiggle algorithm has assigned it to the genome. Thus each read contains a different number of raw observations assigned to each genomic base. For summary distributions (overplotting options not showing raw signal) the distributions are taken over each read's average signal level at the genomic position. Each genome anchored plotting command allows for the selection of genomic positions based on generally applicable criteria. .. code-block:: bash tombo plot_motif_centered --fast5-basedirs --motif AWC \ --genome-fasta genome.fasta --control-fast5-basedirs - - tombo plot_per_read --fast5-basedirs \ - --genome-locations chromosome:1000 chromosome:2000:- --plot-alternate-model 5mC + + tombo plot_per_read --per-read-statistics-filename \ + --genome-locations chromosome:1000 chromosome:2000:- \ + --genome-fasta genome.fasta For more details see the :doc:`plotting documentation `. -.. +.. tip:: For additional command details, see the specific commands documentation section. diff --git a/docs/filtering.rst b/docs/filtering.rst index b97b384..97fd3ae 100644 --- a/docs/filtering.rst +++ b/docs/filtering.rst @@ -2,7 +2,7 @@ Read Filtering Commands *********************** -Read filtering commands can be useful to extract the most out out of a set of reads for modified base detection. Read filtering commands effect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running any re-squiggle commands.
Two filters are currently made available (``filter_stuck`` and ``filter_coverage``). +Read filtering commands can be useful to extract the most out of a set of reads for modified base detection. Read filtering commands affect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running any re-squiggle analysis. Two filters are currently made available (``filter_stuck`` and ``filter_coverage``). ---------------- ``filter_stuck`` ---------------- @@ -16,11 +16,11 @@ This filter is based on the number of observations per genomic base along a read ``filter_coverage`` ------------------- -The ``filter_coverage`` command aims to filter reads to achieve more even read depth across a genome. This may be useful particularly in canonical and particularly in alternative mode estimation. This filter may also help make some testing cases more comparable across the genome. +The ``filter_coverage`` command aims to filter reads to achieve more even read depth across a genome. This may be useful in canonical and particularly in alternative model estimation. This filter may also help make test statistics more comparable across the genome. This filter is applied by randomly selecting reads weighted by the approximate coverage at the mapped location of each read. The number of reads removed from downstream processing is defined by the ``--percent-to-filter`` option. -This filter is likely to be more useful for PCR'ed sample where duplicate locations are more likely to accumulate and cause very large spikes in coverage. +This filter is likely to be more useful for PCR'ed samples where duplicate locations are more likely to accumulate and cause large spikes in coverage. ----------------- ``clear_filters`` ----------------- The ``clear_filters`` command simply removes any applied filters to this sample (failed reads from the re-squiggle command will still not be included). New filters can then be applied to this set of reads. -All Tombo commands will respect the filtered reads when they are parsed for procesing +All Tombo sub-commands will respect the filtered reads when parsed for processing. diff --git a/docs/index.rst b/docs/index.rst index db9c347..7f2831a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,14 +18,16 @@ Installation .. |pypi_badge| image:: https://badge.fury.io/py/ont-tombo.svg :target: https://badge.fury.io/py/ont-tombo -Basic Tombo installation (python2.7 support only):: +Basic tombo installation (python 2.7 and 3.4+ support) + +:: # install via bioconda environment conda install -c bioconda ont-tombo # or install pip package (numpy install required before tombo for cython optimization) pip install numpy - pip install ont-tombo + pip install ont-tombo[full] See :doc:`examples` for common workflows. @@ -49,6 +51,7 @@ Contents plotting filtering model_training + rna ------------------------- Full API reference (beta) diff --git a/docs/model_training.rst b/docs/model_training.rst index 7e20b62..7b1ef98 100644 --- a/docs/model_training.rst +++ b/docs/model_training.rst @@ -2,25 +2,29 @@ Model Training (Advanced Users Only) ************************************ -Model training is made available via the Tombo command, but should be used with care as these methods can be very sensetive to the samples used.
Commands relevant to model training are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggleing reads without a model. +Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensitive to the samples used. Commands relevant to model training are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based basecaller data for best results). + +.. note:: + + Model training results in a binary Tombo model file similar to those included in the Tombo software (within the tombo/tombo_models directory). User-created standard Tombo models can be used in re-squiggling, testing and plotting commands using the hidden ``--tombo-model-filename`` option. This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation. Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to the ``test_significance`` command via the hidden ``--alternate-model-filenames`` option. ====================== ``estimate_reference`` ====================== -The ``estimate_reference`` command is provided in order to estimate a canonical base expected current level Tombo model. +The ``estimate_reference`` command is provided to estimate a Tombo model for canonical bases only. -In order to estimate a canonical model, first genomic base levels are parsed from reads as assigned by a re-squiggle command (either ``model_resquiggle`` or ``resquiggel`` processed reads are acceptable) and grouped by by their genomic base assignment. By default, the median and standard deviation of the current level over reads covering each genomic position is computed. The ``--estimate-mean`` option will trigger this to be computed as a mean instead, though this can be sensetive to outlier read assignment and is thus not recommended. +To estimate a canonical model, first genomic base levels are parsed from reads as assigned by a re-squiggle command (either ``event_resquiggle`` or ``resquiggle`` processed reads are acceptable) and grouped by their genomic base assignment. By default, the median and standard deviation of the current level over reads covering each genomic position is computed. The ``--estimate-mean`` option will trigger this to be computed as a mean instead, though this can be sensitive to outlier read signal assignment and is thus not recommended. -All genmoic current levels are then grouped based on the genomic k-mer sequence at that location. This k-mer is defined by the ``--upstream-bases`` and ``--downstream-bases`` options. Note that the modelled k-mer will be one longer than the sum of these two options as the k-mer includes the assigned position as well. The central position has the strongest correlation with the current signal level as can be seen with the ``plot_kmer`` command. +All genomic current levels are then grouped based on the genomic k-mer sequence at that location. This k-mer is defined by the ``--upstream-bases`` and ``--downstream-bases`` options. Note that the modeled k-mer will be one longer than the sum of these two options as the k-mer includes the *dominant*, central position as well.
The central position generally has the strongest correlation with the current signal level as can be seen with the ``plot_kmer`` command. -The reference signal level and spread for each k-mer are then estimated by taking the median of the signal level and mean of the standard deviation over all observations of each k-mer across the genome. By default, a single global standard deviation is taken as the median over all k-mers. The ``--kmer-specific-sd`` option is provided in order to estimate a seperate standard deviation for each k-mer, but is not recommended as standard deviation can be much less robust. +The reference signal level and spread for each k-mer are then estimated by taking the median of the signal level and mean of the standard deviation over all observations of each k-mer across the genome. By default, a single global standard deviation is taken as the median over all k-mers. The ``--kmer-specific-sd`` option is provided in order to estimate a separate standard deviation for each k-mer, but is not recommended as this can have deleterious effects on Tombo analyses. In particular, k-mer specific standard deviation estimates can produce poor re-squiggle results due to signal being "packed" into high SD k-mers. These values are stored in the output file in the binary HDF5 format and can be passed to any Tombo command that takes a Tombo model file. Several options are supplied in order to ensure more robust parameter estimates via read depth thresholds at various stages of model estimation (``--minimum-test-reads``, ``--coverage-threshold`` and ``--minimum-kmer-observations``). -Model esitmation is capable of using mutiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior to the same behavior as these options in the ``test_significance`` command. The multiprocessing only applies to the genome position level computation and not the global model estimation stage. +The model estimation command is capable of using multiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior as these options in the ``test_significance`` command. The multi-processing only applies to the genome position level computation and not the global model estimation stage; as such, changes in multi-processing options will not change resulting models.
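+As a hedged sketch, a canonical model estimation run assembling the options described above might look as follows (the ``--fast5-basedirs`` input option, the ``--tombo-model-filename`` output option and all option values are assumptions; ``--upstream-bases 2 --downstream-bases 3`` would give a 6-mer model):
+
+.. code-block:: bash
+
+    tombo estimate_reference --fast5-basedirs path/to/resquiggled/fast5s/ \
+        --tombo-model-filename canonical.tombo.model \
+        --upstream-bases 2 --downstream-bases 3 \
+        --minimum-test-reads 10 --minimum-kmer-observations 5 \
+        --multiprocess-region-size 10000 --processes 4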
========================== ``estimate_alt_reference`` ========================== @@ -30,11 +34,11 @@ Model esitmation is capable of using mutiple processes via the ``--multiprocess- Alternative Reference Goals --------------------------- -One of the main goals of the Tombo suite of tools is to make alternative model estimation more accessible. Key to this goal is the estimation of an alternative model from a relatively simple to prodcue biological sample. A significant additional goal is the estimation of a model capable of detecting an alternative base in most (if not all) sequence contexts. +One of the main goals of the Tombo suite of tools is to make alternative model estimation more accessible. Key to this goal is the estimation of an alternative model from a relatively simple-to-produce biological sample. A significant additional goal is the estimation of a model capable of detecting an alternative base in all sequence contexts. In order to address these goals, the sample required for alternative model estimation must contain the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** into a sample with a known genome (referred to as the "*alternative sample*" below). The rate of incorporation for the alternative base should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``estimate_alt_reference``). -The creation of such a sample for the estimation of the included 5-methyl cytosine (5mC) model was completed by introducing 25% 5-methyl-dCTP into a standard PCR reaction in E. coil. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). These samples were then re-squiggled and processed with the ``estimate_alt_reference`` command to produce the included 5mC model. +The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coli. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase, thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``estimate_alt_reference`` command to produce the included 5mC and 6mA models. --------------------------------------- Alternative Reference Estimation Method --------------------------------------- Event Level Extraction ^^^^^^^^^^^^^^^^^^^^^^ -Given the above descsribed standard and alternative samples, the alternative model estimation procedure begins with the extraction of the current signal level from a number of reads from both samples. These signal levels are grouped by the genomic k-mer at the location aassigned by the re-squiggle algorithm. Importantly, in contract to standard reference estimation, the signal is not averaged or otherwise processed at the genomic position stage. This is because each swap base genomic position contains some proportion of canonical and alternative bases. +Given the above described standard and alternative samples, the alternative model estimation procedure begins with the extraction of the current signal level from a number of reads from both samples. These signal levels are grouped by the genomic k-mer at the location assigned by the re-squiggle algorithm. Importantly, in contrast to standard reference estimation, the signal is not averaged or otherwise processed at the genomic position level. This is because each swap base genomic position contains some proportion of canonical and alternative bases. Reads continue to be processed until every k-mer has at least ``--minimum-kmer-observations`` unique event observations.
For PCR'ed samples in particular, the ``filter_coverage`` command can help speed up this processing step if the sample coverage is highly variable. In order to save on the memory footprint, event levels are no longer stored once 10,000 observations have been made for a particular k-mer. Signal Level Density Estimation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Once enough observations have been parsed for each k-mer, a kernel density estimate is computed for each k-mer within the standard and alternative samples. This kernel density estimate can be controled with the ``--kernel-density-bandwidth`` option, though the default value seem to work well. The density estimates can be stored by specifying the ``--save-density-basename`` option, and this is highly recommended as the event extraction can be a long process. Future estimation efforts can then load these density estimates using the ``--alternate-density-filename`` and ``--control-density-filename`` options. Additionally, the ``debug_est_alt.R`` script (found in the ``scripts/`` directory of the repository) can produce some useful visualizations from these files. +Once enough observations have been parsed for each k-mer, a kernel density estimate is computed for each k-mer within the standard and alternative samples. This kernel density estimate can be controlled with the ``--kernel-density-bandwidth`` option. The density estimates can be stored by specifying the ``--save-density-basename`` option, and this is highly recommended as the event extraction can be a long process. Future estimation efforts can then load these density estimates using the ``--alternate-density-filename`` and ``--control-density-filename`` options. Additionally, the ``debug_est_alt.R`` script (found in the ``scripts/`` directory of the repository) can produce some useful visualizations from these files. Alternative Base Density Isolation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -60,9 +64,9 @@ After standard and alternative kernel density estimation, an algorithm is applie Alternative Base Incorporation Rate ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The first step in this process is to estimate the fration of each k-mer alternative density composed of canonical signal levels. In order to estimate this value, the ratio of the highest peak of the standard density and the closest peak in the alternative sample density is computed for all k-mers including exactly one swap base. Note that the alternative density is also shifted such that these peaks are exactly overlapping due to scaling issues with a sample composed of a significnat proportion of non-canonical bases. +The first step in this process is to estimate the fraction of each k-mer alternative density composed of canonical signal levels. In order to estimate this value, the ratio of the highest peak of the standard density and the closest peak in the alternative sample density is computed for all k-mers including exactly one swap base. Before this ratio computation, alternative densities are shifted due to scaling issues for highly modified samples. This shift is estimated from the empirical signal level distributions at each non-swap-base-containing k-mer and is fitted with a quadratic function. -Most of these k-mers are likely to shift the signal only slightly (though this may not hold true for extremely large alternative bases).
Some small proportion of k-mers are likely to shift the signal observed significantly such that the standard and alternative base densities are essentially seperated and thus the ratio of these peaks represents close to the true alternative base incorporation. Thus a lower percentile of these ratios is taken as the true ratio of alternative base incorporation. This percentile is defined by the ``--alt-fraction-percentile`` option, with a default value of the first percentile. This value is also printed to stderr during the estimation command as a reference. +Most of these k-mers are likely to shift the signal only slightly (though this may not hold true for large or charged alternative bases). Some small proportion of k-mers are likely to shift the signal observed significantly such that the standard and alternative base densities are essentially separated and thus the ratio of these peaks represents close to the true alternative base incorporation rate. Thus a lower percentile of these ratios is taken as the true rate of alternative base incorporation. This percentile is defined by the ``--alt-fraction-percentile`` option, with a default value of the first percentile. This value is also printed to stderr during the estimation command as a reference. ---- @@ -77,14 +81,14 @@ Most of these k-mers are likely to shift the signal only slightly (though this m Canonical Density "Subtraction" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Once the alternative base incorporation rate has been obtained, the alternative base expected level is computed by first isolating the alternative base density. This computation simply involoves subtracting, from the alternative sample density, the canonical density scaled by the alternative base incorporation rate. Any position where the scaled canonical density is greater than the alternative sample density is set to zero. Each k-mer's expected signal level is then taken as the weighted mean computed from the isolated alternative density. The spread for each k-mer is taken as the globally estimated standard deviation from the canonical model estimation. +Once the alternative base incorporation rate has been obtained, the alternative base expected level is computed by first isolating the alternative base density. This computation simply involves subtracting, from the alternative sample kernel density estimate, the canonical kernel density estimate scaled by the alternative base incorporation rate. Any position where the scaled canonical density is greater than the alternative sample density is set to zero. Each k-mer's expected signal level is then taken as the weighted mean computed from the isolated alternative density. The spread for each k-mer is taken as the globally estimated standard deviation from the canonical model estimation as spread measures from the isolated distribution are not robust. For k-mers not containing any swap bases, the standard model expected level is taken. For k-mers containing more than one swap base, the canonical distribution scaling factor is adjusted appropriately assuming that each swap base has the same estimated incorporation rate. This is why only single swap base k-mers are used in the incorporation rate estimation stage. Alternative Model Output ^^^^^^^^^^^^^^^^^^^^^^^^ -The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``test_significance`` is run with this alternative model, the results of testing against this alternative model are saved with this short name included in the output file. +The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``test_significance`` is run with this alternative model, the results are saved with this short name included in the output Tombo statistics filename.
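+A hedged second-pass sketch using previously saved density estimates and only the options named in this section (file names are placeholders; option values are illustrative):
+
+.. code-block:: bash
+
+    # load densities saved by a prior --save-density-basename run
+    tombo estimate_alt_reference \
+        --control-density-filename saved.control_density \
+        --alternate-density-filename saved.alternate_density \
+        --alternate-model-base C --alt-fraction-percentile 1 \
+        --alternate-model-filename 5mC.alt.tombo.model \
+        --alternate-model-name 5mC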
==================== ``event_resquiggle`` ==================== diff --git a/docs/modified_base_detection.rst index bbdc6bd..f7730de 100644 --- a/docs/modified_base_detection.rst +++ b/docs/modified_base_detection.rst @@ -2,7 +2,7 @@ Modified Base Detection *********************** -Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while also enabling more accurate detection of specific known modifications (currently only 5-methyl cytosine, but more coming soon). +Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while also enabling more accurate detection of specific known modifications when applicable. ---- @@ -14,39 +14,43 @@ Tombo enables three methods for detecting shifts in current signal level, indica ---- -All three methods are accessed by the ``test_significance`` tombo subcommand as described below. +All three methods are accessed by the ``test_significance`` Tombo sub-command as described below. TL;DR: -* To identify 5-methyl cytosine (5mC) run ``test_significance`` with the ``--alternate-bases 5mC`` option -* For more experimental de novo modified base detection simply run ``test_significance`` with a set of reads +* To identify 5-methylcytosine (5mC) and N6-methyladenosine (6mA), run ``test_significance`` with the ``--alternate-bases 5mC 6mA`` option +* For more experimental de novo modified base detection simply run ``test_significance`` with just a set of reads * For modified base detection via comparison to a control sample (e.g. PCR) run ``test_significance`` with a control set of reads (``--control-fast5-basedirs``) -* The ``test_significance`` command will produce a binary file (not intended for direct use) +* The ``test_significance`` command will produce a binary file (not intended for use outside the Tombo framework) + - To extract useful text files use the ``write_wiggles`` command - To visualize raw signal around significant regions use the ``plot_most_significant`` command + - To assess testing results around a known motif use the ``plot_motif_with_stats`` and ``plot_roc`` commands .. - - Note that the ``resquiggle`` command must be run on a set of reads before processing with ``test_significance``. +.. hint:: + + The ``resquiggle`` command must be run on a set of reads before processing with ``test_significance``. ------------------- Statistical Testing ------------------- -For all statistical testing methods, the result is a binary Tombo statistics file. This file contains statistics associated with each genomic base (per-read output is not currently supported, but may be in the future).
This file is not intended for direct use, but several other Tombo commands (``write_wiggles``, ``write_most_significant_fasta``, ``plot_most_significant``, etc.) take the statistics file as an input, accomodating many user pipelines downstream of modified base detection. +For all statistical testing methods, the result is a binary Tombo statistics file. This file contains statistics associated with each validly tested genomic base. This file is not intended for use outside of the Tombo framework. Several Tombo commands (e.g. ``write_wiggles``, ``write_most_significant_fasta`` and ``plot_most_significant``) take the statistics file as an input, accommodating many user pipelines downstream of modified base detection. + +Of particular interest, the statistics file contains the fraction of reads at each genomic position passing a set threshold (``--single-read-threshold``). This value is set to a default of 1% p-value for hypothesis testing methods (de novo and control sample comparison) and a log likelihood ratio of 0.5 for the alternative model testing method. Note that for likelihood ratio test fractions, some reads will fall between the +/- threshold values. The number of reads falling outside of the threshold values is saved under the ``valid_cov`` column in the statistics file. -Of particular interest, the statistics file contains the fraction of reads at each genomic position passing a set threshold (``--single-read-threshold``). This value is set to a default of 1% p-value for hypothesis testing methods (de novo and control sample comparison) and a log likelihood ratio of 2 for the alternative model testing method. Note that for likelihood ratio test fractions, some reads will fall between the +/- threshold values and so the sum of ``frac`` and ``alt_frac`` may not be 1. +For the de novo and alternative model testing approaches a default canonical model is used (included with Tombo code base). Users may also train their own canonical Tombo model (possibly for an older chemistry version) and test against this model using the hidden ``--tombo-model-filename`` option. See more in the :doc:`model_training` section. -For the de novo and alternative model testing approaches a default canonical model is used (included with Tombo code base). Users may also train their own canonical Tombo model (possibly for an older chemistry version) and test agaist this model using the ``--tombo-model-filename`` option. See more in the Model Training documentation section. +Also available from the ``test_significance`` sub-command is a per-read binary (HDF5) statistics file (via ``--per-read-statistics-basename`` option). This file is currently made available for research on per-read modified base detection including plotting via the ``plot_per_read`` command. For advanced research, the per-read statistics data can be accessed (including random access to particular regions of the genome) using the ``tombo.tombo_stats.PerReadStats`` class from the Tombo python API.
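+For example, a hedged sketch of a run producing both the per-base statistics files and the per-read statistics file (the basenames are placeholders; all options are named above):
+
+.. code-block:: bash
+
+    tombo test_significance --fast5-basedirs path/to/native/fast5s/ \
+        --alternate-bases 5mC 6mA --statistics-file-basename sample_alt_model \
+        --per-read-statistics-basename sample_alt_model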
-5mC (DNA) Detection -=================== +Alternative Model Method +======================== -In order to specifically detect 5-methyl cytosine (as well as other alternative bases soon), use the ``test_significance`` command with the ``--alternate-bases`` option. Users may also train their own alternative base Tombo models and test against these with the ``--alternate-model-filenames`` option. See more details in the Model Training documentation section. +In order to specifically detect 5mC and 6mA, use the ``test_significance`` command with the ``--alternate-bases`` option. Users may also train their own alternative base Tombo models and test against these with the hidden ``--alternate-model-filenames`` option (this option is hidden from the command line help as it is intended only for advanced users). See more details in the :doc:`model_training` section. -This will perform a log likelihood ratio test using the default canonical and 5mC alternative models provided with Tombo. This likelihood ratio is computed over all positions modeled. The default model is a 6-mer, so the signal at the 6 surrounding genomic contexts, contribute to the log likelihood ratio test statistic at any one position. +This will perform a log likelihood ratio test using the default canonical and the specified alternative models provided with Tombo (5mC and 6mA). This likelihood ratio is computed over all positions modeled. The default DNA model is a 6-mer, so the signal at the six surrounding genomic bases contributes to the log likelihood ratio test statistic at any one position. -For example with a **C** found in a TGGTA **C** GTCCG context, the signal will be tested against expected canonical and alternative distributions at the following locations:: +For example, for 5mC detection within a TGGTA **C** GTCCG context, the signal will be tested against expected canonical and alternative 5mC levels at the following locations:: TGGTA **C** GTCCG ----------------- @@ -57,28 +61,28 @@ For example with a **C** found in a TGGTA **C** GTCCG context, the signal will b A **C** GTCC **C** GTCCG -New alternative base models will be added as they are trained. This is the perferred method for modified base detection if a model is available for your biological sample of interest. +New alternative base models will be added as they are trained and validated internally. This is the preferred method for modified base detection if a model is available for your biological sample of interest as the exact modification position is identified. .. code-block:: bash tombo test_significance --fast5-basedirs \ - --alternate-bases 5mC --statistics-file-basename sample_5mC_detection + --alternate-bases 5mC 6mA --statistics-file-basename sample_alt_model # if you have trained your own alternative base model tombo test_significance --fast5-basedirs \ - --alternate-model-filename exotic_base.tombo.model \ - --statistics-file-basename sample_exotic_base_detection + --alternate-model-filenames alternative_base.tombo.model \ + --statistics-file-basename sample_user_alt_model -Canonical Sample Comparison Detection -===================================== +Canonical Sample Comparison Method +================================== In order to perform *canonical sample comparison* modified base detection, use the ``test_significance`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR) via the ``--control-fast5-basedirs``. -For each sample read, this will perform a hypothesis test against a normal distribution estimated from the the signal level observed from the control sample at each genomic position. This method provides the highest accuracy (as effects outside of the default modeled 6-mer are accounted for in the control sample), but does not always identify the exact modification position or the identity of the modified base.
+For each sample read, this will perform a hypothesis test against a normal distribution estimated from the signal level observed from the control sample reads at each genome position. This method provides the highest accuracy (as effects outside of the default modeled 6-mer are accounted for in the control sample), but does not always identify the exact modification position or identity of the modified base. -Note that no global model is used in the application of this method. Instead the testing null distribution is estimated at each genomic location. +Note that no model is used in the application of this method. Instead, the testing null distribution is estimated at each genomic location from the control set of reads. -For both this method, as well as the canonical model method, the ``--fishers-method-context`` option will combine test values, using `Fisher's Method `_, over a moving window extending a number of positions in either direction. Due to the nature of nanopore sequencing, the genomic context surrounding the read head effect that current at any position. Thus shifts in signal due to a modified base may occur at several positions to either side of the true modified location. Thus combining statistical test values across several genomic positions can help to center significant values on the truely modified position. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3. +For both this method, as well as the canonical model method, the ``--fishers-method-context`` option will combine test values, using `Fisher's Method `_, over a moving window extending a number of positions in either direction. Due to the nature of nanopore sequencing, the genomic context surrounding the read head affects the current at any position. Shifts in signal due to a modified base may thus occur at several positions to either side of the true modified location, and combining statistical test values across several genomic positions can help to center significant values on the truly modified position. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3. .. code-block:: bash tombo test_significance --fast5-basedirs \ --control-fast5-basedirs \ --statistics-file-basename sample_canonical_compare
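+A hedged variant of the command above adding the ``--fishers-method-context`` option discussed in this section (the value 2 is illustrative):
+
+.. code-block:: bash
+
+    tombo test_significance --fast5-basedirs path/to/native/fast5s/ \
+        --control-fast5-basedirs path/to/amplified/fast5s/ \
+        --fishers-method-context 2 \
+        --statistics-file-basename sample_canonical_compare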
-De novo Non-canonical Base Detection -==================================== +De novo Non-canonical Base Method +================================= -In order to perform de novo non-canonical base detection, use the ``test_significance`` command with no other options (or a canonical Tombo model, ``--tombo-model-filename``, if not using the default canonical Tombo model). +In order to perform de novo non-canonical base detection, use the ``test_significance`` command with no other options (aside from the set of reads to test). -For each read, this will perform a hypothesis test against the canonical model based on the genomic sequence at each position. Note that this method can be quite error prone and will likely result in a high number of false positives, but may be of use in a research and development setting. This method also has the least requirements of the three methods, requiring only a set of reads and a genome, allowing any nanopore researcher to start investigating modified bases. +For each read, this will perform a hypothesis test against the canonical model based on the genomic sequence at each position. Note that this method can be quite error prone and may result in a high false positive rate, but may be of use in a research and development setting. This method also has the lowest barrier to entry, requiring only a set of reads and a genome, allowing any nanopore researcher to start investigating modified bases. .. code-block:: bash @@ -102,22 +106,36 @@ For each read, this will perform a hypothesis test against the canonical model b Multi-processing ---------------- -Tombo statistical testing provides the option to perform testing spread acorss multiple processes. This also allows users to limit the memory requirement for processing statistical testing, as all events for all reads covering a region must be read into memory at once to perform testing. If the ``test_significance`` command seems to be using too much memory, consider setting the ``--multiprocess-region-size`` to a lower value. +Tombo statistical testing provides the option to perform testing spread across multiple processes. This also limits the memory requirement for modified base detection, as all testing values across a region are held in memory. If the ``test_significance`` command seems to be using too much memory, consider lowering the ``--multiprocess-region-size`` value. -Multi-processing is performed over batches delineated by regular intervals across chromosomes covered by at least one read. The interval size is determined by the ``--multiprocess-region-size`` option and processed by ``--processes`` individual processes independently. The produced per-base results are identical no matter the multi=processing options selected. +Multi-processing is performed over batches delineated by regular intervals across chromosomes covered by at least one read. The interval size is determined by the ``--multiprocess-region-size`` option and processed by ``--processes`` individual processes independently. The produced per-base results are identical no matter the multi-processing options selected. These regions are also used as batches to store the per-read statistics file. ---------------------------- Tombo Statistics File Format ---------------------------- -While the Tombo statistics file is meant to be a binary file not processed by outside tools its contents are described here. The Tombo statistics file is in the HDF5 format. There is one attribute at the root level, ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``). +While the Tombo statistics file is meant to be a binary file not processed by outside tools, its contents are described here for completeness. The Tombo statistics file is in the HDF5 format. There is one attribute at the root level, ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``). + +The per-base statistics are stored in a dataset, ``stats``, containing one record for each genomic base. Each record contains the following attributes: ``frac``, ``pos``, ``chrm``, ``strand``, ``cov``, ``control_cov``, and ``valid_cov``. + +``pos``, ``chrm`` and ``strand`` define the zero-based genomic position for this record. -The per-base statistics are stored in a dataset, ``stats``, containing one record for each genomic base. Each record contains the following attributes: ``stat``, ``mt_stat``, ``frac``, ``alt_frac``, ``pos``, ``chrm``, ``strand``, ``cov`` and ``control_cov``. +``frac`` contains the fraction of valid (not including positions with ``-single_read_threshold < stat < single_read_threshold``) reads at this genomic position identified as the standard base. -``pos``, ``chrm`` and ``strand`` define the genomic position for this record. Note that position is 0-based indexing, and care should be taken if using this to compare to other genomic data sets. +``cov``, ``control_cov``, and ``valid_cov`` contain the read coverage at the genomic position for the sample and control reads. ``control_cov`` is only applicable for the control sample comparison testing method. ``valid_cov`` contains the number of reads contributing to the ``frac`` of tested reads as defined by ``--single-read-threshold`` (only applicable for the alternative model comparison method; set to ``cov`` for other methods).
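+For advanced users, a minimal Python sketch for inspecting these records with ``h5py`` (the root-level location of the ``stats`` dataset is an assumption; the attribute and column names are those described above, and the file name follows the earlier examples):
+
+.. code-block:: python
+
+    import h5py
+
+    # open a statistics file produced by test_significance
+    with h5py.File('sample_alt_model.5mC.tombo.stats', 'r') as stats_fp:
+        # root attribute naming the testing method
+        print(stats_fp.attrs['stat_type'])
+        # assumed root-level per-base records dataset
+        stats = stats_fp['stats'][:]
+        for rec in stats[:5]:
+            print(rec['chrm'], rec['pos'], rec['strand'],
+                  rec['frac'], rec['cov'], rec['valid_cov'])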
+``frac`` contains the fraction of valid reads (those with test statistics outside the interval ``-single_read_threshold < stat < single_read_threshold``) identified as the standard base at this genomic position. -``pos``, ``chrm`` and ``strand`` define the genomic position for this record. Note that position is 0-based indexing, and care should be taken if using this to compare to other genomic data sets. +``cov``, ``control_cov``, and ``valid_cov`` contain the read coverage at the genomic position for the sample and control reads. ``control_cov`` is only applicable for the control sample comparison testing method. ``valid_cov`` contains the number of reads contributing to the ``frac`` of tested reads as defined by ``--single-read-threshold`` (only applicable for the alternative model comparison method; set to ``cov`` for other methods). +------------------------------- +Per-read Statistics File Format +------------------------------- + +Per-read statistics can be stored by passing the ``--per-read-statistics-basename`` option to the ``test_significance`` command. This output file can then be used in downstream Tombo sub-commands (currently only via the ``plot_per_read`` command). + +For advanced users, the Tombo per-read statistics file can be accessed via the Tombo python API using the ``tombo.tombo_stats.PerReadStats`` class. The class is initialized simply with the per-read statistics filename. The ``PerReadStats`` class supports the ``get_region_stats`` function, which takes a ``tombo.tombo_helper.intervalData`` object specifying the interval of interest. This will return a numpy array containing a record for each read (specified by the ``read_id`` field) and each tested genomic position (``pos`` field) along with the test statistic (``stat`` field) at that location. + +.. important:: + + All other optional arguments to the ``tombo.tombo_stats.PerReadStats`` constructor should be left as ``None``; setting these values will delete the file and construct a blank per-read statistics file. -``cov`` and ``control_cov`` contain the read coverage at the genomic position for the sample and control reads (control coverage is only applicable for the control sample comparison testing method). +The per-read statistics file is in the HDF5 format. All blocks are stored within the ``Statistic_Blocks`` slot. The size of the blocks is stored in the ``block_size`` attribute (defined by the ``--multiprocess-region-size`` option) and the type of statistical test applied is stored in the ``stat_type`` attribute. -``stat`` and ``mt_stat`` contain the "averaged" statistical testing values at each genomic location. ``mt_stat`` contains the `Benjamini-Hochberg `_ multiple testing corrected testing value. For the hypothesis testing based methods (de novo and control sample comparison) this "average" is taken as the `Fisher's method `_ combination of p-values for each read at this genomic position. Note that this is highly dependent on read coverage and thus should be used with caution. For the alternative model comparison method, the "average" is taken as the average log likelihood test statistics over all reads covering this genomic position.
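+A minimal sketch of this API (the per-read statistics filename and the interval coordinates are illustrative, and the ``intervalData`` arguments are assumptions):
+
+.. code-block:: python
+
+    from tombo import tombo_helper, tombo_stats
+
+    # open an existing per-read statistics file; leave all other
+    # constructor arguments unset (see the important note above)
+    per_read_stats = tombo_stats.PerReadStats('sample.5mC.tombo.per_read_stats')
+    int_data = tombo_helper.intervalData(
+        chrm='chromosome', start=1000, end=1100)
+    # numpy record array with read_id, pos and stat fields
+    reg_per_read_stats = per_read_stats.get_region_stats(int_data)
+
+Each genomic block is stored in a different ``Block_[##]`` slot.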
These slots do not have any particular order. Within each block, the ``chrm``, ``strand`` and ``start`` of the block are stored. The block statistics are stored in the ``block_stats`` slot. Per-read statistics contain a record for each tested location within each read. Each record contains the genomic position (``pos``), the test statistic (``stat``; hypothesis test p-value or log likelihood ratio as indicated by the statistic type), and the ``read_id``. A single read spanning multiple blocks will contain statistics in more than one block. An individual read's statistics can be reconstructed using the ``read_id`` field. diff --git a/docs/plotting.rst b/docs/plotting.rst index 775e1d6..86fd66c 100644 --- a/docs/plotting.rst +++ b/docs/plotting.rst @@ -2,7 +2,7 @@ Plotting Commands ***************** -In order to enhance modified base detection and give users a better understanding of raw nanopore data, Tombo provides a number of plotting commands. +In order to enhance modified base detection and give users a better grasp of raw nanopore data, Tombo provides a number of plotting commands. ------------------------ Genome Anchored Plotting ------------------------ @@ -34,9 +34,9 @@ These plotting commands produce raw signal level plots such at the example below Model Plotting ^^^^^^^^^^^^^^ -Plots are also enabled to visualize the different testing frameworks available in Tombo. Thses plots would additionally include a control sample, the standard model or any non-standard base model, visualizing the control sample comparison, de novo testing and log likelihood ratio tests respectively. +Plots are also enabled to visualize the different testing frameworks available in Tombo. These plots can include a control sample, the standard model, or an alternative base model, visualizing the control sample comparison, de novo, and log likelihood ratio tests, respectively. -Control these plots with these options: ``--control-fast5-basedirs``, ``--tombo-model-filename``, ``--alternate-model-filename``, ``--plot-standard-model``, and ``--plot-alternate-model`` +Control these plots with these options: ``--control-fast5-basedirs``, ``--plot-standard-model``, ``--plot-alternate-model 5mC``, ``--tombo-model-filename``, and ``--alternate-model-filename``. ---- @@ -63,7 +63,7 @@ Control these plots with these options: ``--control-fast5-basedirs``, ``--tombo- Over-Plotting ^^^^^^^^^^^^^ -When high coverage regions are plotted the raw signal plots can become less interpretable. By default, when read coverage exceeds 50X reads are randomly downsampled (change this option with ``--overplot-threshold``). Three additional over-plotting options (boxplot, quantile and density) are available as shown below (chose which over-plotting type to use with the ``--overplot-type`` option). +When high coverage regions are plotted, the raw signal plots can become less interpretable. By default, when read coverage exceeds 50X, reads are randomly downsampled to 50X coverage (change this threshold with the ``--overplot-threshold`` option). Three additional over-plotting types (boxplot, quantile and density) are available as shown below (choose which over-plotting type to use with the ``--overplot-type`` option). ---- @@ -90,7 +90,7 @@ When high coverage regions are plotted the raw signal plots can become less inte Per-read Plotting ^^^^^^^^^^^^^^^^^ -As testing is applied on a per-read setting, per-read statistics plots are also available.
As per-read statistics are not stored in the current Tombo framework, test values are re-computed for this plotting command (and the control sample comparison method is not currently enabled). Create these plots with the ``plot_per_read`` command. +All testing in the Tombo framework is applied on a per-read basis; to visualize these per-read results, per-read statistics plots are available. Per-read statistics are an optional output from the ``test_significance`` command via the ``--per-read-statistics-basename`` option, and the output file specified by this option is required in order to plot per-read statistics. Create these plots with the ``plot_per_read`` command. ---- @@ -144,7 +144,34 @@ In order to investigate the k-mer signal current levels of a particular set of r ---- +ROC Curve +^^^^^^^^^ + +In order to validate the performance of significance testing results at a known sequence motif, the ``plot_roc`` command is provided. This command takes a Tombo statistics file, corresponding motif descriptions and the genome FASTA file. The "area under the curve" (AUC) for each motif is printed and the precision-recall curve is also plotted for each motif on the second page of the resulting PDF. Note that only genomic positions with the canonical base of interest are included in the results from this command. + +Below is an example command and resulting plot for identifying the known dam and dcm methylase contexts in E. coli using all three provided testing methods. + +.. code-block:: bash + + tombo plot_roc \ + --statistics-filenames vs_pcr.tombo.stats de_novo.tombo.stats \ + 5mC_model.5mC.tombo.stats 6mA_model.6mA.tombo.stats \ + --motif-descriptions CCWGG:2:"dcm 5mC Sample Comp"::GATC:2:"dam 6mA Sample Comp" \ + CCWGG:2:"dcm 5mC De novo"::GATC:2:"dam 6mA De novo" \ + CCWGG:2:"dcm 5mC Alt Comp" GATC:2:"dam 6mA Alt Comp" \ + --genome-fasta e_coli.fasta + +---- + +.. figure:: _images/roc.png + :align: center + :scale: 30% + + Example ROC curve plot + +---- + Correction Plotting ^^^^^^^^^^^^^^^^^^^ -Plotting commands, ``plot_correction`` and ``plot_multi_correction``, are provided to visualize the old event-based re-squiggle process. These commands are thus only applocable on reads that have been processed with ``event_reqsuiggle``. +Plotting commands, ``plot_correction`` and ``plot_multi_correction``, are provided to visualize the old event-based re-squiggle process. These commands are thus only applicable to reads that have been processed with ``event_resquiggle``. These commands may be deprecated in the future. diff --git a/docs/resquiggle.rst b/docs/resquiggle.rst index 273d18b..ad50c4a 100644 --- a/docs/resquiggle.rst +++ b/docs/resquiggle.rst @@ -2,23 +2,28 @@ Re-squiggle Algorithm ********************* -The signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a refernce genome, so this algorithm defines a new squiggle sequence assignment, hence a re-squiggle. +The signal level data produced from a nanopore read is referred to as a squiggle. Base calls produced from this squiggle generally contain some errors compared to a reference genome. The re-squiggle algorithm defines a new squiggle to genomic sequence assignment, hence a re-squiggle. -The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle algorithm takes as input a FAST5 containing raw signal and associated base calls.
The base calls are mapped to a genome reference and then the raw signal is assigned to the genomic sequence based on an exprected current level model. +The re-squiggle algorithm is the basis for the Tombo framework. It takes as input a read file (in FAST5 format) containing raw signal and associated base calls. The base calls are mapped to a genome reference and then the raw signal is assigned to the genomic sequence based on an expected current level model. TL;DR: -* Re-squiggle must be run before any other Tombo command (aside from the `annotate_raw_with_fastqs` pre-processing sub-command) -* Minimally the command takes a directory containing FAST5 files, a genome reference and an executable mapper. -* FAST5 files must contain basecalls (as produced by albacore in fast5 mode or added with `annotate_raw_with_fastqs`), but need not contain the "Events" table -* Tombo currently only supports R9.4 and R9.5 data (via included default models). Other data may produce sub-optimal results -* DNA and RNA reads will be detected automatically and processed accordingly (set explicitly with `--dna` or `--rna`) +* Re-squiggle must be run before any other Tombo command (aside from the ``annotate_raw_with_fastqs`` pre-processing sub-command). +* Minimally the command takes a directory containing FAST5 files and a genome reference. + + - Genome reference may be previously known or discovered from this sample. +* FAST5 files must contain basecalls (as produced by albacore in fast5 mode or added with ``annotate_raw_with_fastqs``), but need not contain the "Events" table. +* Tombo currently only supports R9.4 and R9.5 data (via included default models). Other data may produce sub-optimal results. +* DNA and RNA reads will be detected automatically and processed accordingly (set explicitly with ``--dna`` or ``--rna``). + + - Tombo does not perform spliced mapping. Thus a transcriptome reference must be passed to the re-squiggle command for RNA samples. For further details on Tombo RNA processing, see the :doc:`rna` section. +* Run ``resquiggle`` over multiple cores with the ``--processes`` option. ----------------- Algorithm Details ----------------- -The re-squiggle algorithm occurs in 4 main steps described below. +The re-squiggle algorithm occurs in four main steps described below. * Genome Mapping * Event Detection @@ -28,29 +33,33 @@ The re-squiggle algorithm occurs in 4 main steps described below. Genome Mapping -------------- -The genome mapping is performed via one of the Tombo supported mapping programs (``minimap2``, ``graphmap`` and ``bwa-mem``). This operation can be spread over multiple processes via the ``--processes``, ``--align-processes`` and ``--align-threads-per-process`` arguments. Each process pulls reads off a queue to run mapping operations in batches of ``--alignment-batch-size``. These mappings are then processed to extract the genomic sequence and these sequences are placed in a second queue to be processed later. +The genome mapping is performed via the python API to ``minimap2`` (`mappy python package `_).
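+A rough sketch of the underlying mapping call via the ``mappy`` API (illustrative only, not Tombo's exact invocation; the genome filename and read sequence are hypothetical):
+
+.. code-block:: python
+
+    import mappy
+
+    # build the minimap2 index once and re-use it for all reads
+    aligner = mappy.Aligner('genome.fasta', preset='map-ont')
+    # map a basecalled read sequence and inspect the mapped location
+    for hit in aligner.map('ACGTTACGGTTACGCAT'):
+        print(hit.ctg, hit.r_st, hit.r_en, hit.strand, hit.mapq)
+
-Read base called sequence location within the FAST5 file is defined by the ``--basecall-group`` and ``--basecall-subgroups`` command line options. The default values of these parameters point to the default location for base calls from albacore. +The location of the base called sequence within the FAST5 file is defined by the ``--basecall-group`` and ``--basecall-subgroups`` command line options.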
The default values of these parameters point to the default location for base calls from albacore or ``annotate_raw_with_fastqs``. -Note that each process currently reads the whole reference genome into memory in order to process SAM formated mappings. Take care when running Tombo on larger genomes ot avoid overflow a systems memory. This is true even if the optional ``--minimap2-index`` parameter is provided. The minimap2 index parameter only effects the mapping call itself. +The genomic sequence for each successfully mapped read is then passed on to the sequence to signal assignment stage. -When using ``bwa-mem`` note that the reference fasta must be indexed. Unfortunately, this error is currently not caught until after extracting read sequence and attempting to map, and will result in all reads failing due to no apparent alignment being produced (results in failed read message ``Alignment not produced (if all reads failed check for index files)``). +.. tip:: + + Unless the optional dependency ``pyfaidx`` is installed (included in default conda installation), each process reads the whole reference genome into memory in order to extract genomic sequence. Take care when running Tombo on larger genomes to avoid overflowing the system's memory. This is true even if the optional ``--minimap2-index`` parameter is provided. The minimap2 index parameter only affects the mapping call itself. Event Detection --------------- -The Tombo algorithm does not require the "events" (blocks of raw signal potentially corresponding to bases) table. Instead, Tombo discovers events from the raw signal. This segmented signal makes downstream processing steps more efficient and stable. This event detection algorithm is different from the event detection performed in albacore, but produces very simimlar results. +The Tombo algorithm does not require the "Events" table (raw signal assignment to base calls). Instead, Tombo discovers events from the raw signal. This segmented signal makes downstream processing steps more efficient and stable. This event detection algorithm is different from the event detection performed in previous versions of albacore, but produces similar results. -Events are determined by identifying large shifts in current level, by taking the running difference between neighboring windows of raw signal (3 observations for DNA and 20 for RNA). The largest jumps are chosen as the breakpoints between events. The mean of normalized raw signal is then computed for each event. +Events are determined by identifying large shifts in current level, by taking the running difference between neighboring windows of raw signal (explicitly set this parameter with the ``--segmentation-parameters`` option). The largest jumps (or most significant via a t-test for RNA) are chosen as the breakpoints between events. The mean of normalized raw signal is then computed for each event.
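+The following is a rough, illustrative numpy sketch of this running-difference segmentation (and of the median normalization described next); Tombo's implementation and default parameter values differ:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def normalize_signal(raw_signal, outlier_threshold=5.0):
+        # shift by the median and scale by the median absolute deviation
+        shift = np.median(raw_signal)
+        scale = np.median(np.abs(raw_signal - shift))
+        # winsorize extreme values at the outlier threshold
+        return np.clip((raw_signal - shift) / scale,
+                       -outlier_threshold, outlier_threshold)
+
+    def detect_event_means(norm_signal, window=3, n_events=100):
+        # running difference between sums of neighboring signal windows
+        csum = np.concatenate([[0.0], np.cumsum(norm_signal)])
+        win_sums = csum[window:] - csum[:-window]
+        diffs = np.abs(win_sums[window:] - win_sums[:-window])
+        # the largest jumps become breakpoints between events
+        breaks = np.sort(np.argsort(diffs)[-(n_events - 1):] + window)
+        return np.array([seg.mean()
+                         for seg in np.split(norm_signal, breaks)])
+
-Signal normalization is performed on each read by shifting to a median of zero and scaling with the median absolute deviation (MAD). By default, a global scaling value is taken as the mean of the MAD from a random sample of reads and used to scale all reads. This behaviour can be overriden with ``--fit-scale-per-read`` option to fit the MAD scaling value from each read seperately or the ``--fixed-scale`` argument to manually set the scaling value (advanced users only). Raw signal is also windsorized based on the value of the ``--outlier-threshold`` parameter. These scaling parameters are stored in the Tombo slot for access in later commands.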
+Raw signal normalization estimates a median shift parameter and a median absolute deviation (MAD) scale parameter. By default, a global scale value is taken as the mean of MAD computed from a random sample of reads and used to scale all reads. This behaviour can be overridden with the ``--fit-scale-per-read`` option or the ``--fixed-scale`` option to manually set the global scaling value (advanced users only). Raw signal is also winsorized, based on the ``--outlier-threshold`` parameter. These scaling parameters are stored in the Tombo FAST5 slot for access in later commands. Note that only median signal normalization is available within the Tombo framework. + +The ``--segmentation-parameters`` values have been optimized for DNA and RNA data types, so DNA and RNA read types should not be mixed in processing. Sequence to Signal Assignment ----------------------------- -Given the mapped genomic sequence and segmented signal, the sequence to signal assignment algorithm finds the most likely matching of these two. +Given the mapped genomic sequence and segmented signal, the sequence to signal assignment algorithm finds the most likely matching between these two. -The algorithm first uses a large bandwidth (2000 events over the first 500 genomic bps) to identify the start of the genomic sequence within the events (see figure below). This is necessary as some portion at the beginning of a read is not base called and some additional sequence may have been trimmed by the alignment. The matching is determined by applying a dynamic programming algorithm to find the most likely matching between the event signal levels and the expected signal levels given the genomic sequence. +The algorithm first uses a large bandwidth (5000 events over the first 500 genomic bps) to identify the start of the genomic sequence within the events (see figure below). This is necessary as some portion at the beginning of a read is not base called and some additional sequence may have been trimmed from the alignment. The matching is determined by applying a dynamic programming/dynamic time warping algorithm to find the most likely matching between the event signal levels and the expected signal levels given the genomic sequence. ---- @@ -64,15 +73,17 @@ The algorithm first uses a large bandwidth (2000 events over the first 500 genom :align: center :scale: 110% - Read start forward pass scores + Read start forward pass scores and traceback path ---- -First a static banded matrix is constructed by computing the z-score on each event level against each genomic position (see figure **a** above). The absolute value is then taken of this z-score and then flipped and shifted such that the expected value given a correct matching is zero. A forward pass computes the maximal cummulative score up to each matched event to genome position (see figure **b** above). At each iteration the maximal score is taken over three possibilities 1) staying in the same genomic position, and accumulating the shifted z-score 2) matching an event with a genomic position (with score bonus defined by ``--match-expected-value``) 3) skipping this genomic position (with a score penalty defined by ``--skip-penalty``). From this forward pass the maximal score along the last genomic position is taken and traced back to obtain the starting position of matching sequence and signal. +A static banded matrix is constructed by computing the z-score for event level (x-axis) against genomic positions (y-axis).
The negative absolute value z-score is shifted to an expected value of zero to fill the banded matrix (see figure **a** above). A forward pass computes the maximal cumulative score up to each matched event to genome position (see figure **b** above). + +At each iteration the maximal score is taken over three possibilities: 1) staying in the same genomic position, accumulating the shifted z-score, 2) matching an event with a genomic position (with a score bonus), or 3) skipping this genomic position (with a score penalty). The match score bonus and skip penalty are defined by the ``--signal-align-parameters`` option. The default values have been optimized for DNA and RNA data types. From this forward pass, the maximal score along the last genomic position is taken and traced back to obtain the starting position of matching sequence and signal. -If a read is short enough (less than 2,500 events or less than 500 bps of called sequence), then the whole sequence to signal matching will be performed with a single run with an appropriate static bandwidth. +If a read is short enough (less than 5500 events or less than 500 bps of called sequence), then the whole sequence to signal matching will be performed in a single run with an appropriate static bandwidth. -For longer reads, the above computed start matching position is taken and then the same dynamic programming solution is applied except a smaller (definied by ``--bandwidth``) adaptive band is now used (see figure below). At each genomic position, the band position is defined to center on the current maximal score of the forward pass. This aims to ensure that the traceback path will remain within the adaptive window. There are edge cases where the valid matching leaves the adaptive band. These reads are filtered out and included in the failed read group ``Read event to sequence alignment extends beyond --bandwidth``. +For longer reads, the above computed start matching position is taken and then the same dynamic programming solution is applied, except that a smaller adaptive band is now used (see figure below). The bandwidth is defined by the ``--signal-align-parameters`` option and again has been optimized for DNA and RNA data types. At each genomic position, the band position is defined to center on the maximal score of the forward pass from the previous base. This aims to ensure that the traceback path will remain within the adaptive window. There are edge cases where the valid matching leaves the adaptive band. These reads are filtered out and included in the failed read group ``Read event to sequence alignment extends beyond --bandwidth``.
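+A simplified, un-banded sketch of the forward-pass recursion described above (illustrative only; Tombo's banded cython implementation and its optimized parameter values differ):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def forward_pass(shifted_z, match_bonus=1.0, skip_pen=1.0):
+        # shifted_z[i, j]: shifted negative absolute z-score of event i
+        # against the expected signal level at genomic position j
+        n_events, n_pos = shifted_z.shape
+        fwd = np.full((n_events, n_pos), -np.inf)
+        fwd[0, 0] = shifted_z[0, 0]
+        for i in range(n_events):
+            for j in range(n_pos):
+                if i == 0 and j == 0:
+                    continue
+                best = -np.inf
+                if i > 0:  # stay in the same genomic position
+                    best = max(best, fwd[i - 1, j] + shifted_z[i, j])
+                if i > 0 and j > 0:  # match event to genomic position
+                    best = max(best, fwd[i - 1, j - 1] +
+                               shifted_z[i, j] + match_bonus)
+                if j > 0:  # skip this genomic position
+                    best = max(best, fwd[i, j - 1] - skip_pen)
+                fwd[i, j] = best
+        return fwd
+
---- @@ -93,58 +104,60 @@ For longer reads, the above computed start matching position is taken and then t Resolve Skipped Bases --------------------- -After the dynamic programming step, skipped bases must be resolved using the raw signal to obtain a matching of each genomic base to a bit of raw signal. A region around each skipped genomic base is identified. Then a dynamic programming algorithm very similar to the last step is performed, except the raw signal is used instead of events and the skip move is not allowed. Additionally, the algorithm forces each genomic base to be assigned at least 3 raw observations to produce more robust assignments. After this procedure the full genomic sequence has raw signal assigned. +After the dynamic programming step, skipped bases must be resolved using the raw signal to obtain a matching of each genomic base to a bit of raw signal.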
A window around each skipped genomic base is identified. If a window does not contain enough raw signal to perform a raw signal search, the window is expanded until enough signal is found. Overlapping windows are collapsed into a single window. + +After deletion windows are identified, a dynamic programming algorithm very similar to the last step is performed. Importantly, the raw signal is used instead of events and the skip move is no longer allowed. Additionally, each genomic base is forced to contain a minimal number of raw observations to produce more robust assignments (explicitly set this value with the ``--segmentation-parameters`` option). This completes the re-squiggle procedure, producing a matching of a read's raw signal to the mapped genomic sequence. ------------------------------- Common Failed Read Descriptions ------------------------------- -``Alignment not produced (if all reads failed check for index files)`` - -* This error indicates that either the mapping program did not produce a valid mapping or index files are not available (primarily for BWA-MEM). - ``Fastq slot not present in --basecall-group`` ``Raw data is not found in Raw/Reads/Read_[read#]`` * These errors indicate that a necessary piece of information for Tombo to run was not found in the FAST5 file. +``Alignment not produced`` + +* This error indicates that minimap2 (via the mappy API) did not produce a valid mapping. + ``Could not close FAST5 file. Possibly corrupted file`` -* This error indicates that an unexpected error occurred trying to open or close a read file. This can happen if zombie processes are still accessing the files or if files are actively being accessed by another program. +* This error indicates that an unexpected error occurred trying to open or close a read file. This can happen if the reads are being actively accessed by another program or if a file has been corrupted. -``Adaptive signal to seqeunce alignment extended beyond raw signal`` -``Read event to sequence alignment extends beyond --bandwidth`` -``Too little signal around event-aligned genomic deletion`` +``Read contains too many potential genomic deletions`` ``Not enough raw signal around potential genomic deletion(s)`` -* These errors indicate that something seemed off when attempting to perform sequence to signal matching +* These errors indicate that the sequence to signal matching algorithm was unable to identify a valid path. -``No valid path found through raw signal of long read`` -``No valid path found through start of raw signal`` -``No valid path found through raw signal of short read`` +``Poor raw to expected signal matching`` +``Poor raw to expected signal matching at read start`` -* These errors indicate that the dynamic programming algorithm produce a poorly scored matching of genomic sequence to raw signal. This likely indicates that either the genomic mapping is incorrect or that the raw signal is of low quality in some sense. +* These errors indicate that the dynamic programming algorithm produced a poorly scored matching of genomic sequence to raw signal. Some potential sources for these errors include incorrect primary genomic mapping, incorrect genome sequence (compared to the biological sample), poor quality raw signal, or a flowcell/library incompatible with the included canonical models (only R9.5/4 flowcells currently supported; 2D reads are not supported; DNA and RNA are supported).
------------------ Tombo FAST5 Format ------------------ -The result of the re-squiggle algorithm writes the sequence to signal assignment back into the read FAST5 files (found in the ``--corrected-group`` slot; the default value is the default for all other Tombo commands to read in this data). When running the re-squiggle algorithm a second time on a set of reads, the --overwrite option is required in order to write over the previous Tombo results. +The result of the re-squiggle algorithm writes the sequence to signal assignment back into the read FAST5 files (found in the ``--corrected-group`` slot; its default value is also the default for all other Tombo commands that read in this data). When running the re-squiggle algorithm a second time on a set of reads, the ``--overwrite`` option is required in order to write over the previous Tombo results. -The Tombo slot contains several useful bits of information. The ``--corrected-group`` slot contains attributes for the signal normalization (shift, scale, upper_limit, lower_limit and outlier_threshold) as well as a binary flag indicating whether the read is DNA or RNA. Within the ``Alignment`` group, the gemomic mapped start, end, strand and chromosome are stored. The mapping statistics (number clipped start and end bases, matching, mismatching, insertioned and deleted bases). Note that this information is not enabled at this time, but should be added back soon. +The ``--corrected-group`` slot contains attributes for the signal normalization (shift, scale, upper_limit, lower_limit and outlier_threshold) as well as a boolean flag indicating whether the read is DNA or RNA. Within the ``Alignment`` group, the mapped genomic start, end, strand and chromosome as well as mapping statistics (number clipped start and end bases, matching, mismatching, inserted and deleted bases) are stored. -The ``Events`` slot contains a matrix containing the matching of raw signal to genomic sequence. This slot contains a single attribute (``read_start_rel_to_raw``) giving the zero-based offset into the raw signal slot for genomic sequence matching. The events table then starts matching sequence from this offset. Each entry in the ``Events`` table indicates the normalized mean signal level (``norm_mean``), optionally (triggered by the ``--include-event-stdev`` option) the normalized signal standard deviation (``norm_stdev``), the start position of this base (``start``), the length of this event in raw signal values (``length``) and the genomic base (``base``). +The ``Events`` slot contains a matrix with the matching of raw signal to genomic sequence. This slot contains a single attribute (``read_start_rel_to_raw``) giving the zero-based offset indicating the beginning of the read genomic sequence within the raw signal. Each entry in the ``Events`` table indicates the normalized mean signal level (``norm_mean``), optionally (triggered by the ``--include-event-stdev`` option) the normalized signal standard deviation (``norm_stdev``), the start position of this base (``start``), the length of this event in raw signal values (``length``) and the genomic base (``base``). This information is accessed as needed for downstream Tombo processing commands.
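+For example, the assignment can be read directly with ``h5py`` (a minimal sketch assuming the default ``--corrected-group`` (``RawGenomeCorrected_000``) and ``--basecall-subgroups`` (``BaseCalled_template``) values; the filename is hypothetical):
+
+.. code-block:: python
+
+    import h5py
+
+    with h5py.File('read.fast5', 'r') as fast5_fp:
+        corr_subgrp = fast5_fp[
+            'Analyses/RawGenomeCorrected_000/BaseCalled_template']
+        events = corr_subgrp['Events']
+        # zero-based offset of the first assigned base within the raw signal
+        offset = events.attrs['read_start_rel_to_raw']
+        # numpy record array with norm_mean, start, length and base fields
+        event_data = events[:]
+
-This data generally adds ~75% to the memory footprint of a minimal FAST5 file (containing raw and sequence data). This may vary across files and sample types. +This data generally adds ~75% to the memory footprint of a minimal FAST5 file (containing raw and sequence data; not including a basecalling Events table).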
This may vary across files and sample types. + +**Important RNA note**: Tombo performs only un-spliced mapping. As such, for potentially spliced transcripts a transcriptome file must be provided. While this makes Tombo RNA processing annotation dependent, the transcriptome is the more appropriate setting for modified base detection and thus this path has been chosen for Tombo RNA processing. More details about RNA processing can be found in the :doc:`rna` section. -**Important RNA note**: RNA reads pass through the pore in the 3' to 5' direction during sequencing. As such, the raw signal and albacore events are stored in the reverse direction from the genome. Tombo events are stored in the opposite direction (corresponding to the genome sequence direction, not sequencing time direction) for several practical reasons. Thus if events are to be compared to the raw signal, the raw signal must be reversed. +**Minor RNA note**: RNA reads pass through the pore in the 3' to 5' direction during sequencing. As such, the raw signal and albacore events are stored in the reverse direction from the genome. Tombo events for RNA data are stored in the opposite direction (corresponding to the genome sequence direction, not sequencing time direction) for several practical reasons. Thus if events are to be compared to the raw signal, the raw signal must be reversed. Tombo RNA models are stored in the same direction and thus may be considered inverted as compared to some other RNA HMM signal level models. ---------------- Tombo Index File ---------------- -By default, Tombo will create a hidden file containing the essential genomic mapping location for each validly processed read. This file will be located alongside the base fast5s directory. For example, if resquiggle is called on this directory ``/path/to/fast5/reads/`` then the following file will be created ``/path/to/fast5/.reads.RawGenomeCorrected_000.tombo.index``. This file should generally be about 1,000 times smaller than the corresponding FAST5s and so should not impeed significantly on disk space. If desired, the index can be skipped with the ``--skip-index`` option. Note that this will make most all downstream commands much slower as each read will be queried to produce the index in memory each time another Tombo command is run. +By default, Tombo will create a hidden file containing the essential genome mapping location for each validly processed read. This file will be located alongside the base fast5s directory. For example, if resquiggle is called on this directory ``/path/to/fast5/reads/`` then the following file will be created ``/path/to/fast5/.reads.RawGenomeCorrected_000.tombo.index``. This file should generally be about 1,000 times smaller than the corresponding FAST5s and so should not incur significantly more disk usage. If desired, the index can be skipped with the ``--skip-index`` option. Note that this will make nearly all downstream commands much slower, and filtering cannot be applied without this index file. ------------------------------- Additional Command Line Options ------------------------------- @@ -156,19 +169,18 @@ Additional Command Line Options ``--obs-per-base-filter`` -* This option applies a filter to "stuck" reads (too many observations per genomic base). This filter is applied only to the index file and can be cleared later. See the Filters section for more details. +* This option applies a filter to "stuck" reads (too many observations per genomic base). This filter is applied only to the index file and can be cleared later.
See the :doc:`filtering` section for more details. --------------------- Pre-process Raw Reads --------------------- -Nanopore raw signal-space data consumes more memory than sequence-space data. As such, many users will produce only FASTQ basecalls initially from a set of raw reads in FAST5 format. The Tombo framework requires the linking of these basecalls with the raw signal-space data. The `annotate_raw_with_fastqs` sub-command is provided to assist with this workflow. +Nanopore raw signal-space data consumes more memory than sequence-space data. As such, many users will produce only FASTQ basecalls initially from a set of raw reads in FAST5 format. The Tombo framework requires the linking of these basecalls with the raw signal-space data. The ``annotate_raw_with_fastqs`` sub-command is provided to assist with this workflow. -Given a directory (or nested directories) of FAST5 raw read files and a set of FASTQ format basecalls from these reads, the `annotate_raw_with_fastqs` adds the sequence information from the FASTQ files to the appropriate FAST5 files. This command generally adds 15-25% to the disk usage for the raw reads. +Given a directory (or nested directories) of FAST5 raw read files and a set of FASTQ format basecalls from these reads, the ``annotate_raw_with_fastqs`` sub-command adds the sequence information from the FASTQ files to the appropriate FAST5 files. This command generally adds 15-25% to the disk usage for the raw reads. This functionality requires that the FASTQ sequence header lines begin with the read identifier from the FAST5 file. This has been tested with the Oxford Nanopore Technologies supported basecaller, albacore. Third-party basecallers may not be processed correctly. .. code-block:: bash tombo annotate_raw_with_fastqs --fast5-basedir --fastq-filenames reads.fastq - diff --git a/docs/rna.rst b/docs/rna.rst new file mode 100644 index 0000000..dc09408 --- /dev/null +++ b/docs/rna.rst @@ -0,0 +1,13 @@ +************** +RNA Processing +************** + +Processing RNA data within the Tombo framework requires some extra care. The major item to consider when performing RNA processing is that a transcriptome reference must be supplied, as spliced mapping is not supported within the Tombo framework. The lack of spliced mapping support within the Tombo framework is a conscious decision for identification of modified RNA bases. This is because the transcriptome is the natural setting for the detection of modified RNA bases. When modified RNA bases are projected onto the genome reference any potential transcript isoform-specific modification information is lost. Leaving open the potential for isoform-specific modified base detection is a main reason for the choice to force mapping modified bases to a transcriptome. Tools to investigate isoform-specific modified bases are a future goal within the Tombo framework. This does pose some informatic challenges for downstream processing of Tombo RNA data. A recommended Tombo RNA processing pipeline will be posted here soon. + +A second minor note: since RNA is currently sequenced in the 3' to 5' direction, special care must be taken when accessing Tombo re-squiggled binary data. The raw signal (from MinKNOW) and albacore basecalled events are stored in the reverse direction from the genome (3' to 5' for reads mapping to the plus genome strand). Tombo events for RNA data are stored in the opposite direction (corresponding to the genome sequence direction, not sequencing time direction) for several practical reasons.
Thus if Tombo events are to be compared to the raw signal, the raw signal must be reversed. Tombo RNA models are stored in this direction as well and thus may be considered inverted as compared to some other RNA HMM signal level models processing data in the sequencing time direction. + +----------------------- +RNA Processing Workflow +----------------------- + +As Tombo RNA processing presents unique informatic challenges, a recommended processing pipeline will be posted here soon. This pipeline aims to address the majority of use cases for RNA modified base detection including porting Tombo results to a genome browser compatible format. Please check back soon for the recommended Tombo RNA processing pipeline! diff --git a/docs/text_output.rst b/docs/text_output.rst index 748a62c..34d605e 100644 --- a/docs/text_output.rst +++ b/docs/text_output.rst @@ -2,32 +2,43 @@ Text Outputs ************ -Two main text outputs are available from Tombo: +Two text outputs are available from Tombo: -1. Wiggle - genome browser statistics +1. Wiggle - Genome browser compatible per-base statistics 2. Fasta - Sequence output surrounding most modified sites ``write_wiggles`` ----------------- -The ``write_wiggles`` command takes in a set of reads (``--fast5-basedirs``) and potentially a control set of reads (``--control-fast5-basedirs``) or a pre-computed statistics file (``--statistics-filename``). Output wiggle files (`variableStep format `_) will be produced for each requested statistic (both plus and minus strands). +The ``write_wiggles`` command takes in a set of reads (``--fast5-basedirs``) and/or a pre-computed statistics file (``--statistics-filename``). A control set of reads can also be provided (``--control-fast5-basedirs``). Output wiggle files (`variableStep format `_) will be produced for each requested statistic (both plus and minus strands). Several statistics are available for output: * ``coverage`` - The coverage level for mapped and validly re-squiggled reads -* ``fraction`` - The fraction of significantly modified reads +* ``dampened_fraction`` - The estimated fraction of significantly modified reads + + - This estimate includes pseudo-counts added to the un-modified and modified read counts (as specified by the ``--coverage-dampen-counts`` option) + - This is equivalent to using a beta prior when estimating the fraction of reads modified at this position (see the sketch below) + - Test the effect of different dampen counts using the ``scripts/test_beta_priors.R`` (the default values are shown below) +* ``fraction`` - The raw fraction of significantly modified reads * ``signal`` - The mean signal level across all reads mapped to this location * ``signal_sd`` - The mean signal standard deviation across all reads mapped to this location (not available unless ``--include-event-stdev`` was provided in resquiggle call) +* ``dwell`` - The mean number of raw observations assigned to this location * ``difference`` - The difference in normalized signal level between a sample and control set of reads -.. +---- - Note that ``signal``, ``signal_sd`` and ``difference`` require each reads event level data to be queried and thus may be quite slow. ``coverage`` and ``fraction`` can be extracted simply from the tombo statistics file, which is much faster. +.. figure:: _images/dampened_fraction.png + :align: center + :scale: 30% + + Heatmap showing the resulting dampened fraction of modified reads given the default ``--coverage-dampen-counts`` values over a range of coverage and number of un-modified reads.
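+A rough sketch of the dampening calculation (assuming un-modified and modified pseudo-counts of 2 and 0.5, the values used in ``scripts/test_beta_priors.R``):
+
+.. code-block:: python
+
+    def dampened_fraction(n_mod, coverage, mod_dampen=0.5, unmod_dampen=2.0):
+        # posterior mean under a Beta(mod_dampen, unmod_dampen) prior
+        return (n_mod + mod_dampen) / (coverage + mod_dampen + unmod_dampen)
+
+    # e.g. a single modified read at 1X coverage is heavily dampened
+    dampened_fraction(1, 1)  # ~0.43 rather than a raw fraction of 1.0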
-Potentially deprecated options: +---- -* ``stat`` - Per base statistical testing -log10(p-values). Test values are quite dependent on read depth and thus this option may be deprecated at some point -* ``mt_stat`` - Multiple testing corrected statistical test values +.. note:: + + ``signal``, ``signal_sd``, ``dwell`` and ``difference`` require each read's event level data to be queried and thus may be quite slow. ``coverage``, ``dampened_fraction``, and ``fraction`` can be extracted simply from the tombo statistics file, which is much faster. Files will be output to individual wiggle files (two per statistic for plus and minus genomic strand) in the following format ``[wiggle-basename].[wiggle-type].[sample|control]?.[plus|minus].wig`` @@ -41,11 +52,8 @@ To run ``write_most_significant_fasta``, a ``--statistics-filename`` is required * ``--num-regions`` - Defines the number of unique locations to be output * ``--num-bases`` - Defines the number of bases to be output surrounding the significant locations -Potentially deprecated options: +The output of this command could be used to determine sequence contexts consistently modified within a sample. An example `meme `_ command line for modified base motif detection is shown below. -* ``--statistic-order`` - Order regions by per-genomic base statistical testing values instead of fraction of reads with significant modified base results +.. code-block:: bash -* ``--q-value-threshold`` - Select the number of regions to output based on a q-value threhsold instead of a set number (This may produce very large files if not set carefully and so this option may be deprecated) -.. - - Note that fraction of reads with a significant result at this location can produce non-optimal results with the alternative base comparison log likelihood ratio test. This may be replaced by an estimated fraction based on testing results instead of the current thresholding criterion.
+ ./meme -oc motif_output.meme -dna -mod zoops tombo_results.significant_regions.fasta diff --git a/scripts/debug_est_alt.R b/scripts/debug_est_alt.R index e7060d8..d71aa54 100644 --- a/scripts/debug_est_alt.R +++ b/scripts/debug_est_alt.R @@ -1,8 +1,11 @@ library(ggplot2) library(ggridges) -densDat <- read.table('debug_est_alt.C.density.txt', header=TRUE) -standardDensDat <- read.table('debug_est_standard_ref.density.txt', header=TRUE) +densBase <- 'debug_est_alt' +altBase <- 'C' + +densDat <- read.table(paste0(densBase, '.alternate_density.txt'), header=TRUE) +standardDensDat <- read.table(paste0(densBase, '.control_density.txt'), header=TRUE) densDat$Sample <- "Alternative" standardDensDat$Sample <- "Standard" @@ -15,29 +18,13 @@ sDiffs <- sort(unlist(lapply(sAllDat, function(x) upDat <- do.call(rbind.data.frame, lapply(names(head(sDiffs, 20)), function(kmer) sAllDat[[kmer]])) dnDat <- do.call(rbind.data.frame, lapply(names(tail(sDiffs, 20)), function(kmer) sAllDat[[kmer]])) -pdf('alternate_model_estimation.density.C.pdf', width=10) -ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + geom_ridgeline(alpha=0.4, size=0, color='white') + - scale_fill_discrete(name='Contains\nAlternative\nBase') + theme_ridges() -ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + geom_ridgeline(alpha=0.4, size=0, color='white') + - scale_fill_discrete(name='Contains\nAlternative\nBase') + theme_ridges() -foo <- dev.off() - - - -dat <- read.table('debug_est_alt.C.txt', header=TRUE, - stringsAsFactors=FALSE, row.names=NULL, sep='\t') - -sd_width <- dat[1,'sd_width'] -min_frac <- dat[1,'min_frac'] - -pdf('alternate_model_estimation.C.pdf', width=10) -ggplot(dat) + - geom_point(aes(x=peaks_diff, color=contains_alt, y=minor_frac), alpha=0.4) + - geom_rect(aes(xmin=xs, xmax=xe, ymin=ys, ymax=ye), - data.frame(xs=c(-3,-sd_width), - xe=c(3,sd_width), - ys=c(0,min_frac), ye=c(min_frac,0.5)), alpha=0.2) + - xlab('Difference Between Estimated Means') + - ylab('Alternative Disribution Proportion') + - scale_color_discrete(name='Contains\nAlternative\nBase') + theme_bw() +pdf(paste0(densBase, '.density.pdf'), width=10) +ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) +ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) foo <- dev.off() diff --git a/scripts/debug_params.R b/scripts/debug_params.R new file mode 100644 index 0000000..bd4315a --- /dev/null +++ b/scripts/debug_params.R @@ -0,0 +1,45 @@ +library(dplyr) +library(ggplot2) +library(ggbeeswarm) + +## set _DEBUG_PARAMS = True in resquiggle.py +## example run for min_obs_per_base testing: +##for i in {0..6}; do +## testParam=`echo $i | awk '{print ($1 * 1) + 2}'` +## tombo resquiggle param_test_reads/ genome.fasta --segmentation-parameters 5 $testParam 5 --signal-align-parameters 4.2 4.2 1200 1.75 --processes 4 +##done > param_values.txt + +stat <- 'min_obs_per_base' + +dat <- read.table('param_values.txt') +colnames(dat) <- c('mean_obs_per_event', 'running_window', 'min_obs_per_base', + 'match_evalue', 'skip_pen', 'bandwidth', + 'read_name', 'mean_score') +dat$mean_obs_per_event <- factor(dat$mean_obs_per_event) +dat$running_window <- 
factor(dat$running_window) +dat$min_obs_per_base <- factor(dat$min_obs_per_base) +dat$match_evalue <- factor(dat$match_evalue) +dat$skip_pen <- factor(dat$skip_pen) +dat$bandwidth <- factor(dat$bandwidth) + +rdat <- dat %>% group_by(read_name) %>% summarize(nreads=n()) +maxNReads <- rdat$read_name[which(rdat$nreads == max(rdat$nreads))] +fdat <- dat %>% filter(read_name %in% maxNReads) + +minMed <- dat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% summarize(min(med)) +minMedF <- fdat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% summarize(min(med)) + +pdf('param_values.pdf', width=10) +ggplot(dat, aes_string(x=stat, y='mean_score', color=stat)) + + geom_hline(aes(yintercept=minMed)) + + geom_beeswarm(alpha=0.3, cex=0.5) + + stat_summary(fun.y=median, color='red', geom='point', size=2) + + stat_summary(fun.y=mean, color='orange', geom='point', size=2) + + theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1)) +ggplot(fdat, aes_string(x=stat, y='mean_score', color=stat)) + + geom_hline(aes(yintercept=minMedF)) + + geom_beeswarm(alpha=0.3, cex=0.5) + + stat_summary(fun.y=median, color='red', geom='point', size=2) + + stat_summary(fun.y=mean, color='orange', geom='point', size=2) + + theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1)) +foo <- dev.off() diff --git a/scripts/test_beta_priors.R b/scripts/test_beta_priors.R new file mode 100644 index 0000000..2352404 --- /dev/null +++ b/scripts/test_beta_priors.R @@ -0,0 +1,12 @@ +library(ggplot2) + +a <- 0.5 +b <- 2 + +mDat <- do.call(rbind.data.frame, lapply(1:20, function(i) + data.frame(postProb=sapply(0:i, function(j) (a+i-j) / (a+i+b)), coverage=i, notMod=0:i))) + +pdf('test_priors.pdf', width=8) +ggplot(mDat) + geom_tile(aes(x=coverage, y=notMod, fill=postProb)) + + theme_bw() + scale_fill_gradient2(low='#67001f', mid='#f7f7f7', high='#2166ac', midpoint=0.5) +dev.off() diff --git a/setup.py b/setup.py index c862faf..90d7839 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import os import sys @@ -7,13 +9,13 @@ from setuptools.command.build_ext import build_ext as _build_ext # Get the version number from _version.py, and exe_path -verstrline = open(os.path.join('tombo', '_version.py'), 'r').read() +verstrline = open(os.path.join('tombo', '_version.py'), 'r').readlines()[-1] vsre = r"^TOMBO_VERSION = ['\"]([^'\"]*)['\"]" mo = re.search(vsre, verstrline) if mo: __version__ = mo.group(1) else: - raise RuntimeError('Unable to find version string in "tombo/_version.py".'.format(__pkg_name__)) + raise RuntimeError('Unable to find version string in "tombo/_version.py".') def readme(): with open('README.rst') as f: @@ -29,18 +31,21 @@ def readme(): '\tThis is required in order to get maximum efficincy from ' + 'cython code optimizations.\nTo install run:\n$ pip install numpy\n' + '*' * 60 + '\n') - sys.exit(1) + sys.exit() -if not sys.version_info[0] == 2: - sys.exit("Sorry, Python 3 is not supported (yet)") +extras_require = ['pyfaidx'] +if sys.version_info[0] < 3: + extras_require.append('rpy2<=2.8.6') +else: + extras_require.append('rpy2') ext_modules = [ - Extension("tombo.dynamic_programming", - ["tombo/dynamic_programming.pyx"], + Extension(str("tombo.c_dynamic_programming"), + [str("tombo/c_dynamic_programming.pyx")], include_dirs=include_dirs, language="c++"), - Extension("tombo.c_helper", - ["tombo/c_helper.pyx"], + Extension(str("tombo.c_helper"), + [str("tombo/c_helper.pyx")], include_dirs=include_dirs, language="c++") ] @@ -52,10 +57,9 
@@ def readme():
     name = "ont-tombo",
     version = __version__,
     packages = ["tombo"],
-    python_requires = '<3',
-    install_requires = ['h5py', 'numpy', 'scipy', 'cython', 'setuptools >= 18.0'],
-    extras_require={'plot':['rpy2<=2.8.6'], 'alt_est':['scikit-learn'],
-                    'full':['rpy2<=2.8.6', 'scikit-learn']},
+    install_requires = ['h5py', 'numpy', 'scipy', 'cython',
+                        'setuptools >= 18.0', 'mappy', 'future'],
+    extras_require={'full':extras_require},
     author = "Marcus Stoiber",
     author_email = "marcus.stoiber@nanoporetech.com",
@@ -74,4 +78,15 @@ def readme():
     ext_modules=ext_modules,
     test_suite='nose2.collector.collector',
     tests_require=['nose2'],
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'Intended Audience :: Science/Research',
+        'Natural Language :: English',
+        'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Topic :: Scientific/Engineering :: Bio-Informatics',
+    ]
 )
diff --git a/tombo/R_scripts/plotKmerDist.R b/tombo/R_scripts/plotKmerDist.R
index ca2941b..b5b5929 100644
--- a/tombo/R_scripts/plotKmerDist.R
+++ b/tombo/R_scripts/plotKmerDist.R
@@ -18,8 +18,8 @@ plotKmerDist <- function(dat, baseDat, saveDatFn, dontPlot){
     if (is.na(baseDat)){
         print(mainP)
     } else {
-        mainL <- get_legend(mainP)
-        mainP <- mainP + theme(legend.position='none')
+        ##mainL <- get_legend(mainP)
+        ##mainP <- mainP + theme(legend.position='none')
         baseP <- ggplot(baseDat) +
             geom_tile(aes(x=Kmer, y=Position, fill=Base)) +
             scale_fill_manual(
@@ -35,9 +35,17 @@ plotKmerDist <- function(dat, baseDat, saveDatFn, dontPlot){
             mainP <- mainP + theme(axis.text.x=element_blank())
             baseP <- baseP + theme(axis.text.x=element_blank())
         }
-        print(plot_grid(plot_grid(mainP, baseP, ncol=1,
-                                  rel_heights=c(5,1), align='v'),
-                        mainL, ncol=1, rel_heights=c(10,1)))
+        pdf('/dev/null')
+        mainP <- ggplot_gtable(ggplot_build(mainP))
+        baseP <- ggplot_gtable(ggplot_build(baseP))
+        maxWidth = grid::unit.pmax(mainP$widths[2:3], baseP$widths[2:3])
+        mainP$widths[2:3] <- maxWidth
+        baseP$widths[2:3] <- maxWidth
+        foo <- dev.off()
+        grid.arrange(mainP, baseP, ncol=1, heights=c(5,1))
+        ##plot_grid(plot_grid(mainP, baseP, ncol=1,
+        ##                    rel_heights=c(5,1), align='v'),
+        ##          mainL, ncol=1, rel_heights=c(10,1)))
     }}
 
 plotKmerDistWReadPath <- function(dat, baseDat, saveDatFn, dontPlot){
@@ -69,29 +77,40 @@ plotKmerDistWReadPath <- function(dat, baseDat, saveDatFn, dontPlot){
         print(mainP)
         print(readP)
     } else {
-        mainL <- get_legend(mainP)
-        mainP <- mainP + theme(legend.position='none')
-        baseP <- ggplot(baseDat) +
-            geom_tile(aes(x=Kmer, y=Position, fill=Base)) +
-            scale_fill_manual(
-                values=c('A'='#00CC00', 'C'='#0000CC',
-                         'G'='#FFB300', 'T'='#CC0000')) +
-            theme_bw() + theme(
-                axis.text.x=element_text(angle=60, hjust=1, size=8),
-                legend.position='none')
-        mainP <- mainP + theme(axis.text.x=element_blank(),
-                               axis.title.x=element_blank())
-        readP <- readP + theme(axis.text.x=element_blank(),
-                               axis.title.x=element_blank())
-        if (nchar(as.character(dat$Kmer[1])) > 3){
-            mainP <- mainP + theme(axis.text.x=element_blank())
-            readP <- readP + theme(axis.text.x=element_blank())
-            baseP <- baseP + theme(axis.text.x=element_blank())
-        }
-        print(plot_grid(plot_grid(mainP, baseP, ncol=1,
-                                  rel_heights=c(5,1), align='v'),
-                        mainL, ncol=1, rel_heights=c(10,1)))
-        print(plot_grid(plot_grid(readP, baseP, ncol=1,
-                                  rel_heights=c(5,1), align='v'),
-                        mainL, ncol=1, rel_heights=c(10,1)))
+        #mainL <- get_legend(mainP)
+        #mainP <- mainP + theme(legend.position='none')
+        baseP <- ggplot(baseDat) +
+            geom_tile(aes(x=Kmer, y=Position, fill=Base)) +
+            scale_fill_manual(
+                values=c('A'='#00CC00', 'C'='#0000CC',
+                         'G'='#FFB300', 'T'='#CC0000')) + theme_bw() +
+            theme(axis.text.x=element_text(angle=60, hjust=1, size=8),
+                  legend.position='none')
+        mainP <- mainP + theme(axis.text.x=element_blank(),
+                               axis.title.x=element_blank())
+        readP <- readP + theme(axis.text.x=element_blank(),
+                               axis.title.x=element_blank())
+        if (nchar(as.character(dat$Kmer[1])) > 3){
+            mainP <- mainP + theme(axis.text.x=element_blank())
+            readP <- readP + theme(axis.text.x=element_blank())
+            baseP <- baseP + theme(axis.text.x=element_blank())
+        }
+        pdf('/dev/null')
+        mainP <- ggplot_gtable(ggplot_build(mainP))
+        readP <- ggplot_gtable(ggplot_build(readP))
+        baseP <- ggplot_gtable(ggplot_build(baseP))
+        maxWidth = grid::unit.pmax(mainP$widths[2:3], readP$widths[2:3],
+                                   baseP$widths[2:3])
+        mainP$widths[2:3] <- maxWidth
+        readP$widths[2:3] <- maxWidth
+        baseP$widths[2:3] <- maxWidth
+        foo <- dev.off()
+        grid.arrange(mainP, baseP, ncol=1, heights=c(5,1))
+        grid.arrange(readP, baseP, ncol=1, heights=c(5,1))
+        ##print(plot_grid(plot_grid(mainP, baseP, ncol=1,
+        ##                          rel_heights=c(5,1), align='v'),
+        ##                mainL, ncol=1, rel_heights=c(10,1)))
+        ##print(plot_grid(plot_grid(readP, baseP, ncol=1,
+        ##                          rel_heights=c(5,1), align='v'),
+        ##                mainL, ncol=1, rel_heights=c(10,1)))
     }}
diff --git a/tombo/R_scripts/plotMotifStats.R b/tombo/R_scripts/plotMotifStats.R
index 6e64d6a..afebd25 100644
--- a/tombo/R_scripts/plotMotifStats.R
+++ b/tombo/R_scripts/plotMotifStats.R
@@ -2,10 +2,13 @@ numModelVals <- 20
 pseudoQuants <- seq(1/numModelVals,1-(1/numModelVals),
                     1/numModelVals)
-plotMotifStats <- function(PlotDat, BaseDat, StatsDat, PlotFrac, ModelDat){
+plotMotifStats <- function(PlotDat, BaseDat, StatsDat,
+                           ModelDat, AltModelDat=NULL){
     ylim <- 4
     regions <- unique(PlotDat$Region)
     midReg <- regions[(length(regions) + 1) / 2]
+    ## "ggplot_gtable(ggplot_build(" writes a page so sink it to dev null
+    pdf('/dev/null')
     ps <- lapply(regions, function(region){
         rBaseDat <- BaseDat[BaseDat$Region==region,]
         rPlotDat <- PlotDat[PlotDat$Region==region,]
@@ -30,6 +33,25 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat, PlotFrac, ModelDat){
                        Group=rep(psDat$Region[1], nDens))
         })
         maxDens <- max(unlist(lapply(modDensDat, function(x) x$Position)))
+        if(! is.null(AltModelDat)) {
+            rAltModelDat <- AltModelDat[AltModelDat$Region==region,]
+            altModDensDat <- lapply(split(
+                rAltModelDat, paste0(rAltModelDat$Position,
+                                     rAltModelDat$Strand)),
+                function(psDat){
+                    psDens <- density(qnorm(
+                        pseudoQuants, mean=psDat$Mean[1], sd=psDat$SD[1]))
+                    nDens <- length(psDens$x)
+                    data.frame(Position=psDens$y,
+                               Signal=psDens$x,
+                               Strand=rep(psDat$Strand[1], nDens),
+                               gPos=rep(psDat$Position[1], nDens),
+                               Group=rep(psDat$Region[1], nDens))
+                })
+            altMaxDens <- max(unlist(lapply(
+                altModDensDat, function(x) x$Position)))
+            maxDens <- max(altMaxDens, maxDens)
+        }
         normDensDat <- do.call(
             rbind.data.frame,
             lapply(modDensDat, function(posDens){
@@ -43,6 +65,21 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat, PlotFrac, ModelDat){
             p <- p + geom_polygon(aes(x=Position, y=Signal, group=gPos),
                                   data=normDensDat, fill='black', alpha=0.5,
                                   size=0, show.legend=FALSE)
+            if(! is.null(AltModelDat)) {
+                normAltDensDat <- do.call(
+                    rbind.data.frame,
+                    lapply(altModDensDat, function(posDens){
+                        data.frame(Position=(posDens$Position / maxDens) +
+                                       posDens$gPos[1],
+                                   Signal=posDens$Signal,
+                                   Strand=posDens$Strand,
+                                   gPos=posDens$gPos,
+                                   Group=posDens$Group)
+                    }))
+                p <- p + geom_polygon(aes(x=Position, y=Signal, group=gPos),
+                                      data=normAltDensDat, fill='red', alpha=0.5,
+                                      size=0, show.legend=FALSE)
+            }
         }
         p <- p + geom_path(aes(x=Position, y=Signal, color=Group, group=Read),
                            alpha=0.5, size=0.1, show.legend=FALSE)
@@ -74,15 +111,15 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat, PlotFrac, ModelDat){
         if(region != midReg){
             p <- p + theme(axis.title.y=element_blank())
         }
-        return(p)
+        return(ggplot_gtable(ggplot_build(p)))
     })
     maxStat <- max(StatsDat$Stat)
     if(maxStat <= 1){ tickVals <- c(0,0.2,0.4,0.6,0.8,1)
     } else if(maxStat < 10){ tickVals <- seq(0,10,by=2)
     } else { tickVals <- seq(0,100,by=5) }
-    statTitle <- ifelse(PlotFrac, 'Fraction Modified', '-Log(P-Value)')
-    ps[[length(ps) + 1]] <- ggplot(StatsDat) +
+    ps[[length(ps) + 1]] <- ggplot_gtable(ggplot_build(
+        ggplot(StatsDat) +
         geom_violin(aes(
             x=Position+0.5, y=Stat,
             group=cut_width(Position, 0.9999)), size=0.1, fill='black') +
@@ -93,9 +130,19 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat, PlotFrac, ModelDat){
               axis.ticks.x=element_blank(),
               axis.title.x=element_blank(),
               panel.grid.minor.y=element_blank()) +
-        ylab(statTitle)
-    print(do.call(
-        plot_grid,
-        c(ps, list(ncol=1, align='v',
-                   rel_heights=c(rep(1, length(regions)), 3)))))
+        ylab('Fraction Modified')))
+    maxWidth <- do.call(grid::unit.pmax,
+                        sapply(ps, function(x) x$widths[2:3]))
+    ps <- lapply(ps, function(p){
+        p$widths[2:3] <- maxWidth
+        return(p)})
+    # close dev null sink
+    foo <- dev.off()
+    do.call(
+        grid.arrange,
+        c(ps, list(ncol=1, heights=c(rep(1, length(regions)), 3))))
+    ##print(do.call(
+    ##    plot_grid,
+    ##    c(ps, list(ncol=1, align='v',
+    ##               rel_heights=c(rep(1, length(regions)), 3)))))
 }
diff --git a/tombo/R_scripts/plotPerReadStats.R b/tombo/R_scripts/plotPerReadStats.R
index 8ccc607..6b41aea 100644
--- a/tombo/R_scripts/plotPerReadStats.R
+++ b/tombo/R_scripts/plotPerReadStats.R
@@ -2,7 +2,9 @@ ngpValMax <- 20
 lhRatioMax <- 6
 
 plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){
-    for(reg_i in unique(StatData$Region)){
+    all_reg_ids <- unique(StatData$Region)
+    last_reg_id <- tail(all_reg_ids, 1)
+    for(reg_i in all_reg_ids){
         regDat <- StatData[StatData$Region == reg_i,]
         regOrd <- OrdData[OrdData$Region == reg_i,'Read']
         if(arePvals){
@@ -12,10 +14,10 @@ plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){
             regDat$Stats[regDat$Stats < -lhRatioMax] <- -lhRatioMax
         }
         regDat$Read <- factor(regDat$Read, ordered=TRUE, levels=regOrd)
+        boxDat <- data.frame(xS=mean(range(regDat$Position))-1.5,
+                             xE=mean(range(regDat$Position))+0.5,
+                             yS=0.5, yE=length(unique(regDat$Read))+0.5)
         regDat <- regDat[!is.na(regDat$Stats),]
-        boxDat <- data.frame(xS=mean(range(regDat$Position))-0.5,
-                             xE=mean(range(regDat$Position))+0.5,
-                             yS=0.5, yE=length(unique(regDat$Read))+0.5)
         reg_base_dat <- baseDat[baseDat$Region==reg_i,]
         p <- ggplot(regDat)
         if(arePvals){
@@ -23,7 +25,7 @@ plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){
                                 stroke=0, color='#969696', size=5, shape=21) +
                 scale_fill_gradient(low="#fff7ec", high='#7f0000',
                                     name='-Log10\nP-Value')
-        } else{
+        } else {
             p <- p + geom_point(aes(x=Position, y=Read, fill=Stats),
                                 stroke=0, color='#969696', size=5, shape=21) +
                 scale_fill_gradient2(
@@ -31,26 +33,29 @@ plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){
                     name='Log\nLikelihood\nRatio\n', breaks=c(-6,-3,0,3,6),
                     labels=c('Alternative\nBase', '-3','0','3', 'Standard\nBase'))
         }
-        p <- p + geom_text(
-            aes(x=Position, y=0.5, label=Base, color=Base),
-            data=reg_base_dat, hjust=0.5, size=3, show.legend=FALSE,
-            vjust=1.2, angle=0) +
+        if(nrow(reg_base_dat) > 0){
+            p <- p + geom_text(
+                aes(x=Position, y=0.5, label=Base, color=Base),
+                data=reg_base_dat, hjust=0.5, size=3, show.legend=FALSE,
+                vjust=1.2, angle=0)
+        }
+        if(boxCenter){
+            p <- p + geom_rect(aes(xmin=xS, xmax=xE, ymin=yS, ymax=yE),
+                               data=boxDat, fill=NA, color='black', size=0.2)
+        }
+        p <- p +
             scale_color_manual(
                 values=c('A'='#00CC00', 'C'='#0000CC', 'G'='#FFB300',
                          'T'='#CC0000', '-'='black', 'N'='black')) +
-            scale_x_continuous(expand=c(0, 0)) +
-            ylab('Reads') +
+            scale_x_continuous(expand=c(0, 0)) + ylab('Reads') +
             theme_minimal() +
             theme(panel.grid=element_blank(),
                   axis.text.y=element_blank(), axis.text.x=element_blank(),
                   legend.text.align=0.5, legend.title.align=0.5)
-        if(boxCenter){
-            p <- p + geom_rect(aes(xmin=xS, xmax=xE, ymin=yS, ymax=yE),
-                               data=boxDat, fill=NA, color='black')
-        }
+        ## need to set clip to off so bases aren't cut off
         gt <- ggplot_gtable(ggplot_build(p))
         gt$layout$clip[gt$layout$name == "panel"] <- "off"
         grid::grid.draw(gt)
-        grid::grid.newpage()
+        if(reg_i != last_reg_id){ grid::grid.newpage() }
     }
 }
diff --git a/tombo/R_scripts/plotROC.R b/tombo/R_scripts/plotROC.R
new file mode 100644
index 0000000..d2eccfe
--- /dev/null
+++ b/tombo/R_scripts/plotROC.R
@@ -0,0 +1,8 @@
+plotROC <- function(rocDat){
+    print(ggplot(rocDat) + geom_abline(slope=1, intercept=0) +
+          geom_path(aes(x=FP, y=TP, color=Comparison)) + theme_bw() +
+          xlab('False Positive Rate') + ylab('True Positive Rate'))
+    print(ggplot(rocDat) +
+          geom_path(aes(x=Precision, y=TP, color=Comparison)) + theme_bw() +
+          xlab('Precision') + ylab('Recall'))
+}
diff --git a/tombo/R_scripts/plotSingleRun.R b/tombo/R_scripts/plotSingleRun.R
index 88d5a10..74cb8f1 100644
--- a/tombo/R_scripts/plotSingleRun.R
+++ b/tombo/R_scripts/plotSingleRun.R
@@ -62,5 +62,7 @@ plotSingleRun <- function(sigDat, quantDat, boxDat, eventDat,
               theme_bw() + theme(axis.text.x=element_text(hjust=0),
                                  panel.grid.major.x=element_blank(),
                                  panel.grid.minor.x=element_blank(),
-                                 panel.grid.minor.y=element_blank()))
-    }}
+                                 panel.grid.minor.y=element_blank())
+              )
+    }
+}
diff --git a/tombo/__init__.py b/tombo/__init__.py
index e69de29..a98c64d 100644
--- a/tombo/__init__.py
+++ b/tombo/__init__.py
@@ -0,0 +1 @@
+from __future__ import unicode_literals, absolute_import
diff --git a/tombo/__main__.py b/tombo/__main__.py
index 353e8c8..f6266e2 100644
--- a/tombo/__main__.py
+++ b/tombo/__main__.py
@@ -1,8 +1,9 @@
-import sys
+from __future__ import unicode_literals, absolute_import
 
-import _option_parsers
+import sys
 
-from _version import TOMBO_VERSION
+from . import _option_parsers
+from ._version import TOMBO_VERSION
 
 def main(args=None):
     """The main routine."""
@@ -10,17 +11,17 @@ def main(args=None):
         args = sys.argv[1:]
 
     commands = [
-        ('Resquiggle (Must be run before any other commands):', [
-            ('resquiggle','Re-annotate raw signal with ' +
-             'genomic alignment from existing basecalls.',
-             _option_parsers.get_eventless_resquiggle_parser()),
-        ]),
         ('Pre-processing:', [
            ('annotate_raw_with_fastqs','Add basecalled sequence ' +
            'from FASTQs to raw FAST5s.',
            _option_parsers.get_add_fastqs_parser()),
        ]),
-        ('Statistical Testing Command:',[
+        ('Re-squiggle:', [
+            ('resquiggle','Re-annotate raw signal with ' +
+             'genomic alignment from existing basecalls.',
+             _option_parsers.get_eventless_resquiggle_parser()),
+        ]),
+        ('Modified Base Detection:',[
            ('test_significance','Test for shifts in signal ' +
            'indicative of non-canonical bases.',
            _option_parsers.get_test_signif_parser()),
@@ -58,6 +59,8 @@ def main(args=None):
            _option_parsers.get_per_read_parser()),
        ]),
        ('Other Plotting Commands:', [
+            ('plot_roc','Plot ROC curve from known motif(s).',
+             _option_parsers.get_roc_parser()),
            ('plot_kmer','Plot signal distributions across kmers.',
            _option_parsers.get_kmer_dist_parser()),
            ('cluster_most_significant',
@@ -111,9 +114,10 @@ def main(args=None):
     import argparse
     parser = argparse.ArgumentParser(
         prog='tombo',
-        description='Tombo is a command line python toolset ' +
-        'to analyze and visualize raw nanopore sequencing data ' +
-        'including the identification of non-standard nucleotides.',
+        description='Tombo is a suite of tools primarily for the ' +
+        'identification of modified nucleotides from nanopore sequencing ' +
+        'data. Tombo also provides tools for the analysis and ' +
+        'visualization of raw nanopore signal.',
         formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument(
         '-v', '--version', action='version',
@@ -133,46 +137,46 @@ def main(args=None):
     args = parser.parse_args(args)
 
     if args.subcmd == 'resquiggle':
-        import resquiggle
+        from . import resquiggle
         resquiggle.eventless_resquiggle_main(args)
     elif args.subcmd == 'event_resquiggle':
-        import _event_resquiggle
+        from . import _event_resquiggle
         _event_resquiggle.event_resquiggle_main(args)
     elif args.subcmd == 'model_resquiggle':
-        import _model_resquiggle
+        from . import _model_resquiggle
         _model_resquiggle.model_resquiggle_main(args)
     elif args.subcmd == 'test_significance':
-        import tombo_stats
+        from . import tombo_stats
         tombo_stats.test_shifts_main(args)
     elif args.subcmd == 'estimate_reference':
-        import tombo_stats
+        from . import tombo_stats
         tombo_stats.est_ref_main(args)
     elif args.subcmd == 'estimate_alt_reference':
-        import tombo_stats
+        from . import tombo_stats
         tombo_stats.est_alt_ref_main(args)
     elif args.subcmd == 'estimate_scale':
-        import tombo_stats
+        from . import tombo_stats
         tombo_stats.estimate_scale_main(args)
     elif args.subcmd == 'annotate_raw_with_fastqs':
-        import tombo_helper
+        from . import tombo_helper
         tombo_helper.annotate_reads_with_fastq_main(args)
     elif args.subcmd == 'clear_filters':
-        import tombo_helper
+        from . import tombo_helper
         tombo_helper.clear_filters_main(args)
     elif args.subcmd == 'filter_stuck':
-        import tombo_helper
+        from . import tombo_helper
         tombo_helper.filter_stuck_main(args)
     elif args.subcmd == 'filter_coverage':
-        import tombo_helper
+        from . import tombo_helper
        tombo_helper.filter_coverage_main(args)
    elif args.group == 'Text Output Commands:':
-        import text_output_commands
+        from . import text_output_commands
        if args.subcmd == 'write_wiggles':
            text_output_commands.wiggle_main(args)
        else:
            text_output_commands.write_signif_diff_main(args)
    else:
-        import plot_commands
+        from . import plot_commands
        plot_commands.plot_main(args)
 
     return
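The switch from bare `import X` to `from . import X` throughout __main__.py is what makes the package importable on Python 3, which dropped implicit relative imports: a bare `import _option_parsers` no longer finds a sibling module inside the tombo package. A minimal sketch of the pattern, using only module names that appear in the diff above:

    # Python 2 resolved `import _option_parsers` against the containing
    # package; Python 3 only searches sys.path, so the bare import raises
    # ImportError. Explicit relative imports behave identically on both
    # majors once absolute_import is in effect.
    from __future__ import absolute_import

    from . import _option_parsers        # sibling module of this package
    from ._version import TOMBO_VERSION  # name defined in a sibling module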
diff --git a/tombo/_default_parameters.py b/tombo/_default_parameters.py
new file mode 100644
index 0000000..28c2d53
--- /dev/null
+++ b/tombo/_default_parameters.py
@@ -0,0 +1,97 @@
+from __future__ import unicode_literals
+
+###############################
+##### Model Name Defaults #####
+###############################
+
+# default model names
+STANDARD_MODELS = {
+    'DNA':'tombo.DNA.model',
+    'RNA':'tombo.RNA.200mV.model',
+}
+ALTERNATE_MODELS = {
+    'DNA_5mC':'tombo.DNA.5mC.model',
+    'DNA_6mA':'tombo.DNA.6mA.model',
+}
+
+
+################################
+##### Re-squiggle Defaults #####
+################################
+
+# table containing default segmentation parameters for different sample types
+# 1) running neighboring window width for segmentation scoring
+# 2) minimum observations per genomic base
+# 3) mean number of observations per event during segmentation
+SEG_PARAMS_TABLE = {
+    'RNA':(8, 4, 10),
+    'DNA':(5, 3, 5),
+}
+
+# table containing default signal to sequence assignment parameters
+# for different sample types
+# 1) expected value for matching event to sequence
+# 2) penalty for skipped sequence position
+# 3) adaptive bandwidth
+# 4) signal segmentation mean half-normal score threshold
+ALGN_PARAMS_TABLE = {
+    'RNA':(4, 10, 1400, 2.0),
+    'DNA':(4.2, 4.2, 1200, 1.75),
+}
+
+# factor of extra raw signal above minimum to add around skipped bases for
+# raw signal segment detection
+EXTRA_SIG_FACTOR = 1.1
+
+MASK_FILL_Z_SCORE = -10
+MASK_BASES = 50
+
+START_BANDWIDTH = 5000
+START_SEQ_WINDOW = 500
+BAND_BOUNDARY_THRESH = 5
+
+DEL_FIX_WINDOW = 2
+MAX_DEL_FIX_WINDOW = 8
+MAX_RAW_CPTS = 200
+MIN_EVENT_TO_SEQ_RATIO = 1.1
+
+
+############################
+##### Testing Defaults #####
+############################
+
+LLR_THRESH = 0.5
+HYPO_THRESH = 0.01
+
+
+#####################################
+##### Model Estimation Defaults #####
+#####################################
+
+ALT_EST_BATCH = 1000
+MAX_KMER_OBS = 10000
+MIN_KMER_OBS_TO_EST = 50
+KERNEL_DENSITY_RANGE = (-5,5)
+
+
+##########################
+##### Misc. Defaults #####
+##########################
+
+SMALLEST_PVAL = 1e-50
+
+# got quantiles from analysis of stability after shift-only normalization
+ROBUST_QUANTS = (46.5, 53.5)
+
+# minimum standard deviation for a genomic position when estimating spread
+# from a sample
+MIN_POSITION_SD = 0.01
+
+# number of points at which to estimate the k-mer signal densities
+NUM_DENS_POINTS = 500
+
+# number of reads to estimate global scale parameter
+NUM_READS_FOR_SCALE = 1000
+
+# number of points to plot in the ROC curve plotting command
+ROC_PLOT_POINTS = 1000
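These tables are keyed by sample type and unpacked positionally by the re-squiggle code (the SEG_PARAMS_TABLE lookup appears verbatim in resquiggle_read in _event_resquiggle.py below). A minimal sketch of that lookup; `is_rna` is a hypothetical stand-in for the flag tombo derives from each FAST5, and the unpacked names follow the numbered comments above:

    from tombo._default_parameters import SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE

    is_rna = False  # hypothetical; tombo reads this from the FAST5 contents
    bio_samp_type = 'RNA' if is_rna else 'DNA'
    # 1) running window width, 2) min obs per base, 3) mean obs per event
    (running_stat_width, min_obs_per_base,
     mean_obs_per_event) = SEG_PARAMS_TABLE[bio_samp_type]
    # 1) match expected value, 2) skip penalty, 3) bandwidth, 4) score threshold
    match_evalue, skip_pen, bandwidth, score_thresh = \
        ALGN_PARAMS_TABLE[bio_samp_type]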
diff --git a/tombo/_event_resquiggle.py b/tombo/_event_resquiggle.py
index 8b5aa69..a720692 100644
--- a/tombo/_event_resquiggle.py
+++ b/tombo/_event_resquiggle.py
@@ -1,8 +1,17 @@
-import os, sys
+from __future__ import unicode_literals, absolute_import
+
+from builtins import int, range, dict, zip
+
+import os
+import io
 import re
+import sys
+import queue
+
+# Future warning from cython in h5py
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
 
 import h5py
-import Queue
 import numpy as np
 np.seterr(all='raise')
@@ -11,14 +20,19 @@
 from subprocess import call
 from time import sleep, time
 from itertools import repeat
+from operator import itemgetter
 from tempfile import NamedTemporaryFile
 from distutils.version import LooseVersion
 from collections import defaultdict, namedtuple, Counter
 
+if sys.version_info[0] > 2:
+    unicode = str
+
 # import tombo functions
-import tombo_helper as th
+from . import tombo_helper as th
 
-from c_helper import c_valid_cpts, c_valid_cpts_w_cap
+from ._default_parameters import SEG_PARAMS_TABLE
+from .c_helper import c_valid_cpts, c_valid_cpts_w_cap
 
 VERBOSE = False
@@ -27,18 +41,17 @@
 
 # allow this many times the alignment batch size into the queue of
 # reads to be resquiggled
-ALIGN_BATCH_MULTIPLIER = 5
 PROGRESS_INTERVAL = 100
+ALIGN_BATCH_MULTIPLIER = 5
+
+FN_SPACE_FILLER = '|||'
+FASTA_NAME_JOINER = ':::'
 
 ALBACORE_TEXT = 'ONT Albacore Sequencing Software'
 
 indelStats = namedtuple('indelStats', ('start', 'end', 'diff'))
 indelGroupStats = namedtuple('indelGroupStats',
                              ('start', 'end', 'cpts', 'indels'))
-readInfo = namedtuple(
-    'readInfo',
-    ('ID', 'Subgroup', 'ClipStart', 'ClipEnd',
-     'Insertions', 'Deletions', 'Matches', 'Mismatches'))
 mapperData = namedtuple('mapperData', ('exe', 'type', 'index'))
 # set default index to None
 mapperData.__new__.__defaults__ = (None,)
@@ -58,18 +71,21 @@
 ########## Raw Signal Re-squiggle Code ##########
 #################################################
 
-def get_valid_cpts(raw_signal, min_base_obs, num_cpts=None):
+def get_valid_cpts(raw_signal, min_obs_per_base, running_stat_width,
+                   num_cpts=None):
     if num_cpts is None:
-        return c_valid_cpts(raw_signal, min_base_obs)
-    return c_valid_cpts_w_cap(raw_signal, min_base_obs, num_cpts)
+        return c_valid_cpts(
+            raw_signal, min_obs_per_base, running_stat_width)
+    return c_valid_cpts_w_cap(
+        raw_signal, min_obs_per_base, running_stat_width, num_cpts)
 
 def get_indel_groups(
-        alignVals, align_segs, raw_signal, min_base_obs, timeout,
-        num_cpts_limit):
+        alignVals, align_segs, raw_signal, min_obs_per_base,
+        running_stat_width, timeout, num_cpts_limit):
     def get_all_indels():
         # get genomic sequence for and between each indel
-        read_align = ''.join(zip(*alignVals)[0])
-        genome_align = ''.join(zip(*alignVals)[1])
+        read_align = ''.join(map(itemgetter(0), alignVals))
+        genome_align = ''.join(map(itemgetter(1), alignVals))
         genome_gaps = [(m.start(), m.end())
                        for m in GAP_PAT.finditer(genome_align)]
         read_gaps = [(m.start(), m.end())
@@ -79,8 +95,8 @@ def get_all_indels():
             [(0,0), (len(read_align), len(read_align))])
         btwn_indel_seqs = [
             genome_align[m_start:m_end] for m_start, m_end in
-            zip(zip(*all_indel_locs)[1][:-1],
-                zip(*all_indel_locs)[0][1:])]
+            zip(map(itemgetter(1), all_indel_locs[:-1]),
+                map(itemgetter(0), all_indel_locs[1:]))]
         # is each indel an ins(ertion) or deletion
         all_is_ins = [read_align[start:end].startswith('-')
                       for start, end in all_indel_locs[1:-1]]
@@ -129,65 +145,71 @@ def get_all_indels():
 
     def extend_group(indel_group):
         group_start = min(indel.start for indel in indel_group)
-        group_stop = max(indel.end for indel in indel_group)
+        group_end = max(indel.end for indel in indel_group)
         num_cpts = sum(indel.diff for indel in indel_group
-                       ) + group_stop - group_start - 1
+                       ) + group_end - group_start - 1
+        prev_num_cpts = num_cpts
         # check that there are enough points to split
         # add an extra set of values to ensure no zero changepoint
-        while align_segs[group_stop] - align_segs[group_start] < (
-                num_cpts + 2) * min_base_obs:
+        while align_segs[group_end] - align_segs[group_start] < ((
+                num_cpts + 1) * min_obs_per_base) + (running_stat_width * 2):
             num_cpts += int(group_start > 0) + int(
-                group_stop < len(align_segs) - 1)
+                group_end < len(align_segs) - 1)
+            # ensure no infinite loop for large segmentation parameters
+            if num_cpts == prev_num_cpts:
+                raise NotImplementedError(
+                    'Entire read does not contain enough ' +
+                    'signal to re-squiggle')
+            prev_num_cpts = num_cpts
             group_start = max(0, group_start - 1)
-            group_stop = min(len(align_segs) - 1, group_stop + 1)
-        return group_start, group_stop, num_cpts
+            group_end = min(len(align_segs) - 1, group_end + 1)
+        return group_start, group_end, num_cpts
     def extend_and_join(indel_group):
-        group_start, group_stop, num_cpts = extend_group(indel_group)
+        group_start, group_end, num_cpts = extend_group(indel_group)
         # check if the extension hits the previous group
         while (len(indel_groups) > 0) and (
                 group_start <= indel_groups[-1].end):
             indel_group = indel_groups[-1].indels + indel_group
             del indel_groups[-1]
-            group_start, group_stop, num_cpts = extend_group(
+            group_start, group_end, num_cpts = extend_group(
                 indel_group)
-        return group_start, group_stop, num_cpts, indel_group
-    def get_cpts(group_start, group_stop, num_cpts):
+        return group_start, group_end, num_cpts, indel_group
+    def get_cpts(group_start, group_end, num_cpts):
         """
-        Get changepoints where the raw difference between min_base_obs
-        obs to the left and min_base_obs obs to the right is largest
-        while maintaining the min_base_obs between changepoints.
-        Still need to test this function for off by one bugs etc.
+        Get changepoints where the raw difference between running_stat_width
+        obs to the left and running_stat_width obs to the right is largest
+        while maintaining min_obs_per_base between changepoints.
         """
         if num_cpts_limit is not None and num_cpts > num_cpts_limit:
-            raise RuntimeError, ('Reached maximum number of ' +
-                                 'changepoints for a single indel')
+            raise NotImplementedError('Reached maximum number of ' +
                                       'changepoints for a single indel')
         try:
             cpts = get_valid_cpts(
-                raw_signal[align_segs[group_start]:align_segs[group_stop]],
-                min_base_obs, num_cpts)
+                raw_signal[align_segs[group_start]:align_segs[group_end]],
+                min_obs_per_base, running_stat_width, num_cpts)
         # not implemented error returned when fewer cpts found than requested
         except NotImplementedError:
             return None
         cpts.sort()
         return cpts
     def extend_for_cpts(
-            group_start, group_stop, num_cpts, indel_group):
-        cpts = get_cpts(group_start, group_stop, num_cpts)
+            group_start, group_end, num_cpts, indel_group):
+        cpts = get_cpts(group_start, group_end, num_cpts)
         # expand group until a valid set of changepoints can be identified
         while cpts is None:
             num_cpts += int(group_start > 0) + int(
-                group_stop < len(align_segs) - 1)
+                group_end < len(align_segs) - 1)
             group_start = max(0, group_start - 1)
-            group_stop = min(len(align_segs) - 1, group_stop + 1)
+            group_end = min(len(align_segs) - 1, group_end + 1)
             while (len(indel_groups) > 0) and (
                     group_start <= indel_groups[-1].end):
                 indel_group = indel_groups[-1].indels + indel_group
                 del indel_groups[-1]
-                group_start, group_stop, num_cpts = extend_group(
+                group_start, group_end, num_cpts = extend_group(
                     indel_group)
-            cpts = get_cpts(group_start, group_stop, num_cpts)
+            cpts = get_cpts(group_start, group_end, num_cpts)
 
-        return (cpts + align_segs[group_start], group_start, group_stop,
+        return (cpts + align_segs[group_start], group_start, group_end,
                 indel_group)
 
     if timeout is not None:
@@ -201,7 +223,7 @@ def extend_for_cpts(
     curr_group = [all_indels[0],]
     for indel in all_indels[1:]:
         if timeout is not None and time() - timeout_start > timeout:
-            raise RuntimeError, 'Read took too long to re-segment.'
+            raise NotImplementedError('Read took too long to re-segment.')
         # check if indel hits current group
         if max(g_indel.end for g_indel in curr_group) >= indel.start:
             curr_group.append(indel)
@@ -231,16 +253,18 @@ def extend_for_cpts(
     return indel_groups
 
 def find_read_start(
-        norm_signal, starts_rel_to_read, min_base_obs,
-        read_start_rel_to_raw, signal_length, num_obs=2000):
+        norm_signal, starts_rel_to_read, min_obs_per_base,
+        running_stat_width, read_start_rel_to_raw,
+        signal_length, num_obs=2000):
     # get only current starts up to requested num_obs
     begin_read_starts = starts_rel_to_read[
         :np.argmax(starts_rel_to_read >= num_obs)] \
        if starts_rel_to_read[-1] > num_obs else starts_rel_to_read
    if begin_read_starts.shape[0] <= 0:
        return norm_signal, starts_rel_to_read
-    signal_cpts = get_valid_cpts(norm_signal[:num_obs], min_base_obs,
-                                 begin_read_starts.shape[0])
+    signal_cpts = get_valid_cpts(
+        norm_signal[:num_obs], min_obs_per_base, running_stat_width,
+        begin_read_starts.shape[0])
 
     # identify the offset which aligns the most signal and read changepoints
     off_by_counts = Counter([
@@ -280,7 +304,7 @@ def resquiggle_read(
         norm_type, outlier_thresh, alignVals, fix_read_start,
         timeout, num_cpts_limit, genome_loc, read_info,
         basecall_group, corrected_group, compute_sd, pore_model, obs_filter,
-        min_base_obs=4, in_place=True, skip_index=False):
+        seg_params, in_place=True, skip_index=False):
     # errors should not happen here since these slots were checked
     # in alignment function, but old zombie processes might cause
     # problems here
@@ -289,8 +313,7 @@ def resquiggle_read(
         channel_info = th.get_channel_info(fast5_data)
 
         # extract raw data for this read
-        all_raw_signal = fast5_data[
-            '/Raw/Reads/'].values()[0]['Signal'].value
+        all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'].value
         rna = th.is_read_rna(fast5_data)
         if rna:
             all_raw_signal = all_raw_signal[::-1]
@@ -300,58 +323,65 @@ def resquiggle_read(
             '/Analyses/' + basecall_group + '/' + read_info.Subgroup +
             '/Events'].value
         event_means = event_data['mean']
-        event_kmers = event_data['model_state']
+        event_kmers = list(map(lambda x: x.decode(),
+                               event_data['model_state']))
         fast5_data.close()
     except:
-        raise NotImplementedError, (
+        raise NotImplementedError(
             'Error opening file for re-squiggle. This should have ' +
             'been caught during the alignment phase. Check that there ' +
             'are no other tombo processes or processes accessing ' +
             'these HDF5 files running simultaneously.')
 
+    if seg_params is None:
+        bio_samp_type = 'RNA' if rna else 'DNA'
+        (running_stat_width, min_obs_per_base,
+         _) = SEG_PARAMS_TABLE[bio_samp_type]
+    else:
+        running_stat_width, min_obs_per_base = seg_params
+
     # normalize signal
-    # print read id for resquiggle shift and scale output
-    #sys.stdout.write(read_info.ID + "\t")
     norm_signal, scale_values = th.normalize_raw_signal(
         all_raw_signal, read_start_rel_to_raw, starts_rel_to_read[-1],
         norm_type, channel_info, outlier_thresh,
         pore_model=pore_model, event_means=event_means,
         event_kmers=event_kmers)
     if fix_read_start:
         norm_signal, read_start_rel_to_raw = find_read_start(
-            norm_signal, starts_rel_to_read, min_base_obs,
-            read_start_rel_to_raw, all_raw_signal.shape[0])
+            norm_signal, starts_rel_to_read, min_obs_per_base,
+            running_stat_width, read_start_rel_to_raw,
+            all_raw_signal.shape[0])
 
     # group indels that are adjacent for re-segmentation
     indel_groups = get_indel_groups(
-        alignVals, starts_rel_to_read, norm_signal, min_base_obs,
-        timeout, num_cpts_limit)
+        alignVals, starts_rel_to_read, norm_signal, min_obs_per_base,
+        running_stat_width, timeout, num_cpts_limit)
 
     new_segs = []
     prev_stop = 0
-    for group_start, group_stop, cpts, group_indels in indel_groups:
+    for group_start, group_end, cpts, group_indels in indel_groups:
         ## add segments from last indel to this one and new segments
         new_segs.append(
             np.append(starts_rel_to_read[prev_stop:group_start+1],
                       cpts))
-        prev_stop = group_stop
+        prev_stop = group_end
     # handle end of read
     new_segs.append(starts_rel_to_read[prev_stop:])
-    new_segs = np.concatenate(new_segs).astype(np.int32)
+    new_segs = np.concatenate(new_segs).astype(np.int64)
     if np.diff(new_segs).min() < 1:
-        raise NotImplementedError, (
+        raise NotImplementedError(
             'New segments include zero length events.')
     if new_segs[0] < 0:
-        raise NotImplementedError, (
+        raise NotImplementedError(
            'New segments start with negative index.')
    if new_segs[-1] > norm_signal.shape[0]:
-        raise NotImplementedError, (
+        raise NotImplementedError(
            'New segments end past raw signal values.')
 
     # get just from alignVals
-    align_seq = ''.join(zip(*alignVals)[1]).replace('-', '')
+    align_seq = ''.join(map(itemgetter(1), alignVals)).replace('-', '')
     if new_segs.shape[0] != len(align_seq) + 1:
-        raise ValueError, ('Aligned sequence does not match number ' +
-                           'of segments produced.')
+        raise ValueError('Aligned sequence does not match number ' +
                          'of segments produced.')
 
     if in_place:
         # create new hdf5 file to hold new read signal
@@ -374,7 +404,7 @@ def resquiggle_read(
 def resquiggle_worker(
         basecalls_q, failed_reads_q, index_q, basecall_group, corrected_group,
         norm_type, outlier_thresh, timeout, num_cpts_limit, compute_sd,
-        pore_model, obs_filter):
+        pore_model, obs_filter, seg_params):
     num_processed = 0
     skip_index = index_q is None
     if not skip_index: proc_index_data = []
@@ -384,7 +414,7 @@ def resquiggle_worker(
             # None values placed in queue when all files have
             # been processed
             if fast5_fn is None: break
-        except Queue.Empty:
+        except queue.Empty:
             sleep(1)
             continue
 
@@ -403,7 +433,7 @@ def resquiggle_worker(
                     norm_type, outlier_thresh, alignVals, fix_read_start,
                     timeout, num_cpts_limit, genome_loc, read_info,
                     basecall_group, corrected_group, compute_sd,
-                    pore_model, obs_filter, skip_index=skip_index)
+                    pore_model, obs_filter, seg_params, skip_index=skip_index)
                 if not skip_index: proc_index_data.append(index_data)
             except Exception as e:
@@ -411,11 +441,11 @@
                 #raise
                 try:
                     th.write_error_status(
-                        fast5_fn, corrected_group, read_info.Subgroup, str(e))
+                        fast5_fn, corrected_group, read_info.Subgroup, unicode(e))
                 except:
                     pass
                 failed_reads_q.put((
-                    str(e), read_info.Subgroup + ' :: ' + fast5_fn))
+                    unicode(e), read_info.Subgroup + ' :: ' + fast5_fn))
 
     if not skip_index: index_q.put(proc_index_data)
@@ -454,7 +484,7 @@ def fix_all_clipped_bases(batch_align_data, batch_reads_data):
     clip_fix_align_data = []
     for read_fn_sg, (
             alignVals, genome_loc, start_clipped_bases,
-            end_clipped_bases) in batch_align_data.iteritems():
+            end_clipped_bases) in batch_align_data.items():
         (read_start_rel_to_raw, starts_rel_to_read, basecalls,
          channel_info, read_id, fix_read_start) = batch_reads_data[read_fn_sg]
         # fix raw start positions to match bases clipped in mapping
@@ -463,7 +493,7 @@ def fix_all_clipped_bases(batch_align_data, batch_reads_data):
             start_clipped_bases, end_clipped_bases,
             starts_rel_to_read, read_start_rel_to_raw)
 
-        bc_subgroup, fast5_fn = read_fn_sg.split(th.FASTA_NAME_JOINER)
+        bc_subgroup, fast5_fn = read_fn_sg.split(FASTA_NAME_JOINER)
         num_ins, num_del, num_match, num_mismatch = 0, 0, 0, 0
         for rb, gb in alignVals:
             if rb == '-':
@@ -474,7 +504,7 @@ def fix_all_clipped_bases(batch_align_data, batch_reads_data):
                 num_match += 1
             else:
                 num_mismatch += 1
-        read_info = readInfo(
+        read_info = th.alignInfo(
             read_id, bc_subgroup, start_clipped_bases, end_clipped_bases,
             num_ins, num_del, num_match, num_mismatch)
@@ -524,15 +554,15 @@ def clip_m5_alignment(alignVals, start, strand, chrm):
 
 def parse_m5_record(r_m5_record):
     if r_m5_record['tStrand'] != '+':
-        raise NotImplementedError, (
+        raise NotImplementedError(
             'Mapping indicates negative strand reference mapping.')
     if r_m5_record['qStrand'] == "+":
-        alignVals = zip(r_m5_record['qAlignedSeq'],
-                        r_m5_record['tAlignedSeq'])
+        alignVals = list(zip(r_m5_record['qAlignedSeq'],
+                             r_m5_record['tAlignedSeq']))
     else:
-        alignVals = zip(th.rev_comp(r_m5_record['qAlignedSeq']),
-                        th.rev_comp(r_m5_record['tAlignedSeq']))
+        alignVals = list(zip(th.rev_comp(r_m5_record['qAlignedSeq']),
+                             th.rev_comp(r_m5_record['tAlignedSeq'])))
 
     alignVals, start_clipped_bases, end_clipped_bases, genome_loc \
         = clip_m5_alignment(
@@ -543,9 +573,9 @@ def parse_m5_record(r_m5_record):
             end_clipped_bases)
 
 def parse_m5_output(align_output, batch_reads_data):
-    alignments = dict(
-        (read_fn_sg, None) for read_fn_sg in batch_reads_data.keys())
+    alignments = dict((read_fn_sg, None) for read_fn_sg in batch_reads_data)
     for line in align_output:
+        line = line.decode()
         r_m5_record = dict(zip(M5_FIELDS, line.strip().split()))
         if len(r_m5_record) != len(M5_FIELDS): continue
@@ -558,16 +588,15 @@
     batch_align_failed_reads = []
     batch_align_data = {}
-    for read_fn_sg, r_m5_record in alignments.iteritems():
+    for read_fn_sg, r_m5_record in alignments.items():
         if r_m5_record is None:
             batch_align_failed_reads.append(
                 ('Alignment not produced.', read_fn_sg))
         else:
             try:
-                batch_align_data[read_fn_sg] = parse_m5_record(
-                    r_m5_record)
+                batch_align_data[read_fn_sg] = parse_m5_record(r_m5_record)
             except Exception as e:
-                batch_align_failed_reads.append((str(e), read_fn_sg))
+                batch_align_failed_reads.append((unicode(e), read_fn_sg))
 
     return batch_align_failed_reads, batch_align_data
@@ -578,7 +607,7 @@ def parse_cigar(strand):
             (int(reg_len), reg_type) for reg_len, reg_type in
             CIGAR_PAT.findall(r_sam_record['cigar'])]
         if len(cigar) < 1:
-            raise RuntimeError, 'Invalid cigar string produced.'
+            raise NotImplementedError('Invalid cigar string produced.')
 
         if strand == '-':
             cigar = cigar[::-1]
@@ -612,9 +641,10 @@ def get_qseq(cigar, strand):
     def get_tseq(qSeq, start_clipped_bases, end_clipped_bases, cigar, strand):
         tLen = sum([reg_len for reg_len, reg_type in cigar
                     if reg_type in 'MDN=X'])
-        tSeq = genome_index[r_sam_record['rName']][
-            int(r_sam_record['pos']) - 1:
-            int(r_sam_record['pos']) + tLen - 1]
+        tSeq = genome_index.get_seq(
+            r_sam_record['rName'],
+            int(r_sam_record['pos']) - 1,
+            int(r_sam_record['pos']) + tLen - 1)
         if strand == '-': tSeq = th.rev_comp(tSeq)
 
         # check that cigar starts and ends with matched bases
@@ -672,15 +702,16 @@ def get_align_vals(tSeq, qSeq, cigar, strand):
 def parse_sam_output(align_output, batch_reads_data, genome_index):
     # create dictionary with empty slot to each read
     alignments = dict(
-        (read_fn_sg, None) for read_fn_sg in batch_reads_data.keys())
+        (read_fn_sg, None) for read_fn_sg in batch_reads_data)
     for line in align_output:
+        line = line.decode()
         if line.startswith('@'): continue
         r_sam_record = dict(zip(SAM_FIELDS, line.strip().split()))
         if len(r_sam_record) < len(SAM_FIELDS): continue
         if r_sam_record['rName'] == '*': continue
         # store the alignment if none is stored for this read or
         # if this read has the lowest map quality thus far
-        qName = r_sam_record['qName'].replace(th.FN_SPACE_FILLER, ' ')
+        qName = r_sam_record['qName'].replace(FN_SPACE_FILLER, ' ')
         if alignments[qName] is None or \
            int(alignments[qName]['mapq']) < \
            int(r_sam_record['mapq']):
@@ -688,7 +719,7 @@ def parse_sam_output(align_output, batch_reads_data, genome_index):
 
     batch_align_failed_reads = []
     batch_align_data = {}
-    for read_fn_sg, r_sam_record in alignments.iteritems():
+    for read_fn_sg, r_sam_record in alignments.items():
         if r_sam_record is None:
             batch_align_failed_reads.append(
                 ('Alignment not produced (if all reads failed ' +
@@ -698,42 +729,44 @@
                 batch_align_data[read_fn_sg] = parse_sam_record(
                     r_sam_record, genome_index)
             except Exception as e:
-                #raise
-                batch_align_failed_reads.append((str(e), read_fn_sg))
+                # uncomment to identify mysterious errors
+                #raise
+                batch_align_failed_reads.append((unicode(e), read_fn_sg))
 
     return batch_align_failed_reads, batch_align_data
 
 def prep_graphmap_options(
         genome_fn, read_fn, out_fn, output_format, num_align_ps):
     return ['align', '-r', genome_fn, '-d', read_fn, '-o', out_fn,
-            '-L', output_format, '-t', str(num_align_ps)]
+            '-L', output_format, '-t', unicode(num_align_ps)]
 
 def prep_bwa_mem_options(genome_fn, read_fn, num_align_ps):
-    return ['mem', '-x', 'ont2d', '-v', '1', '-t', str(num_align_ps),
+    return ['mem', '-x', 'ont2d', '-v', '1', '-t', unicode(num_align_ps),
             genome_fn, read_fn]
 
 def prep_minimap2_options(genome_fn, read_fn, num_align_ps, index_fn):
     mapper_genome = genome_fn if index_fn is None else index_fn
-    return ['-ax', 'map-ont', '-t', str(num_align_ps), mapper_genome, read_fn]
+    return ['-ax', 'map-ont', '-t', unicode(num_align_ps), mapper_genome, read_fn]
 
 def align_to_genome(batch_reads_data, genome_fn, mapper_data, genome_index,
                     num_align_ps, output_format='sam'):
     # prepare fasta text with batch reads
     batch_reads_fasta = ''
     for read_fn_sg, (_, _, basecalls, _, _, _) in \
-        batch_reads_data.iteritems():
+        batch_reads_data.items():
         # note spaces aren't allowed in read names so replace with
         # vertical bars and undo to retain file names
-        batch_reads_fasta += ">" + read_fn_sg.replace(' ', th.FN_SPACE_FILLER) + \
-                             '\n' + ''.join(basecalls) + '\n'
+        batch_reads_fasta += (
+            ">" + read_fn_sg.replace(' ', FN_SPACE_FILLER) + \
+            '\n' + ''.join(basecalls) + '\n')
 
     read_fp = NamedTemporaryFile(suffix='.fasta')
-    read_fp.write(batch_reads_fasta)
+    read_fp.write(batch_reads_fasta.encode())
     read_fp.flush()
     out_fp = NamedTemporaryFile()
 
     # optionally suppress output from mapper with devnull sink
-    with open(os.devnull, 'w') as FNULL:
+    with io.open(os.devnull, 'wb') as FNULL:
         if mapper_data.type == 'graphmap':
             mapper_options = prep_graphmap_options(
                 genome_fn, read_fp.name, out_fp.name,
@@ -748,7 +781,7 @@
                 genome_fn, read_fp.name, num_align_ps, mapper_data.index)
             stdout_sink = out_fp
         else:
-            raise RuntimeError, 'Mapper not supported.'
+            raise NotImplementedError('Mapper not supported.')
 
     try:
         exitStatus = call([mapper_data.exe,] + mapper_options,
@@ -765,8 +798,7 @@
             'Problem running/parsing genome mapper. ' +
             'Ensure you have a compatible version installed. ' +
             'Potentially failed to locate BWA index files.',
-            read_fn_sg) for read_fn_sg
-            in batch_reads_data.keys()], [])
+            read_fn_sg) for read_fn_sg in batch_reads_data], [])
 
     if output_format == 'sam':
         batch_parse_failed_reads, batch_align_data = parse_sam_output(
             align_output, batch_reads_data, genome_index)
@@ -775,7 +807,7 @@
         batch_parse_failed_reads, batch_align_data = parse_m5_output(
             align_output, batch_reads_data)
     else:
-        raise RuntimeError, 'Mapper output type not supported.'
+        raise NotImplementedError('Mapper output type not supported.')
 
     clip_fix_align_data = fix_all_clipped_bases(
         batch_align_data, batch_reads_data)
@@ -792,7 +824,7 @@ def fix_stay_states(
     event_change_state = move_states[0]
     while not event_change_state:
         if start_clip >= len(move_states) - 2:
-            raise RuntimeError, (
+            raise NotImplementedError(
                 'Read is composed entirely of stay model ' +
                 'states and cannot be processed')
         start_clip += 1
@@ -828,7 +860,7 @@
     try:
         fast5_data = h5py.File(fast5_fn, 'r')
     except:
-        raise NotImplementedError, (
+        raise NotImplementedError(
             'Error opening file for alignment. This should have ' +
             'been caught during the HDF5 prep phase. Check that there ' +
             'are no other tombo processes or processes accessing ' +
@@ -844,26 +876,21 @@
             '/Analyses/' + basecall_group + '/' + basecall_subgroup +
             '/Events'].value
     except:
-        raise RuntimeError, (
+        raise NotImplementedError(
             'No events or corrupted events in file. Likely a ' +
             'segmentation error or mis-specified basecall-' +
             'subgroups (--2d?).')
 
     rna = th.is_read_rna(fast5_data)
-    try:
-        raw_attrs = dict(
-            fast5_data['/Raw/Reads/'].values()[0].attrs.items())
-        if rna:
-            raw_len = fast5_data['/Raw/Reads/'].values()[0]['Signal'].shape[0]
-    except:
-        raise RuntimeError, (
-            'Raw data is not stored in Raw/Reads/Read_[read#] so ' +
-            'new segments cannot be identified.')
+    raw_slot = th.get_raw_read_slot(fast5_data)
+    raw_attrs = dict(raw_slot.attrs.items())
+    if rna:
+        raw_len = raw_slot['Signal'].shape[0]
     try:
         channel_info = th.get_channel_info(fast5_data)
         fast5_data.close()
     except:
-        raise RuntimeError, (
+        raise NotImplementedError(
             'Error getting channel information and closing fast5 file.')
 
     read_id = raw_attrs['read_id']
@@ -899,7 +926,7 @@
         starts_rel_to_read = np.round(
            starts_rel_to_read *
            channel_info.sampling_rate).astype('int_') - abs_event_start
-        kmer_reference_offset = 2
+        kmer_dom_pos = 2
         fix_read_start = False
     elif albacore_version < LooseVersion("2.0"):
         # compute event starts from length slot as start slot is less
@@ -907,7 +934,7 @@
         starts_rel_to_read = np.cumsum(np.concatenate(
            [[0,], np.round(called_dat['length'] *
                            channel_info.sampling_rate).astype('int_')]))
-        kmer_reference_offset = 2
+        kmer_dom_pos = 1
         # Fix floating point errors in abs_event_start by comparing to
         # potential breakpoints using resquiggle criterion
         # don't actually fix here to avoid reading raw signal twice
@@ -924,11 +951,11 @@
         # move to the second position (from the third previously)
         # but raw was introduced into rna basecalling one minor release later
         if rna and albacore_version < LooseVersion("2.1"):
-            kmer_reference_offset = 2
+            kmer_dom_pos = 2
         else:
-            kmer_reference_offset = 1
+            kmer_dom_pos = 1
         fix_read_start = False
-    basecalls = np.array([event_state[kmer_reference_offset]
+    basecalls = np.array([event_state.decode()[kmer_dom_pos]
                           for event_state in called_dat['model_state']])
 
     if rna:
@@ -948,10 +975,10 @@
     if any(len(vals) <= 1 for vals in (
            starts_rel_to_read, basecalls,
            called_dat['model_state'])):
-        raise NotImplementedError, (
+        raise NotImplementedError(
            'One or no segments or signal present in read.')
    if min(np.diff(starts_rel_to_read)) < 1:
-        raise NotImplementedError, (
+        raise NotImplementedError(
            'Zero length event present in input data.')
 
     # remove stay states from the base caller
@@ -974,12 +1001,12 @@ def align_and_parse(
             read_data = get_read_data(
                 fast5_fn, basecall_group, bc_subgroup)
             batch_reads_data[
-                bc_subgroup + th.FASTA_NAME_JOINER + fast5_fn] = read_data
+                bc_subgroup + FASTA_NAME_JOINER + fast5_fn] = read_data
         except Exception as e:
             # uncomment to identify mysterious errors
             #raise
             batch_get_data_failed_reads.append((
-                str(e), bc_subgroup + th.FASTA_NAME_JOINER + fast5_fn))
+                unicode(e), bc_subgroup + FASTA_NAME_JOINER + fast5_fn))
 
     batch_align_failed_reads, batch_align_data = align_to_genome(
         batch_reads_data, genome_fn, mapper_data,
@@ -990,8 +1017,8 @@
     for fast5_fn, sg_align_data in batch_align_data:
         fn_batch_align_data[fast5_fn].append(sg_align_data)
     # uncomment to identify mysterious errors
-    #print "Get data errors: " + str(batch_get_data_failed_reads)
-    #print "Align read errors: " + str(batch_align_failed_reads)
+    #print("Get data errors: " + unicode(batch_get_data_failed_reads))
+    #print("Align read errors: " + unicode(batch_align_failed_reads))
 
     return (batch_get_data_failed_reads + batch_align_failed_reads,
             fn_batch_align_data)
@@ -1013,11 +1040,11 @@ def align_reads(
     batch_align_failed_reads, batch_align_data = align_and_parse(
         fast5s_to_process, genome_fn, mapper_data, genome_index,
         basecall_group, basecall_subgroups, num_align_ps)
-    for fast5_fn, sgs_align_data in batch_align_data.iteritems():
+    for fast5_fn, sgs_align_data in batch_align_data.items():
         basecalls_q.put((fast5_fn, sgs_align_data))
     # uncomment to identify mysterious errors
-    #print "Prep reads fail: " + str(batch_prep_failed_reads)
-    #print "Align reads fail: " + str(batch_align_failed_reads)
+    #print("Prep reads fail: " + unicode(batch_prep_failed_reads))
+    #print("Align reads fail: " + unicode(batch_align_failed_reads))
 
     return batch_prep_failed_reads + batch_align_failed_reads
@@ -1026,11 +1053,11 @@ def alignment_worker(
         mapper_data, basecall_group, basecall_subgroups,
         corrected_group, overwrite, num_align_ps):
     # this is only needed for sam output format (not m5)
-    genome_index = th.parse_fasta(genome_fn)
+    genome_index = th.Fasta(genome_fn)
     while not fast5_q.empty():
         try:
             fast5_batch = fast5_q.get(block=False)
-        except Queue.Empty:
+        except queue.Empty:
             break
 
         batch_failed_reads = align_reads(
@@ -1039,7 +1066,7 @@
             corrected_group, basecalls_q, overwrite, num_align_ps)
         for failed_read in batch_failed_reads:
             try:
-                sg_fn = failed_read[1].split(th.FASTA_NAME_JOINER)
+                sg_fn = failed_read[1].split(FASTA_NAME_JOINER)
                 if len(sg_fn) == 2:
                     subgroup, fast5_fn = sg_fn
                 else:
@@ -1066,7 +1093,8 @@ def resquiggle_all_reads(
         basecall_group, basecall_subgroups, corrected_group, norm_type,
         outlier_thresh, timeout, num_cpts_limit, overwrite,
         align_batch_size, num_align_ps, align_threads_per_proc,
-        num_resquiggle_ps, compute_sd, pore_model, skip_index, obs_filter):
+        num_resquiggle_ps, compute_sd, pore_model, skip_index, obs_filter,
+        seg_params):
     manager = mp.Manager()
     fast5_q = manager.Queue()
     # set maximum number of parsed basecalls to sit in the middle queue
@@ -1091,24 +1119,25 @@
         mapper_data, basecall_group, basecall_subgroups,
         corrected_group, overwrite, align_threads_per_proc)
     align_ps = []
-    for p_id in xrange(num_align_ps):
+    for p_id in range(num_align_ps):
         p = mp.Process(target=alignment_worker, args=align_args)
         p.start()
         align_ps.append(p)
 
     rsqgl_args = (basecalls_q, failed_reads_q, index_q, basecall_group,
                   corrected_group, norm_type, outlier_thresh, timeout,
-                  num_cpts_limit, compute_sd, pore_model, obs_filter)
+                  num_cpts_limit, compute_sd, pore_model, obs_filter,
+                  seg_params)
     resquiggle_ps = []
-    for p_id in xrange(num_resquiggle_ps):
+    for p_id in range(num_resquiggle_ps):
         p = mp.Process(target=resquiggle_worker, args=rsqgl_args)
         p.start()
         resquiggle_ps.append(p)
 
     if VERBOSE: sys.stderr.write(
-            'Correcting ' + str(num_reads) + ' files with ' +
-            str(len(basecall_subgroups)) + ' subgroup(s)/read(s) ' +
-            'each (Will print a dot for each ' + str(PROGRESS_INTERVAL) +
+            'Correcting ' + unicode(num_reads) + ' files with ' +
+            unicode(len(basecall_subgroups)) + ' subgroup(s)/read(s) ' +
+            'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) +
             ' reads completed).\n')
     failed_reads = defaultdict(list)
     all_index_data = []
@@ -1116,24 +1145,24 @@
         try:
             errorType, fn = failed_reads_q.get(block=False)
             failed_reads[errorType].append(fn)
-        except Queue.Empty:
+        except queue.Empty:
             sleep(1)
             continue
 
     # add None entries to basecalls_q to indicate that all reads have
     # been basecalled and processed
-    for _ in xrange(num_resquiggle_ps):
+    for _ in range(num_resquiggle_ps):
         basecalls_q.put((None, None))
 
     while any(p.is_alive() for p in resquiggle_ps):
         try:
             errorType, fn = failed_reads_q.get(block=False)
             failed_reads[errorType].append(fn)
-        except Queue.Empty:
+        except queue.Empty:
             try:
                 proc_index_data = index_q.get(block=False)
                 all_index_data.extend(proc_index_data)
-            except Queue.Empty:
+            except queue.Empty:
                 sleep(1)
                 continue
@@ -1163,11 +1192,11 @@ def check_for_albacore(files, basecall_group, num_reads=50):
             continue
 
     if not has_albacore:
-        sys.stderr.write(
-            '******** WARNING ********* The provided FAST5 files do not ' +
-            'appear to contain albacore basecalling events. ' +
-            'tombo is only tested on albacore formatted results ' +
-            'and other basecallers may not produce desired results.\n')
+        th._warning_message(
+            'The provided FAST5 files do not ' +
+            'appear to contain albacore basecalling events. ' +
+            'tombo is only tested on albacore formatted results ' +
+            'and other basecallers may not produce desired results.')
 
     return
@@ -1180,10 +1209,9 @@ def event_resquiggle_main(args):
     if all(map_exe is None for map_exe in (
            args.minimap2_executable, args.bwa_mem_executable,
            args.graphmap_executable)):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must provide either a ' + \
-            'minimap2, graphmap or bwa-mem executable.\n' + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'Must provide either a minimap2, graphmap or ' +
+            'bwa-mem executable.')
     if args.minimap2_executable is not None:
         mapper_data = mapperData(
             args.minimap2_executable, 'minimap2', args.minimap2_index)
@@ -1195,10 +1223,8 @@
     if VERBOSE: sys.stderr.write('Getting file list.\n')
     try:
         if not os.path.isdir(args.fast5_basedir):
-            sys.stderr.write(
-                '*' * 60 + '\nERROR: Provided --fast5-basedir is ' +
-                'not a directory.\n' + '*' * 60 + '\n')
-            sys.exit()
+            th._error_message_and_exit(
+                'Provided --fast5-basedir is not a directory.')
         fast5_basedir = (
             args.fast5_basedir if args.fast5_basedir.endswith('/') else
             args.fast5_basedir + '/')
@@ -1207,16 +1233,14 @@
         index_fn = th.get_index_fn(fast5_basedir, args.corrected_group)
         if os.path.exists(index_fn): os.remove(index_fn)
     except OSError:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Reads base directory, a sub-directory ' +
+        th._error_message_and_exit(
+            'Reads base directory, a sub-directory ' +
             'or an old (hidden) index file does not appear to be ' +
-            'accessible. Check directory permissions.\n' + '*' * 60 + '\n')
-        sys.exit()
+            'accessible. Check directory permissions.')
     if len(files) < 1:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: No files identified in the specified ' +
-            'directory or within immediate subdirectories.\n' + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'No files identified in the specified directory or ' +
+            'within immediate subdirectories.')
 
     check_for_albacore(files, args.basecall_group)
@@ -1252,20 +1276,20 @@
         args.timeout, args.cpts_limit, args.overwrite,
         args.alignment_batch_size, args.align_processes,
         align_threads_per_proc, num_resquiggle_ps, compute_sd,
-        pore_model, args.skip_index, obs_filter)
+        pore_model, args.skip_index, obs_filter, args.segmentation_parameters)
     if not args.skip_index:
         th.write_index_file(all_index_data, index_fn, fast5_basedir)
 
     fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()]
     if len(fail_summary) > 0:
-        total_num_failed = sum(zip(*fail_summary)[1])
-        sys.stderr.write('Failed reads summary (' + str(total_num_failed) +
+        total_num_failed = sum(map(itemgetter(1), fail_summary))
+        sys.stderr.write('Failed reads summary (' + unicode(total_num_failed) +
                          ' total failed):\n' + '\n'.join(
-                             "\t" + err + " :\t" + str(n_fns)
+                             "\t" + err + " :\t" + unicode(n_fns)
                              for err, n_fns in sorted(fail_summary)) + '\n')
     else:
         sys.stderr.write('All reads successfully re-squiggled!\n')
     if args.failed_reads_filename is not None:
-        with open(args.failed_reads_filename, 'w') as fp:
+        with io.open(args.failed_reads_filename, 'wt') as fp:
             fp.write('\n'.join((
                 err + '\t' + ', '.join(fns)
                 for err, fns in failed_reads.items())) + '\n')
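The new FN_SPACE_FILLER and FASTA_NAME_JOINER constants above exist because spaces are not legal in FASTA/SAM query names: the basecall subgroup and FAST5 filename are joined with ':::' and any spaces masked with '|||' before the batch is handed to the mapper, then both transformations are undone when the SAM output is parsed. A small round-trip sketch (the subgroup and path values are hypothetical):

    FN_SPACE_FILLER = '|||'
    FASTA_NAME_JOINER = ':::'

    # as written to the temporary FASTA handed to minimap2/bwa-mem/graphmap
    read_fn_sg = 'BaseCalled_template' + FASTA_NAME_JOINER + '/data/read 42.fast5'
    fasta_name = read_fn_sg.replace(' ', FN_SPACE_FILLER)

    # as recovered from the SAM qName field after mapping
    qName = fasta_name.replace(FN_SPACE_FILLER, ' ')
    bc_subgroup, fast5_fn = qName.split(FASTA_NAME_JOINER)
    assert (bc_subgroup, fast5_fn) == ('BaseCalled_template', '/data/read 42.fast5')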
diff --git a/tombo/_model_resquiggle.py b/tombo/_model_resquiggle.py
index ad3786f..416dcfa 100644
--- a/tombo/_model_resquiggle.py
+++ b/tombo/_model_resquiggle.py
@@ -1,6 +1,10 @@
-import os, sys
+from __future__ import unicode_literals, absolute_import
 
-import Queue
+from builtins import range, dict, map, zip
+
+import io
+import sys
+import queue
 
 import numpy as np
 np.seterr(all='raise')
@@ -8,15 +12,19 @@
 
 from time import sleep
 from itertools import repeat
+from operator import itemgetter
 from collections import defaultdict
 
+if sys.version_info[0] > 2:
+    unicode = str
+
 # import tombo functions
-import tombo_stats as ts
-import tombo_helper as th
+from . import tombo_stats as ts
+from . import tombo_helper as th
 
-from c_helper import c_new_means
-from dynamic_programming import c_reg_z_scores, c_base_forward_pass, \
-    c_base_traceback
+from .dynamic_programming import traceback, forward_pass
+from .c_helper import c_new_means
+from .c_dynamic_programming import c_reg_z_scores, c_base_traceback
 
 VERBOSE = False
@@ -65,65 +73,6 @@ def write_switch(s_fp, switch_points, reg_id, iter_num=0):
                  for sig_i in sig_is) + '\n')
     return
 
-
-def forward_pass(reg_z_scores, min_obs_per_base):
-    # dynamic programming algorithm to find modeled signal to base assignment
-
-    # fill banded path with cumulative probabilties from the previous signal
-    # either in the current base or the previous base (left or diagonal left
-    # from associated plotting)
-
-    # get the first row data
-    prev_b_data, (prev_b_start, prev_b_end) = reg_z_scores[0]
-    prev_b_fwd_data = np.cumsum(prev_b_data)
-    # store number of observations since last diagonal at each position
-    # - forces forward pass to allow legal traceback paths while
-    #   enforcing the minimum observations per base threshold
-    # - should also help from optimization pushing poor fitting bases
-    #   to assign only an observation or two
-    # - will also use this data to traceback all reasonable paths
-    prev_b_last_diag = np.ones(prev_b_end - prev_b_start,
-                               dtype=np.int32) * min_obs_per_base
-    # first row is just a cumsum since there is no previous row
-    reg_fwd_scores = [(prev_b_fwd_data, prev_b_last_diag,
-                       (prev_b_start, prev_b_end))]
-
-    for b_data, (b_start, b_end) in reg_z_scores[1:]:
-        b_fwd_data, prev_b_last_diag = c_base_forward_pass(
-            b_data, b_start, b_end,
-            prev_b_data, prev_b_start, prev_b_end,
-            prev_b_fwd_data, prev_b_last_diag, min_obs_per_base)
-
-        # consider storing data to form traceback in one go at the
-        # end of this loop
-        reg_fwd_scores.append((
-            b_fwd_data, prev_b_last_diag, (b_start, b_end)))
-        prev_b_data, prev_b_fwd_data, prev_b_start, prev_b_end = (
-            b_data, b_fwd_data, b_start, b_end)
-
-    return reg_fwd_scores
-
-def traceback(reg_fwd_scores, min_obs_per_base):
-    # traceback along maximally likely path
-
-    # initilize array to store new segments
-    new_segs = np.empty(len(reg_fwd_scores) - 1, dtype=np.int32)
-    # get first two bases of data for lookups
-    curr_base_sig = 1
-    curr_b_data, _, (curr_start, curr_end) = reg_fwd_scores[-1]
-    next_b_data, _, (next_start, next_end) = reg_fwd_scores[-2]
-    new_segs[-1] = c_base_traceback(
-        curr_b_data, curr_start, next_b_data, next_start, next_end,
-        curr_end - 1, min_obs_per_base)
-    for base_pos in range(len(reg_fwd_scores) - 3, -1, -1):
-        curr_b_data, curr_start = next_b_data, next_start
-        next_b_data, _, (next_start, next_end) = reg_fwd_scores[base_pos]
-        new_segs[base_pos] = c_base_traceback(
-            curr_b_data, curr_start, next_b_data, next_start, next_end,
-            new_segs[base_pos+1] - 1, min_obs_per_base)
-
-    return new_segs
-
 def get_best_event_path(reg_z_scores, b_switch_pnts, min_obs_per_base):
     # calc cumulative sums for more efficient region sum computations
     reg_cumm_z = [(np.cumsum(np.concatenate([[0], b_data])), b_start)
@@ -170,7 +119,7 @@ def get_base_z_mean(base_cumsum, b_start, curr_pos, prev_pos, prev_sum):
             curr_max_path = prev_path
             curr_max_sum = sp_event_mean_z
 
-    return np.array(curr_max_path[1:], dtype=np.int32)
+    return np.array(curr_max_path[1:], dtype=np.int64)
 
 def traceback_until(
         reg_fwd_scores, start_base, seq_pos, b_switch_pnts,
@@ -230,7 +179,7 @@ def find_all_tb_paths(reg_z_scores, reg_fwd_scores, global_tb, min_obs_per_base,
     # unlikely with a window of 3 original bases.
     tb_b_ranges = np.concatenate([[0], global_tb, [
         reg_fwd_scores[-1][-1][-1] + 1]])
-    tb_b_ranges = zip(tb_b_ranges[:-1], tb_b_ranges[1:] - 1)
+    tb_b_ranges = list(zip(tb_b_ranges[:-1], tb_b_ranges[1:] - 1))
     for base_pos, seq_pos in req_locations:
         path_i = []
         # add this position as a switch point for this base
@@ -330,9 +279,9 @@ def base_space_pass(reg_z_scores):
 
         return new_segs
 
-    if (min_obs_per_base * (reg_end - reg_start - 1) >=
-        r_b_starts[reg_end - 1] - r_b_starts[reg_start]):
-        raise NotImplementedError, (
+    if ((min_obs_per_base * (reg_end - reg_start)) >=
+        (r_b_starts[reg_end] - r_b_starts[reg_start])):
+        raise NotImplementedError(
             'Not enough signal to correct poor fitting region.')
 
     reg_z_scores = c_reg_z_scores(
@@ -355,33 +304,34 @@ def filter_regions(signif_shift_regs, r_prev_new_segs, r_pp_segs):
     return filtered_regs
 
 def model_resquiggle_read(
-        r_data, kmer_ref, kmer_width, upstrm_bases, dnstrm_bases, z_trans_lag,
-        z_thresh, reg_context, base_reg_context, max_base_shift, b_max_base_shift,
-        min_obs_per_base, base_space_iters, new_corr_grp, compute_sd,
-        debug_fps=None):
+        r_data, std_ref, z_trans_lag, z_thresh, reg_context, base_reg_context,
+        max_base_shift, b_max_base_shift, min_obs_per_base, base_space_iters,
+        new_corr_grp, compute_sd, debug_fps=None):
     # should also get signal here
     all_read_data = th.get_all_read_data(r_data)
     if all_read_data is None:
-        raise NotImplementedError, ('Error parsing data from FAST5 file.')
+        raise NotImplementedError('Error parsing data from FAST5 file.')
     (r_means, r_seq, r_sig, r_b_starts, scale_vals, norm_type,
      outlier_thresh, genome_loc) = all_read_data
 
-    r_ref_means, r_ref_sds = zip(*[
-        kmer_ref[kmer] for kmer in [''.join(bs) for bs in zip(*[
-            r_seq[i:] for i in range(kmer_width)])]])
+    r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(r_seq, std_ref)
+    dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1
+
     # add upstream NANs so all data passed to model shifts is on the same
     # coordinate system. Note that the nan values will never be accessed
     # as the shift regions don't let a region extend beyond the non-nan
     # statistic values
-    r_ref_means = np.concatenate((([np.NAN] * upstrm_bases), r_ref_means))
-    r_ref_sds = np.concatenate((([np.NAN] * upstrm_bases), r_ref_sds))
+    r_ref_means = np.concatenate((([np.NAN] * std_ref.central_pos), r_ref_means))
+    r_ref_sds = np.concatenate((([np.NAN] * std_ref.central_pos), r_ref_sds))
 
     # add NAN values so that shifted regions will line up with original
     # base regions since kmer upstream and downstream positions can't be tested
     window_z = np.concatenate((
-        [np.NAN] * upstrm_bases,
+        [np.NAN] * std_ref.central_pos,
         ts.calc_window_z_transform(
-            r_means[upstrm_bases:-dnstrm_bases], r_ref_means[upstrm_bases:],
-            r_ref_sds[upstrm_bases:], z_trans_lag), [np.NAN] * dnstrm_bases))
+            r_means[std_ref.central_pos:-dnstrm_bases],
+            r_ref_means[std_ref.central_pos:],
+            r_ref_sds[std_ref.central_pos:], z_trans_lag),
+        [np.NAN] * dnstrm_bases))
 
     signif_shift_regs = ts.get_read_signif_shift_regions(
         window_z, z_thresh, reg_context)
@@ -400,10 +350,12 @@
     # on sequence (which is un-changed)
         r_means = c_new_means(r_sig, r_prev_new_segs)
         window_z = np.concatenate((
-            [np.NAN] * upstrm_bases,
+            [np.NAN] * std_ref.central_pos,
             ts.calc_window_z_transform(
-                r_means[upstrm_bases:-dnstrm_bases], r_ref_means[upstrm_bases:],
-                r_ref_sds[upstrm_bases:], z_trans_lag), [np.NAN] * dnstrm_bases))
+                r_means[std_ref.central_pos:-dnstrm_bases],
+                r_ref_means[std_ref.central_pos:],
+                r_ref_sds[std_ref.central_pos:], z_trans_lag),
+            [np.NAN] * dnstrm_bases))
         signif_shift_regs = ts.get_read_signif_shift_regions(
             window_z, z_thresh, base_reg_context)
         # filter regions that didn't change in the last round of
@@ -437,26 +389,25 @@ def model_resquiggle_worker(
         reg_context, base_reg_context, max_base_shift, b_max_base_shift,
         min_obs_per_base, base_space_iters, new_corr_grp, compute_sd,
         overwrite, in_place, corr_group):
-    kmer_ref, upstrm_bases, _, _ = ts.parse_tombo_model(tb_model_fn)
-    kmer_width = len(next(kmer_ref.iterkeys()))
-    dnstrm_bases = kmer_width - upstrm_bases - 1
+    std_ref = ts.TomboModel(tb_model_fn)
 
     if DEBUG_SIGNAL or DEBUG_BASE:
-        sig_fp = open('debug_signal_space.signal.txt', 'w')
+        sig_fp = io.open('debug_signal_space.signal.txt', 'wt')
         sig_fp.write('SignalPos\tSignal\tRegion\tIteration\n')
-        zscore_fp = open('debug_signal_space.window_z_scores.txt', 'w')
+        zscore_fp = io.open('debug_signal_space.window_z_scores.txt', 'wt')
         zscore_fp.write('BasePos\tSignalPos\tZScore\tRegion\tIteration\n')
-        origP_fp = open('debug_signal_space.window_orig_path.txt', 'w')
+        origP_fp = io.open('debug_signal_space.window_orig_path.txt', 'wt')
         origP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n')
-        tb_fp = open('debug_signal_space.window_traceback.txt', 'w')
+        tb_fp = io.open('debug_signal_space.window_traceback.txt', 'wt')
         tb_fp.write('BasePos\tSignalPos\tpathVal\tRegion\tIteration\n')
-        ld_fp = open('debug_signal_space.window_last_diag.txt', 'w')
+        ld_fp = io.open('debug_signal_space.window_last_diag.txt', 'wt')
         ld_fp.write('BasePos\tSignalPos\tLastDiagCount\tRegion\tIteration\n')
-        sigMaxP_fp = open('debug_signal_space.window_signal_max_path.txt', 'w')
+        sigMaxP_fp = io.open(
+            'debug_signal_space.window_signal_max_path.txt', 'wt')
         sigMaxP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n')
-        maxP_fp = open('debug_signal_space.window_max_path.txt', 'w')
+        maxP_fp = io.open('debug_signal_space.window_max_path.txt', 'wt')
         maxP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n')
-        spP_fp = open('debug_signal_space.window_switch_points.txt', 'w')
+        spP_fp = io.open('debug_signal_space.window_switch_points.txt', 'wt')
         spP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n')
         debug_fps = (sig_fp, zscore_fp, origP_fp, tb_fp, ld_fp,
                      sigMaxP_fp, maxP_fp, spP_fp)
@@ -467,7 +418,7 @@
     while True:
         try:
             fn_reads = reads_q.get(block=False)
-        except Queue.Empty:
+        except queue.Empty:
             break
 
         num_processed += 1
@@ -489,20 +440,21 @@
         for r_data in fn_reads:
             try:
                 model_resquiggle_read(
-                    r_data, kmer_ref, kmer_width, upstrm_bases, dnstrm_bases,
-                    z_trans_lag, z_thresh, reg_context, base_reg_context,
-                    max_base_shift, b_max_base_shift, min_obs_per_base,
-                    base_space_iters, new_corr_grp, compute_sd, debug_fps)
+                    r_data, std_ref, z_trans_lag, z_thresh, reg_context,
+                    base_reg_context, max_base_shift, b_max_base_shift,
+                    min_obs_per_base, base_space_iters, new_corr_grp,
+                    compute_sd, debug_fps)
             except Exception as e:
                 # uncomment to identify mysterious errors
                 #raise
                 try:
                     subgrp = r_data.corr_group.split('/')[1]
-                    th.write_error_status(r_data.fn, corr_group, subgrp, str(e))
+                    th.write_error_status(
+                        r_data.fn, corr_group, subgrp, unicode(e))
                 except:
                     pass
                 failed_reads_q.put((
-                    str(e), r_data.corr_group + th.FASTA_NAME_JOINER + r_data.fn))
+                    unicode(e), r_data.corr_group + ':::' + r_data.fn))
 
     return
@@ -536,11 +488,11 @@ def model_resquiggle(
 
     # group reads by filename so slot is not deleted in 2D reads
     fn_grouped_reads = defaultdict(list)
-    for cs_reads in raw_read_coverage.itervalues():
+    for cs_reads in raw_read_coverage.values():
         for r_data in cs_reads:
             fn_grouped_reads[r_data.fn].append(r_data)
     num_reads = 0
-    for fn_reads in fn_grouped_reads.itervalues():
+    for fn_reads in fn_grouped_reads.values():
         reads_q.put(fn_reads)
         num_reads += 1
@@ -550,22 +502,22 @@
         min_obs_per_base, base_space_iters, new_corr_grp, compute_sd,
         overwrite, in_place, corr_group)
     mod_rsqgl_ps = []
-    for p_id in xrange(num_processes):
+    for p_id in range(num_processes):
         p = mp.Process(target=model_resquiggle_worker, args=mod_rsqgl_args)
         p.start()
         mod_rsqgl_ps.append(p)
 
     if VERBOSE: sys.stderr.write(
-            'Correcting ' + str(num_reads) + ' files with ' +
-            str(len(bc_subgrps)) + ' subgroup(s)/read(s) ' +
-            'each (Will print a dot for each ' + str(PROGRESS_INTERVAL) +
+            'Correcting ' + unicode(num_reads) + ' files with ' +
+            unicode(len(bc_subgrps)) + ' subgroup(s)/read(s) ' +
+            'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) +
             ' reads completed).\n')
     failed_reads = defaultdict(list)
     while any(p.is_alive() for p in mod_rsqgl_ps):
         try:
             errorType, fn = failed_reads_q.get(block=False)
             failed_reads[errorType].append(fn)
-        except Queue.Empty:
+        except queue.Empty:
             sleep(1)
             continue
     while not failed_reads_q.empty():
@@ -597,15 +549,15 @@ def model_resquiggle_main(args):
 
     fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()]
     if len(fail_summary) > 0:
-        total_num_failed = sum(zip(*fail_summary)[1])
-        sys.stderr.write('Failed reads summary (' + str(total_num_failed) +
+        total_num_failed = sum(map(itemgetter(1), fail_summary))
+        sys.stderr.write('Failed reads summary (' + unicode(total_num_failed) +
                          ' total failed):\n' + '\n'.join(
-                             "\t" + err + " :\t" + str(n_fns)
+                             "\t" + err + " :\t" + unicode(n_fns)
                              for err, n_fns in sorted(fail_summary)) + '\n')
     else:
         sys.stderr.write('All reads successfully re-squiggled!\n')
     if args.failed_reads_filename is not None:
-        with open(args.failed_reads_filename, 'w') as fp:
+        with io.open(args.failed_reads_filename, 'wt') as fp:
             fp.write('\n'.join((
                 err + '\t' + ', '.join(fns)
                 for err, fns in failed_reads.items())) + '\n')
@@ -614,5 +566,5 @@
 
 if __name__ == '__main__':
-    raise NotImplementedError, (
+    raise NotImplementedError(
         'This is a module. See commands with `tombo -h`')
open(args.failed_reads_filename, 'w') as fp: + with io.open(args.failed_reads_filename, 'wt') as fp: fp.write('\n'.join(( err + '\t' + ', '.join(fns) for err, fns in failed_reads.items())) + '\n') @@ -614,5 +566,5 @@ def model_resquiggle_main(args): if __name__ == '__main__': - raise NotImplementedError, ( + raise NotImplementedError( 'This is a module. See commands with `tombo -h`') diff --git a/tombo/_option_parsers.py b/tombo/_option_parsers.py index 73a01c6..470a8a1 100644 --- a/tombo/_option_parsers.py +++ b/tombo/_option_parsers.py @@ -1,132 +1,189 @@ +from __future__ import unicode_literals, absolute_import + +from builtins import map + +import sys import argparse +if sys.version_info[0] > 2: + unicode = str + +from ._default_parameters import SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, \ + LLR_THRESH, HYPO_THRESH, ALTERNATE_MODELS + +ALT_BASES = tuple(set(alt_name.split('_')[1] for alt_name in ALTERNATE_MODELS)) + ################################## ###### Positional arguments ###### ################################## basedir_opt=('fast5_basedir', { + 'type':unicode, 'help':'Directory containing fast5 files. All files ending in "fast5" ' + 'found recursively within this base directory will be processed.'}) -fasta_pos_opt=('genome_fasta', {'help':'Path to fasta file for mapping.'}) +fasta_pos_opt=( + 'genome_fasta', {'type':unicode, 'help':'Path to fasta file for mapping.'}) ############################ ###### Text arguments ###### ############################ -minimap2_opt=('--minimap2-executable', {'help':'Path to minimap2 executable.'}) +minimap2_opt=('--minimap2-executable', { + 'type':unicode, 'help':'Path to minimap2 executable.'}) minindx_opt=('--minimap2-index', { - 'help':'Path to minimap2 index (with map-ont preset) file corresponding ' + - 'to the [genome_fasta] provided.'}) -bwamem_opt=('--bwa-mem-executable', {'help':'Path to bwa-mem executable.'}) -graphmap_opt=('--graphmap-executable', {'help':'Path to graphmap executable.'}) + 'type':unicode, 'help':'Path to minimap2 index (with map-ont preset) ' + 'file corresponding to the [genome_fasta] provided.'}) +bwamem_opt=('--bwa-mem-executable', { + 'type':unicode, 'help':'Path to bwa-mem executable.'}) +graphmap_opt=('--graphmap-executable', { + 'type':unicode, 'help':'Path to graphmap executable.'}) poremod_opt=('--pore-model-filename', { - 'help':'File containing kmer model parameters (level_mean and ' + - 'level_stdv) used in order to compute kmer-based corrected pA ' + + 'type':unicode, + 'help':'File containing kmer model parameters (level_mean ' + + 'and level_stdv) used in order to compute kmer-based corrected pA ' + 'values. E.g. https://github.com/jts/nanopolish/blob/master/etc/' + 'r9-models/template_median68pA.5mers.model'}) tbmod_opt=('--tombo-model-filename', { - 'help':'Tombo model for event-less resquiggle and significance testing. ' + - 'If no model is provided the default DNA or RNA tombo model will be used.'}) + 'type':unicode, 'help':'Tombo model filename. 
If no file is provided, ' +
+    'the default DNA or RNA Tombo model will be used.'})
+tbmod_w_opt=('--tombo-model-filename', {
+    'type':unicode, 'help':'Filename to save Tombo model.'})
 atbmod_opt=('--alternate-model-filename', {
+    'type':unicode,
     'help':'Tombo model for alternative likelihood ratio significance testing.'})
-atbmods_opt=('--alternate-model-filenames', {
-    'nargs':'+',
-    'help':'Tombo models for alternative likelihood ratio significance testing.'})
+hidden_tbmod_opt=('--tombo-model-filename', {
+    'type':unicode, 'help':argparse.SUPPRESS})
+hidden_atbmod_opt=('--alternate-model-filename', {
+    'type':unicode, 'help':argparse.SUPPRESS})
+hidden_atbmods_opt=('--alternate-model-filenames', {
+    'type':unicode, 'nargs':'+', 'help':argparse.SUPPRESS})
 altname_opt=('--alternate-model-name', {
+    'type':unicode,
     'help':'A short name to associate with this alternate model (e.g. 5mC, ' +
-    '4mC, 6mA). This text will be included in output filenames when this model ' +
-    'is used for testing.'})
+    '6mA, etc.). This text will be included in output filenames when this ' +
+    'model is used for testing.'})
 failed_opt=('--failed-reads-filename', {
     'help':'Output failed read filenames with associated error. Default: ' +
     'Do not store failed reads.'})
 sfast5dir_opt = ('--fast5-basedir', {
-    'help':'Directory containing fast5 files.'})
+    'type':unicode, 'help':'Directory containing fast5 files.'})
 fast5dir_opt = ('--fast5-basedirs', {
-    'nargs':'+', 'help':'Directories containing fast5 files.'})
+    'type':unicode, 'nargs':'+', 'help':'Directories containing fast5 files.'})
 ctrlfast5dir_opt=('--control-fast5-basedirs', {
-    'nargs':'+',
-    'help':'Control set of directories containing fast5 files. These reads ' +
-    'should contain only standard nucleotides.'})
+    'type':unicode, 'nargs':'+',
+    'help':'Set of directories containing fast5 files for control reads, ' +
+    'containing only canonical nucleotides.'})
 corrgrp_opt=('--corrected-group', {
-    'default':'RawGenomeCorrected_000',
+    'type':unicode, 'default':'RawGenomeCorrected_000',
     'help':'FAST5 group created by resquiggle command. Default: %(default)s'})
 correvntgrp_opt=('--corrected-group', {
-    'default':'RawGenomeCorrected_000',
+    'type':unicode, 'default':'RawGenomeCorrected_000',
     'help':'FAST5 group created by resquiggle command. Default: %(default)s'})
 newcorrgrp_opt=('--new-corrected-group', {
-    'default':'RawModelCorrected_000',
+    'type':unicode, 'default':'RawModelCorrected_000',
     'help':'FAST5 group created by resquiggle command. Default: %(default)s'})
 bcgrp_opt=('--basecall-group', {
-    'default':'Basecall_1D_000',
+    'type':unicode, 'default':'Basecall_1D_000',
     'help':'FAST5 group containing original basecalls (under Analyses group). ' +
     'Default: %(default)s'})
 bcsubgrps_opt=('--basecall-subgroups', {
-    'default':['BaseCalled_template',], 'nargs':'+',
+    'type':unicode, 'default':['BaseCalled_template',], 'nargs':'+',
     'help':'FAST5 subgroup(s) (under /Analyses/[--basecall-group]/) containing ' +
     'basecalls and created within [--corrected-group] containing re-squiggle ' +
     'results. Default: %(default)s'})
 bcsubgrp_opt=('--basecall-subgroup', {
-    'default':'BaseCalled_template',
+    'type':unicode, 'default':'BaseCalled_template',
     'help':'FAST5 subgroup (under /Analyses/[--basecall-group]/) under which ' +
     'to store basecalls from FASTQs. Default: %(default)s'})
 gnmloc_opt=('--genome-locations', {
-    'nargs':'+',
+    'type':unicode, 'nargs':'+',
     'help':'Genomic locations at which to plot signal. Format locations ' +
     'as "chrm:position[:strand] [chrm2:position2[:strand2] ...]" ' +
     '(strand not applicable for all applications)'})
 fasta_opt=('--genome-fasta', {
+    'type':unicode,
     'help':'FASTA file used to re-squiggle. For faster sequence access.'})
 motif_opt=('--motif', {
+    'type':unicode,
     'help':'Motif of interest at which to plot signal and statistics. ' +
     'Supports IUPAC single letter codes (use T for RNA).'})
 obsfilt_opt=('--obs-per-base-filter', {
-    'nargs':'+', 'default':[],
+    'type':unicode, 'nargs':'+', 'default':[],
     'help':'Filter reads based on observations per base percentile ' +
     'thresholds. Format thresholds as "percentile:thresh ' +
     '[pctl2:thresh2 ...]". For example to filter reads with 99th ' +
     'pctl > 200 obs/base or max > 5k obs/base use "99:200 100:5000".'})
 fastqs_opt = ('--fastq-filenames', {
-    'nargs':'+', 'help':'FASTQ filenames containing basecalls to be added to ' +
+    'type':unicode, 'nargs':'+',
+    'help':'FASTQ filenames containing basecalls to be added to ' +
     'raw FAST5 files.'})
 wigfn_opt=('--wiggle-basename', {
-    'default':'tombo_results',
+    'type':unicode, 'default':'tombo_results',
     'help':'Basename for output wiggle files. Two files (plus and minus ' +
     'strand) will be produced for each --wiggle-types supplied. ' +
     'Filenames formatted as "[wiggle-basename].[wiggle-type].' +
     '[sample|control]?.[plus|minus].wig". Default: %(default)s'})
 pdf_opt=('--pdf-filename', {
-    'help':'PDF filename to store plot(s). Default: %(default)s'})
+    'type':unicode, 'help':'PDF filename to store plot(s). Default: %(default)s'})
 statfn_opt=('--statistics-filename', {
-    'help':"File to save/load base by base statistics."})
+    'type':unicode, 'help':"File to save/load base by base statistics."})
 statbsnm_opt=('--statistics-file-basename', {
+    'type':unicode,
     'help':"File base name to save base by base statistics from testing. " +
     "Filenames will be [--statistics-file-basename]." +
     "[--alternate-bases]?.tombo.stats"})
 rdata_opt=('--r-data-filename', {
+    'type':unicode,
     'help':"Filename to save R data structure. Default: Don't save"})
 seqs_opt=('--sequences-filename', {
+    'type':unicode,
     'help':'File for sequences from selected regions. Sequences will be ' +
     'stored in FASTA format. Default: %(default)s.'})
 densbn_opt=('--save-density-basename', {
+    'type':unicode,
     'help':"Basename to save alternative model density estimation " +
     "information. See scripts/debug_est_alt.R for example usage. " +
     "Default: Don't save."})
 altden_opt=('--alternate-density-filename', {
+    'type':unicode,
     'help':'File containing k-mer level kernel density estimates for the ' +
     'alternative sample saved using --save-density-basename.'})
 ctrlden_opt=('--control-density-filename', {
+    'type':unicode,
     'help':'File containing k-mer level kernel density estimates for the ' +
     'control sample saved using --save-density-basename.'})
+prstatbn_opt=('--per-read-statistics-basename', {
+    'type':unicode,
+    'help':'Base for binary files containing per-read statistics from ' +
+    'statistical testing. Filenames will be [--per-read-statistics-basename].' +
+    '[--alternate-bases]?.tombo.per_read_stats'})
+prstat_opt=('--per-read-statistics-filename', {
+    'type':unicode,
+    'help':'Binary file containing per-read statistics from ' +
+    'statistical testing.'})
+
+statfns_opt=('--statistics-filenames', {
+    'type':unicode, 'nargs':'+',
+    'help':"Files to load base by base statistics."})
+motifdesc_opt=('--motif-descriptions', {
+    'type':unicode, 'nargs':'+',
+    'help':'Ground truth, motif-centered, modified base descriptions for ' +
+    'computing ROC and PR curves. Each statistics file is associated with ' +
+    'a set of motif descriptions. Format descriptions as: "motif:mod_pos:name' +
+    '[::motif2:mod_pos2:name2...]". The mod_pos indicates the modified base ' +
+    'within the motif (1-based index). Example: CCWGG:2:"dcm 5mC"::GATC:2:' +
+    '"dam 6mA" would assess the performance of a single Tombo statistics ' +
+    'file for identification of E. coli dam and dcm methylation.'})
 ############################
@@ -173,16 +230,12 @@
     'help':'Observations of each k-mer required to include a read in ' +
     'read level averages. Default: %(default)d'})
-bndwdth_opt=('--bandwidth', {
-    'type':int, 'default':501,
-    'help':'Bandwidth of events for dynamic sequence to event mapping. ' +
-    'Default: %(default)d'})
 minobs_opt=('--min-obs-per-base', {
     'type':int,
     'help':'Minimum raw observations to assign to a genomic base. ' +
     'Default: %(default)d'})
 covthresh_opt=('--coverage-threshold', {
-    'type':int, 'default':100,
+    'type':int,
     'help':'Maximum mean coverage per region when estimating k-mer model ' +
     '(limits compute time for deep samples). Default: %(default)d'})
 maxbase_opt=('--max-bases-shift', {
@@ -215,7 +268,7 @@
     'help':'Number of bases offset over which to search when computing ' +
     'distances for signal cluster plotting. Default: 0 (exact position)'})
 cntxt_opt=('--num-context', {
-    'type':int, 'default':2,
+    'type':int, 'default':5,
     'help':'Number of context bases around motif. Default: %(default)d'})
 numstat_opt=('--num-statistics', {
     'type':int, 'default':200,
@@ -236,7 +289,7 @@
     'help':'Number of context bases up and downstream over which to compute ' +
     "Stouffer's Z combined z-scores. Default: %(default)d."})
 regcntxt_opt=('--region-context', {
-    'type':int, 'default':1,
+    'type':int, 'default':2,
     'help':'Number of context bases up and downstream of poorly fit ' +
     'regions to perform model re-squiggle. Default: %(default)d.'})
 brcntxt_opt=('--base-score-region-context', {
@@ -254,6 +307,21 @@
     'help':'Number of reads required at a position to perform significance ' +
     'testing or contribute to model estimation. Default: %(default)d'})
+segpars_opt=('--segmentation-parameters', {
+    'type':int, 'nargs':3,
+    'help':'Specify the 3 parameters for segmentation 1) running neighboring ' +
+    'windows width 2) minimum raw observations per genomic base 3) mean raw ' +
+    'observations per event. Sample type defaults: ' +
+    ' || '.join((bst + ' : ' + ' '.join(map(str, params)))
+                for bst, params in SEG_PARAMS_TABLE.items())})
+segpars2_opt=('--segmentation-parameters', {
+    'type':int, 'nargs':2,
+    'help':'Specify the 2 parameters for segmentation 1) running neighboring ' +
+    'windows width 2) minimum raw observations per genomic base. Sample type ' +
+    'defaults:\n' +
+    ' || '.join((bst + ' : ' + ' '.join(map(str, params[:2])))
+                for bst, params in SEG_PARAMS_TABLE.items())})
+
 ###############################
 ###### Boolean arguments ######
@@ -273,7 +341,7 @@
 estmean_opt=('--estimate-mean', {
     'default':False, 'action':'store_true',
     'help':"Use the mean instead of median for model level estimation. Note:" +
-    "This can cause poor fits due to outliers"})
+    " This can cause poor fits due to outliers"})
 kmspec_opt=('--kmer-specific-sd', {
     'default':False, 'action':'store_true',
     'help':"Estimate standard deviation for each k-mer individually."})
@@ -290,10 +358,6 @@
     'default':False, 'action':'store_true',
     'help':'Plot k-mer means across whole reads as opposed to ' +
     'individual k-mer event levels.'})
-statord_opt=('--statistic-order', {
-    'default':False, 'action':'store_true',
-    'help':"Order selected locations by p-values or mean likelihood ratio. " +
-    "Default: fraction of significant reads."})
 boxc_opt=('--box-center', {
     'default':False, 'action':'store_true',
     'help':"Plot a box around the central base."})
@@ -319,14 +383,6 @@
 ###### Float arguments ######
 ##############################
-mexpct_opt=('--match-expected-value', {
-    'type':float, 'default':0.5,
-    'help':'Expected value when a matched event to genomic sequence is ' +
-    'encountered. Default: %(default)f'})
-skippen_opt=('--skip-penalty', {
-    'type':float, 'default':1.0,
-    'help':'Penalty applied to skipped genomic bases in event to sequence ' +
-    'assignment. Default: %(default)f'})
 otlthresh_opt=('--outlier-threshold', {
     'default':5, 'type':float,
     'help':'Winsorize the signal at this number of scale values. ' +
@@ -339,7 +395,7 @@
     'type':float,
     'help':'P-value or log likelihood ratio threshold when computing ' +
     'fraction of significant reads at each genomic position. Default: ' +
-    'pvalue:0.01; likelihood ratio:2'})
+    'p-value:{0:.2g}; likelihood ratio:{1:.2g}'.format(HYPO_THRESH, LLR_THRESH)})
 altfrac_opt=('--alt-fraction-percentile', {
     'default':1, 'type':float,
     'help':'When estimating the alternative base incorporation rate, this ' +
@@ -351,9 +407,6 @@
     'help':'Bandwidth applied when performing Gaussian kernel density ' +
     'estimation on standard and alternative base signal distributions. ' +
     'Default: %(default)f'})
-qvalthresh_opt=('--q-value-threshold', {
-    'type':float,
-    'help':'Plot all regions below provided q-value. Overrides --num-regions.'})
 pctfilt_opt=('--percent-to-filter', {
     'type':float, 'default':10,
     'help':'Percentage of all reads to filter. Reads are randomly selected ' +
@@ -362,6 +415,19 @@
 fxdscl_opt=('--fixed-scale', {
     'type':float,
     'help':'Fixed scaling parameter to use for raw signal normalization.'})
+cvgdmp_opt=('--coverage-dampen-counts', {
+    'type':float, 'nargs':2, 'default':[2,0.5],
+    'help':'Dampen fraction modified estimates for low coverage sites. Two ' +
+    'parameters are pseudo unmodified and modified read counts. This is ' +
+    'equivalent to a beta prior on the fraction estimate. Default: %(default)s'})
+
+sigapars_opt=('--signal-align-parameters', {
+    'type':float, 'nargs':4,
+    'help':'Specify the 4 parameters for signal to genome sequence alignment ' +
+    'algorithm 1) match expected value 2) skip penalty 3) bandwidth 4) mean ' +
+    'signal segmentation half-normal score threshold. 
Sample type defaults: ' + + ' || '.join((bst + ' : ' + ' '.join(map(str, params))) + for bst, params in ALGN_PARAMS_TABLE.items())}) ############################## @@ -369,6 +435,7 @@ ############################## normtype_opt=('--normalization-type', { + 'type':unicode, 'default':'median', 'choices':('median', 'pA', 'pA_raw', 'none'), 'help':'Choices: "none": raw 16-bit DAQ values, "pA_raw": pA as in the ' + 'ONT events (using offset, range and digitization), "pA": k-mer-based ' + @@ -383,26 +450,25 @@ 'default':2, 'type':int, 'choices':(0,1,2,3,4), 'help':'Downstream bases in k-mer. Default: %(default)d'}) altbs_opt=('--alternate-model-base', { - 'choices':('A','C','G','T'), + 'type':unicode, 'choices':('A','C','G','T'), 'help':'Non-standard base is an alternative to this base.'}) modbs_opt=('--alternate-bases', { - 'choices':('5mC',), 'nargs':'+', + 'type':unicode, 'choices':ALT_BASES, 'nargs':'+', 'help':'Default non-standard base model for testing.'}) paltmod_opt=('--plot-alternate-model', { - 'choices':('5mC',), + 'type':unicode, 'choices':ALT_BASES, 'help':'Add alternative model distribution to the plot.'}) - regtype_opt=('--region-type', { - 'default':'random', 'choices':['random', 'start', 'end'], + 'type':unicode, 'default':'random', 'choices':['random', 'start', 'end'], 'help':'Region to plot within each read. Default: random'}) ovplttype_opt=('--overplot-type', { - 'default':'Downsample', + 'type':unicode, 'default':'Downsample', 'choices':['Downsample', 'Boxplot', 'Quantile', 'Density'], 'help':'Plot type for regions with higher coverage. Default: Downsample'}) wigtypes_opt=('--wiggle-types', { - 'default':['coverage', 'fraction'], 'nargs':'+', - 'choices':['coverage', 'fraction', 'signal', 'signal_sd', 'length', - 'stat', 'mt_stat', 'difference'], + 'type':unicode, 'default':['coverage', 'fraction'], 'nargs':'+', + 'choices':['coverage', 'fraction', 'dampened_fraction', 'signal', + 'signal_sd', 'dwell', 'difference'], 'help':'Data types of wiggles to produce. 
Default: "coverage fraction"'}) dna_opt=('--dna', { @@ -446,6 +512,16 @@ def add_default_args(parser): return fast5_args, misc_args, parser +def add_comp_dist_args(parser): + alt_args = parser.add_argument_group('Comparison Arguments') + alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) + alt_args.add_argument(pstdmod_opt[0], **pstdmod_opt[1]) + alt_args.add_argument(paltmod_opt[0], **paltmod_opt[1]) + alt_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) + alt_args.add_argument(hidden_atbmod_opt[0], **hidden_atbmod_opt[1]) + + return alt_args, parser + ##################################### ###### Main re-squiggle parser ###### @@ -460,29 +536,23 @@ def get_eventless_resquiggle_parser(): req_args.add_argument(basedir_opt[0], **basedir_opt[1]) req_args.add_argument(fasta_pos_opt[0], **fasta_pos_opt[1]) - mapper_args = parser.add_argument_group( - 'Mapper Arguments (One mapper is required)') - mapper_args.add_argument(minimap2_opt[0], **minimap2_opt[1]) - mapper_args.add_argument(minindx_opt[0], **minindx_opt[1]) - mapper_args.add_argument(bwamem_opt[0], **bwamem_opt[1]) - mapper_args.add_argument(graphmap_opt[0], **graphmap_opt[1]) - mapper_args.add_argument(batchsize_opt[0], **batchsize_opt[1]) - mod_args = parser.add_argument_group('Model Parameters') - mod_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) mod_args.add_argument(dna_opt[0], **dna_opt[1]) mod_args.add_argument(rna_opt[0], **rna_opt[1]) + mod_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) alg_args = parser.add_argument_group( 'Event to Sequence Assignment Parameters') - alg_args.add_argument(mexpct_opt[0], **mexpct_opt[1]) - alg_args.add_argument(skippen_opt[0], **skippen_opt[1]) - alg_args.add_argument(bndwdth_opt[0], **bndwdth_opt[1]) - alg_args.add_argument(fitscl_opt[0], **fitscl_opt[1]) - alg_args.add_argument(fxdscl_opt[0], **fxdscl_opt[1]) - alg_args.add_argument(otlthresh_opt[0], **otlthresh_opt[1]) + alg_args.add_argument(segpars_opt[0], **segpars_opt[1]) + alg_args.add_argument(sigapars_opt[0], **sigapars_opt[1]) + + sig_args = parser.add_argument_group( 'Signal Scaling Parameters') + sig_args.add_argument(fitscl_opt[0], **fitscl_opt[1]) + sig_args.add_argument(fxdscl_opt[0], **fxdscl_opt[1]) + sig_args.add_argument(otlthresh_opt[0], **otlthresh_opt[1]) io_args = parser.add_argument_group('Input/Output Arguments') + io_args.add_argument(minindx_opt[0], **minindx_opt[1]) io_args.add_argument(skpidx_opt[0], **skpidx_opt[1]) io_args.add_argument(failed_opt[0], **failed_opt[1]) io_args.add_argument(incldsd_opt[0], **incldsd_opt[1]) @@ -491,10 +561,7 @@ def get_eventless_resquiggle_parser(): filt_args.add_argument(obsfilt_opt[0], **obsfilt_opt[1]) multi_args = parser.add_argument_group('Multiprocessing Arguments') - multi_args.add_argument(proc_opt[0], default=2, **proc_opt[1]) - multi_args.add_argument(alignproc_opt[0], **alignproc_opt[1]) - multi_args.add_argument(alignthrds_opt[0], **alignthrds_opt[1]) - multi_args.add_argument(rsqglproc_opt[0], **rsqglproc_opt[1]) + multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) fast5_args = parser.add_argument_group('FAST5 Data Arguments') fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1]) @@ -532,6 +599,7 @@ def get_event_resquiggle_parser(): norm_args.add_argument(normtype_opt[0], **normtype_opt[1]) norm_args.add_argument(poremod_opt[0], **poremod_opt[1]) norm_args.add_argument(otlthresh_opt[0], **otlthresh_opt[1]) + norm_args.add_argument(segpars2_opt[0], **segpars2_opt[1]) filt_args = 
parser.add_argument_group('Read Filtering Arguments') filt_args.add_argument(obsfilt_opt[0], **obsfilt_opt[1]) @@ -572,11 +640,11 @@ def get_model_resquiggle_parser(): reg_args.add_argument(pvalthrsh_opt[0], **pvalthrsh_opt[1]) modr_args = parser.add_argument_group('Model Re-squiggle Arguments') - modr_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) modr_args.add_argument(dna_opt[0], **dna_opt[1]) modr_args.add_argument(rna_opt[0], **rna_opt[1]) modr_args.add_argument(maxbase_opt[0], **maxbase_opt[1]) modr_args.add_argument(minobs_opt[0], default=3, **minobs_opt[1]) + modr_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) brsqgl_args = parser.add_argument_group('Base Scoring Arguments') brsqgl_args.add_argument(bsiters_opt[0], **bsiters_opt[1]) @@ -633,7 +701,7 @@ def get_est_ref_parser(): 'and testing without an amplified (un-modified) sample.', add_help=False) req_args = parser.add_argument_group('Required Arguments') req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) - req_args.add_argument(tbmod_opt[0], required=True, **tbmod_opt[1]) + req_args.add_argument(tbmod_w_opt[0], required=True, **tbmod_w_opt[1]) stat_args = parser.add_argument_group('Modeling Arguments') stat_args.add_argument(estmean_opt[0], **estmean_opt[1]) @@ -643,7 +711,7 @@ def get_est_ref_parser(): filt_args = parser.add_argument_group('Filtering Arguments') filt_args.add_argument(minreads_opt[0], default=10, **minreads_opt[1]) - filt_args.add_argument(covthresh_opt[0], **covthresh_opt[1]) + filt_args.add_argument(covthresh_opt[0], default=100, **covthresh_opt[1]) filt_args.add_argument(minkmer_opt[0], default=5, **minkmer_opt[1]) multi_args = parser.add_argument_group('Multiprocessing Arguments') @@ -673,10 +741,12 @@ def get_est_alt_ref_parser(): dens_args.add_argument(altden_opt[0], **altden_opt[1]) dens_args.add_argument(ctrlden_opt[0], **ctrlden_opt[1]) - stat_args = parser.add_argument_group('Standard Model Arguments') - stat_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) - stat_args.add_argument(dna_opt[0], **dna_opt[1]) - stat_args.add_argument(rna_opt[0], **rna_opt[1]) + mod_args = parser.add_argument_group('Standard Model Arguments') + mod_args.add_argument(dna_opt[0], **dna_opt[1]) + mod_args.add_argument(rna_opt[0], **rna_opt[1]) + mod_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) + + stat_args = parser.add_argument_group('Model Fitting Arguments') stat_args.add_argument(altfrac_opt[0], **altfrac_opt[1]) stat_args.add_argument(kernden_opt[0], **kernden_opt[1]) @@ -686,6 +756,9 @@ def get_est_alt_ref_parser(): io_args = parser.add_argument_group('Output Argument') io_args.add_argument(densbn_opt[0], **densbn_opt[1]) + multi_args = parser.add_argument_group('Multiprocessing Arguments') + multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) + fast5_args, misc_args, parser = add_default_args(parser) return parser @@ -720,18 +793,21 @@ def get_test_signif_parser(): alt_args = parser.add_argument_group( 'Comparison Arguments (Default: De novo testing against default ' + 'standard model)') + alt_args.add_argument(modbs_opt[0], **modbs_opt[1]) alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) alt_args.add_argument(dna_opt[0], **dna_opt[1]) alt_args.add_argument(rna_opt[0], **rna_opt[1]) - alt_args.add_argument(atbmods_opt[0], **atbmods_opt[1]) - alt_args.add_argument(modbs_opt[0], **modbs_opt[1]) + alt_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) + 
alt_args.add_argument(hidden_atbmods_opt[0], **hidden_atbmods_opt[1]) test_args = parser.add_argument_group('Significance Test Arguments') test_args.add_argument(fmo_opt[0], **fmo_opt[1]) - test_args.add_argument(minreads_opt[0], default=5, **minreads_opt[1]) + test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) test_args.add_argument(snglrdthrsh_opt[0], **snglrdthrsh_opt[1]) + io_args = parser.add_argument_group('Output Argument') + io_args.add_argument(prstatbn_opt[0], **prstatbn_opt[1]) + multi_args = parser.add_argument_group('Multiprocessing Arguments') multi_args.add_argument(mpreg_opt[0], **mpreg_opt[1]) multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) @@ -805,12 +881,7 @@ def get_max_cov_parser(): req_args = parser.add_argument_group('Required Argument') req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) - alt_args = parser.add_argument_group('Comparison Arguments') - alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) - alt_args.add_argument(atbmod_opt[0], **atbmod_opt[1]) - alt_args.add_argument(pstdmod_opt[0], **pstdmod_opt[1]) - alt_args.add_argument(paltmod_opt[0], **paltmod_opt[1]) + alt_args, parser = add_comp_dist_args(parser) ovplt_args = parser.add_argument_group('Overplotting Arguments') ovplt_args.add_argument(ovpltthresh_opt[0], **ovpltthresh_opt[1]) @@ -836,12 +907,7 @@ def get_genome_loc_parser(): req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) req_args.add_argument(gnmloc_opt[0], required=True, **gnmloc_opt[1]) - alt_args = parser.add_argument_group('Comparison Arguments') - alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) - alt_args.add_argument(atbmod_opt[0], **atbmod_opt[1]) - alt_args.add_argument(pstdmod_opt[0], **pstdmod_opt[1]) - alt_args.add_argument(paltmod_opt[0], **paltmod_opt[1]) + alt_args, parser = add_comp_dist_args(parser) ovplt_args = parser.add_argument_group('Overplotting Arguments') ovplt_args.add_argument(ovpltthresh_opt[0], **ovpltthresh_opt[1]) @@ -868,12 +934,7 @@ def get_motif_loc_parser(): req_args.add_argument(motif_opt[0], required=True, **motif_opt[1]) req_args.add_argument(fasta_opt[0], required=True, **fasta_opt[1]) - alt_args = parser.add_argument_group('Comparison Arguments') - alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) - alt_args.add_argument(atbmod_opt[0], **atbmod_opt[1]) - alt_args.add_argument(pstdmod_opt[0], **pstdmod_opt[1]) - alt_args.add_argument(paltmod_opt[0], **paltmod_opt[1]) + alt_args, parser = add_comp_dist_args(parser) ovplt_args = parser.add_argument_group('Overplotting Arguments') ovplt_args.add_argument(ovpltthresh_opt[0], **ovpltthresh_opt[1]) @@ -926,12 +987,7 @@ def get_signif_diff_parser(): req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) req_args.add_argument(statfn_opt[0], required=True, **statfn_opt[1]) - alt_args = parser.add_argument_group('Comparison Arguments') - alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) - alt_args.add_argument(atbmod_opt[0], **atbmod_opt[1]) - alt_args.add_argument(pstdmod_opt[0], **pstdmod_opt[1]) - alt_args.add_argument(paltmod_opt[0], **paltmod_opt[1]) + alt_args, parser = add_comp_dist_args(parser) ovplt_args = parser.add_argument_group('Overplotting Arguments') 
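Editor's aside: the `add_comp_dist_args` helper applied above works because every option in this module is declared once as a `(flag, kwargs)` tuple and then unpacked into whichever sub-command parsers need it. A minimal self-contained illustration of the pattern, with hypothetical option names:

    import argparse

    # declare the option once as (flag, kwargs)...
    example_opt = ('--example-count', {
        'type': int, 'default': 4,
        'help': 'An example option. Default: %(default)d'})

    # ...then unpack it into any parser or argument group that needs it
    parser = argparse.ArgumentParser()
    grp = parser.add_argument_group('Example Arguments')
    grp.add_argument(example_opt[0], **example_opt[1])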
ovplt_args.add_argument(ovpltthresh_opt[0], **ovpltthresh_opt[1]) @@ -940,8 +996,6 @@ def get_signif_diff_parser(): reg_args = parser.add_argument_group('Plotting Region Arguments') reg_args.add_argument(numreg_opt[0], default=10, **numreg_opt[1]) reg_args.add_argument(numbases_opt[0], default=21, **numbases_opt[1]) - reg_args.add_argument(qvalthresh_opt[0], **qvalthresh_opt[1]) - reg_args.add_argument(statord_opt[0], **statord_opt[1]) out_args = parser.add_argument_group('Output Arguments') out_args.add_argument(pdf_opt[0], @@ -963,9 +1017,7 @@ def get_signif_motif_parser(): req_args.add_argument(motif_opt[0], required=True, **motif_opt[1]) req_args.add_argument(statfn_opt[0], required=True, **statfn_opt[1]) - alt_args = parser.add_argument_group('Comparison Arguments') - alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) + alt_args, parser = add_comp_dist_args(parser) ovplt_args = parser.add_argument_group('Overplotting Argument') ovplt_args.add_argument(ovpltthresh_opt[0], **ovpltthresh_opt[1]) @@ -974,7 +1026,10 @@ def get_signif_motif_parser(): reg_args.add_argument(numreg_opt[0], default=3, **numreg_opt[1]) reg_args.add_argument(cntxt_opt[0], **cntxt_opt[1]) reg_args.add_argument(numstat_opt[0], **numstat_opt[1]) - reg_args.add_argument(statord_opt[0], **statord_opt[1]) + + seq_args = parser.add_argument_group( + 'Sequence Argument (for faster sequence access)') + seq_args.add_argument(fasta_opt[0], **fasta_opt[1]) out_args = parser.add_argument_group('Output Argument') out_args.add_argument(pdf_opt[0], @@ -990,15 +1045,13 @@ def get_per_read_parser(): description='Plot non-standard base statistic per read at specified ' + 'genomic locations.', add_help=False) req_args = parser.add_argument_group('Required Arguments') - req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) req_args.add_argument(gnmloc_opt[0], required=True, **gnmloc_opt[1]) + req_args.add_argument(prstat_opt[0], required=True, **prstat_opt[1]) - alt_args = parser.add_argument_group('Comparison Arguments') - alt_args.add_argument(tbmod_opt[0], **tbmod_opt[1]) - alt_args.add_argument(atbmod_opt[0], **atbmod_opt[1]) - alt_args.add_argument(fmo_opt[0], **fmo_opt[1]) - alt_args.add_argument(pstdmod_opt[0], **pstdmod_opt[1]) - alt_args.add_argument(paltmod_opt[0], **paltmod_opt[1]) + seq_args = parser.add_argument_group( + 'Sequence Arguments (Provide either FAST5s dir or genome FASTA)') + seq_args.add_argument(fasta_opt[0], **fasta_opt[1]) + seq_args.add_argument(fast5dir_opt[0], **fast5dir_opt[1]) reg_args = parser.add_argument_group('Plotting Region Arguments') reg_args.add_argument(numreads_opt[0], default=100, **numreads_opt[1]) @@ -1006,9 +1059,8 @@ def get_per_read_parser(): reg_args.add_argument(boxc_opt[0], **boxc_opt[1]) out_args = parser.add_argument_group('Output Argument') - out_args.add_argument(pdf_opt[0], - default=OUTPUT_BASE + '.per_read_stats.pdf', - **pdf_opt[1]) + out_args.add_argument( + pdf_opt[0], default=OUTPUT_BASE + '.per_read_stats.pdf', **pdf_opt[1]) fast5_args, misc_args, parser = add_default_args(parser) @@ -1092,6 +1144,27 @@ def get_kmer_dist_parser(): return parser +def get_roc_parser(): + parser = argparse.ArgumentParser( + description='Plot ROC curve given known motif(s).', + add_help=False) + req_args = parser.add_argument_group('Required Argument') + req_args.add_argument(statfns_opt[0], required=True, **statfns_opt[1]) + req_args.add_argument(motifdesc_opt[0], required=True, **motifdesc_opt[1]) + 
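Editor's aside: the `--motif-descriptions` format required above ("motif:mod_pos:name[::motif2:mod_pos2:name2...]") can be handled with a double split. A sketch assuming names contain no colons; this helper is hypothetical, not Tombo's parser:

    def parse_motif_descs(arg):
        # 'CCWGG:2:"dcm 5mC"::GATC:2:"dam 6mA"' ->
        # [('CCWGG', 2, 'dcm 5mC'), ('GATC', 2, 'dam 6mA')]
        parsed = []
        for one_desc in arg.split('::'):
            motif, mod_pos, name = one_desc.split(':')
            parsed.append((motif, int(mod_pos), name.strip('"')))
        return parsed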
req_args.add_argument(fasta_opt[0], required=True, **fasta_opt[1]) + + out_args = parser.add_argument_group('Output Arguments') + out_args.add_argument(pdf_opt[0], + default=OUTPUT_BASE + '.roc.pdf', + **pdf_opt[1]) + + filt_args = parser.add_argument_group('Filtering Arguments') + filt_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) + + misc_args, parser = add_misc_args(parser) + + return parser + def get_cluster_signif_diff_parser(): parser = argparse.ArgumentParser( description='Cluster signal trace differences at most significant ' + @@ -1112,7 +1185,6 @@ def get_cluster_signif_diff_parser(): reg_args = parser.add_argument_group('Plotting Region Arguments') reg_args.add_argument(numreg_opt[0], default=10, **numreg_opt[1]) reg_args.add_argument(numbases_opt[0], default=21, **numbases_opt[1]) - reg_args.add_argument(qvalthresh_opt[0], **qvalthresh_opt[1]) reg_args.add_argument(slides_opt[0], **slides_opt[1]) out_args = parser.add_argument_group('Output Arguments') @@ -1143,6 +1215,9 @@ def get_wiggle_parser(): out_args.add_argument(wigfn_opt[0], **wigfn_opt[1]) out_args.add_argument(wigtypes_opt[0], **wigtypes_opt[1]) + stat_args = parser.add_argument_group('Statistical Argument') + stat_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) + fast5_args, misc_args, parser = add_default_args(parser) return parser @@ -1152,17 +1227,17 @@ def get_write_signif_diff_parser(): description='Write sequence at genomic locations with most ' + 'significant difference from previous test_significance results.', add_help=False) - req_args = parser.add_argument_group( - 'Required Arguments (either fast5s or fasts required)') + req_args = parser.add_argument_group('Required Argument') req_args.add_argument(statfn_opt[0], required=True, **statfn_opt[1]) - req_args.add_argument(fast5dir_opt[0], **fast5dir_opt[1]) - req_args.add_argument(fasta_opt[0], **fasta_opt[1]) + + seq_args = parser.add_argument_group( + 'Sequence Arguments (Provide either FAST5s dir or genome FASTA)') + seq_args.add_argument(fasta_opt[0], **fasta_opt[1]) + seq_args.add_argument(fast5dir_opt[0], **fast5dir_opt[1]) reg_args = parser.add_argument_group('Region Selection Arguments') - reg_args.add_argument(statord_opt[0], **statord_opt[1]) reg_args.add_argument(numreg_opt[0], default=100, **numreg_opt[1]) - reg_args.add_argument(numbases_opt[0], default=21, **numbases_opt[1]) - reg_args.add_argument(qvalthresh_opt[0], **qvalthresh_opt[1]) + reg_args.add_argument(numbases_opt[0], default=15, **numbases_opt[1]) out_args = parser.add_argument_group('Output Arguments') out_args.add_argument( @@ -1175,5 +1250,5 @@ def get_write_signif_diff_parser(): if __name__ == '__main__': - raise NotImplementedError, ( + raise NotImplementedError( 'This is a module. 
See commands with `tombo -h`') diff --git a/tombo/_version.py b/tombo/_version.py index 20e0007..6f626d7 100644 --- a/tombo/_version.py +++ b/tombo/_version.py @@ -1 +1,3 @@ -TOMBO_VERSION = '1.1.1' +from __future__ import unicode_literals + +TOMBO_VERSION = '1.2' diff --git a/tombo/dynamic_programming.pyx b/tombo/c_dynamic_programming.pyx similarity index 52% rename from tombo/dynamic_programming.pyx rename to tombo/c_dynamic_programming.pyx index 8dc4914..33e0de2 100644 --- a/tombo/dynamic_programming.pyx +++ b/tombo/c_dynamic_programming.pyx @@ -1,17 +1,22 @@ -from numpy cimport ndarray +cimport cython import numpy as np cimport numpy as np + DTYPE = np.float64 ctypedef np.float64_t DTYPE_t + +DTYPE_INT = np.int64 +ctypedef np.int64_t DTYPE_INT_t + from libcpp cimport bool -def c_base_z_scores(ndarray[DTYPE_t] b_sig not None, - float ref_mean, float ref_sd): - cdef int n_sig = b_sig.shape[0] - cdef ndarray[DTYPE_t] b_z_scores = np.empty(n_sig, dtype=DTYPE) +def c_base_z_scores(np.ndarray[DTYPE_t] b_sig not None, + DTYPE_t ref_mean, DTYPE_t ref_sd): + cdef DTYPE_INT_t n_sig = b_sig.shape[0] + b_z_scores = np.empty(n_sig, dtype=DTYPE) cdef DTYPE_t b_pos_z_score - cdef int idx + cdef DTYPE_INT_t idx for idx in range(n_sig): b_pos_z_score = (b_sig[idx] - ref_mean) / ref_sd if b_pos_z_score > 0: @@ -21,14 +26,19 @@ def c_base_z_scores(ndarray[DTYPE_t] b_sig not None, return b_z_scores def c_reg_z_scores( - ndarray[DTYPE_t] r_sig not None, ndarray[DTYPE_t] r_ref_means not None, - ndarray[DTYPE_t] r_ref_sds not None, ndarray[int] r_b_starts not None, - int reg_start, int reg_end, int max_base_shift, int min_obs_per_base): - cdef int base_i, b_sig_start, b_sig_end, prev_sig_start, prev_sig_end, idx - cdef int reg_len = reg_end - reg_start - cdef ndarray[int] sig_starts = np.empty(reg_len, dtype=np.int32) + np.ndarray[DTYPE_t] r_sig not None, + np.ndarray[DTYPE_t] r_ref_means not None, + np.ndarray[DTYPE_t] r_ref_sds not None, + np.ndarray[DTYPE_INT_t] r_b_starts not None, + DTYPE_INT_t reg_start, DTYPE_INT_t reg_end, + DTYPE_INT_t max_base_shift, DTYPE_INT_t min_obs_per_base): + cdef DTYPE_INT_t base_i, b_sig_start, b_sig_end, prev_sig_start, \ + prev_sig_end, idx + cdef DTYPE_INT_t reg_len = reg_end - reg_start + cdef np.ndarray[DTYPE_INT_t] sig_starts = np.empty(reg_len, dtype=DTYPE_INT) prev_start_set = False - cdef ndarray[int] base_range = np.arange(reg_start, reg_end, dtype=np.int32) + cdef np.ndarray[DTYPE_INT_t] base_range = np.arange( + reg_start, reg_end, dtype=DTYPE_INT) for idx in range(reg_len): base_i = base_range[idx] b_sig_start = r_b_starts[max(reg_start, base_i - max_base_shift)] @@ -41,12 +51,12 @@ def c_reg_z_scores( sig_starts[idx] = b_sig_start prev_sig_start = b_sig_start - cdef ndarray[int] sig_ends = np.empty(reg_len, dtype=np.int32) + cdef np.ndarray[DTYPE_INT_t] sig_ends = np.empty(reg_len, dtype=DTYPE_INT) prev_end_set = False # clip positions from the end of each base for idx in range(reg_len): base_i = base_range[reg_len - idx - 1] - b_sig_end = r_b_starts[min(reg_end, base_i + max_base_shift + 1)] + b_sig_end = r_b_starts[min(reg_end - 1, base_i + max_base_shift + 1)] # clip observations from the end of a base if there is no # possible traceback path through that location if (prev_end_set and @@ -57,40 +67,42 @@ def c_reg_z_scores( prev_sig_end = b_sig_end reg_scores = [] - cdef ndarray[DTYPE_t] b_z_scores for idx in range(reg_len): base_i = base_range[idx] b_sig_start = sig_starts[idx] b_sig_end = sig_ends[idx] # z-score computation is far more efficient than 
p-values and # produces *very* similar results - b_z_scores = c_base_z_scores(r_sig[b_sig_start:b_sig_end], - r_ref_means[base_i], r_ref_sds[base_i]) - - reg_scores.append((b_z_scores, ( - b_sig_start-r_b_starts[reg_start], - b_sig_end-r_b_starts[reg_start]))) + reg_scores.append(( + c_base_z_scores(r_sig[b_sig_start:b_sig_end], + r_ref_means[base_i], r_ref_sds[base_i]), ( + b_sig_start-r_b_starts[reg_start], + b_sig_end-r_b_starts[reg_start]))) return reg_scores def c_base_forward_pass( - ndarray[DTYPE_t] b_data not None, int b_start, int b_end, - ndarray[DTYPE_t] prev_b_data not None, int prev_b_start, int prev_b_end, - ndarray[DTYPE_t] prev_b_fwd_data not None, - ndarray[int] prev_b_last_diag not None, int min_obs_per_base): - cdef int b_len = b_end - b_start + np.ndarray[DTYPE_t] b_data not None, + DTYPE_INT_t b_start, DTYPE_INT_t b_end, + np.ndarray[DTYPE_t] prev_b_data not None, + DTYPE_INT_t prev_b_start, DTYPE_INT_t prev_b_end, + np.ndarray[DTYPE_t] prev_b_fwd_data not None, + np.ndarray[DTYPE_INT_t] prev_b_last_diag not None, + DTYPE_INT_t min_obs_per_base): + cdef DTYPE_INT_t b_len = b_end - b_start # forward pass cumulative z-scores for this base - cdef ndarray[DTYPE_t] b_fwd_data = np.empty(b_len, dtype=DTYPE) + cdef np.ndarray[DTYPE_t] b_fwd_data = np.empty(b_len, dtype=DTYPE) # store last diagonal move to pass on to next base - cdef ndarray[int] b_last_diag = np.empty(b_len, dtype=np.int32) + cdef np.ndarray[DTYPE_INT_t] b_last_diag = np.empty(b_len, dtype=DTYPE_INT) # use cumsum as it is much more efficient than sums - cdef ndarray[DTYPE_t] prev_b_data_cumsum = np.cumsum(prev_b_data) - cdef int pos, last_valid_diag_lag, pos_diag_val - cdef DTYPE_t diag_score, stay_base_score, pos_score + cdef np.ndarray[DTYPE_t] prev_b_data_cumsum = np.cumsum(prev_b_data) + cdef DTYPE_INT_t pos, last_valid_diag_lag, pos_diag_val + cdef DTYPE_t diag_score, stay_base_score, pos_score, fwd_value # add the diagonally below position value for the first possible # position in each base - b_fwd_data[0] = b_data[0] + prev_b_fwd_data[b_start - prev_b_start - 1] + fwd_value = b_data[0] + prev_b_fwd_data[b_start - prev_b_start - 1] + b_fwd_data[0] = fwd_value b_last_diag[0] = 1 # some bases end at the same position (could change this by trimming earlier) @@ -116,31 +128,32 @@ def c_base_forward_pass( # stayed in this base, so add one to the last stayed in base count pos_score, pos_diag_val = ( stay_base_score, b_last_diag[pos - b_start - 1] + 1) - b_fwd_data[pos - b_start] = b_data[pos - b_start] + pos_score + fwd_value = b_data[pos - b_start] + pos_score + b_fwd_data[pos - b_start] = fwd_value b_last_diag[pos - b_start] = pos_diag_val - cdef DTYPE_t curr_fwd_score - cdef int idx, curr_last_diag, reg_left_len + cdef DTYPE_INT_t idx, curr_last_diag, reg_left_len if b_end > prev_b_end + 1: # perform C cumsum until the end of the base # note no possible allowed diagonal moves here - curr_fwd_score = b_fwd_data[prev_b_end - b_start] + fwd_value = b_fwd_data[prev_b_end - b_start] curr_last_diag = b_last_diag[prev_b_end - b_start] reg_left_len = b_end - prev_b_end - 1 for idx in range(reg_left_len): - curr_fwd_score += b_data[idx + prev_b_end - b_start + 1] + fwd_value += b_data[idx + prev_b_end - b_start + 1] curr_last_diag += 1 - b_fwd_data[idx + prev_b_end - b_start + 1] = curr_fwd_score + b_fwd_data[idx + prev_b_end - b_start + 1] = fwd_value b_last_diag[idx + prev_b_end - b_start + 1] = curr_last_diag return b_fwd_data, b_last_diag def c_base_traceback( - ndarray[DTYPE_t] curr_b_data not None, int 
curr_start, - ndarray[DTYPE_t] next_b_data not None, int next_start, int next_end, - int sig_start, int min_obs_per_base): - cdef int curr_base_sig = 1 - cdef int sig_pos + np.ndarray[DTYPE_t] curr_b_data not None, DTYPE_INT_t curr_start, + np.ndarray[DTYPE_t] next_b_data not None, + DTYPE_INT_t next_start, DTYPE_INT_t next_end, + DTYPE_INT_t sig_start, DTYPE_INT_t min_obs_per_base): + cdef DTYPE_INT_t curr_base_sig = 1 + cdef DTYPE_INT_t sig_pos for sig_pos in range(sig_start, -1, -1): curr_base_sig += 1 # if there is not enough signal in the current base or the next base @@ -155,68 +168,59 @@ def c_base_traceback( # Eventless re-squiggle dynamic programming algorithm +@cython.wraparound(False) +@cython.boundscheck(False) def c_banded_forward_pass( - ndarray[DTYPE_t, ndim=2] shifted_z_scores not None, - ndarray[int, ndim=1] event_starts not None, - float skip_pen, float stay_pen): - cdef int n_bases = shifted_z_scores.shape[0] - cdef int bandwidth = shifted_z_scores.shape[1] - cdef ndarray[DTYPE_t, ndim=2] fwd_pass = np.empty((n_bases + 1, bandwidth)) - cdef ndarray[int, ndim=2] fwd_pass_tb = np.empty( - (n_bases + 1, bandwidth), dtype=np.int32) + np.ndarray[DTYPE_t, ndim=2] shifted_z_scores not None, + np.ndarray[DTYPE_INT_t, ndim=1] event_starts not None, + DTYPE_t skip_pen, DTYPE_t stay_pen): + cdef DTYPE_INT_t n_bases = shifted_z_scores.shape[0] + cdef DTYPE_INT_t bandwidth = shifted_z_scores.shape[1] + cdef np.ndarray[DTYPE_t, ndim=2] fwd_pass = np.empty(( + n_bases + 1, bandwidth)) + cdef np.ndarray[DTYPE_INT_t, ndim=2] fwd_pass_tb = np.empty( + (n_bases + 1, bandwidth), dtype=DTYPE_INT) # zero starts let the read start anywhere along the beginning # (for finding the read start) - cdef int idx + cdef DTYPE_INT_t idx for idx in range(bandwidth): - fwd_pass[0,idx] = 0.0 + fwd_pass[0, idx] = 0.0 - cdef int max_from, event_pos, band_pos, seq_pos, prev_b_pos - cdef float max_score, pos_z_score, skip_score, diag_score - # set min score to total sequence times the skip penalty (times 100 for - # good measure) should not be able to obtain a worse score without - # really bad z-score fits (probably from wrong normalization if this occurs - cdef float min_score = -skip_pen * n_bases * 100 + cdef DTYPE_INT_t max_from, band_pos, seq_pos, prev_b_pos + cdef DTYPE_t max_score, pos_z_score, skip_score, diag_score for seq_pos in range(n_bases): - for band_pos in range(bandwidth): - event_pos = band_pos + event_starts[seq_pos] - pos_z_score = shifted_z_scores[seq_pos,band_pos] - prev_b_pos = (event_pos - event_starts[seq_pos-1] + # set first band position to skip score if the bands have the same start + if seq_pos == 0 or event_starts[seq_pos] == event_starts[seq_pos-1]: + fwd_pass[seq_pos + 1, 0] = fwd_pass[seq_pos, 0] - skip_pen + fwd_pass_tb[seq_pos + 1, 0] = 1 + # else use the match score + else: + fwd_pass[seq_pos + 1, 0] = ( + fwd_pass[seq_pos, event_starts[seq_pos] - + event_starts[seq_pos-1] - 1] + + shifted_z_scores[seq_pos, 0]) + fwd_pass_tb[seq_pos + 1, 0] = 2 + + for band_pos in range(1, bandwidth): + pos_z_score = shifted_z_scores[seq_pos, band_pos] + prev_b_pos = (band_pos + event_starts[seq_pos] - + event_starts[seq_pos-1] if seq_pos > 0 else band_pos) - # stay score - max_score = (fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score - if band_pos > 0 else min_score) - max_from = 0 if band_pos > 0 else -1 - # then check skip score - if 0 <= prev_b_pos < bandwidth: - skip_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - if skip_score > max_score: - max_score = skip_score - 
max_from = 1 - # finally check diagonal score - if 0 <= prev_b_pos - 1 < bandwidth: + # first set to stay state + max_score = fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score + max_from = 0 + # then check diagonal score + if prev_b_pos - 1 < bandwidth: diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score if diag_score > max_score: max_score = diag_score max_from = 2 - - # invalid max_from indicates that the min_score was too high - # so just compute skip and diag scores - if max_from == -1: - if prev_b_pos == 0: - max_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - max_from = 1 - elif prev_b_pos == bandwidth: - max_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score - max_from = 2 - else: + # finally check skip score (note nested check to save some ops) + if prev_b_pos < bandwidth: skip_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score - if diag_score > skip_score: - max_score = diag_score - max_from = 2 - else: + if skip_score > max_score: max_score = skip_score max_from = 1 @@ -225,18 +229,20 @@ def c_banded_forward_pass( return fwd_pass, fwd_pass_tb -def c_banded_traceback(ndarray[int, ndim=2] fwd_pass_tb not None, - ndarray[int] event_starts not None, int band_pos, - int band_boundary_thresh=-1): +def c_banded_traceback( + np.ndarray[DTYPE_INT_t, ndim=2] fwd_pass_tb not None, + np.ndarray[DTYPE_INT_t] event_starts not None, DTYPE_INT_t band_pos, + DTYPE_INT_t band_boundary_thresh=-1): # first row in fwd pass is a pseudo-row and does not represent a base - cdef int n_bases = fwd_pass_tb.shape[0] - 1 - cdef int bandwidth = fwd_pass_tb.shape[1] - cdef ndarray[int] seq_poss = np.empty(n_bases + 1, dtype=np.int32) - cdef int curr_event_pos = band_pos + event_starts[n_bases - 1] + cdef DTYPE_INT_t n_bases = fwd_pass_tb.shape[0] - 1 + cdef DTYPE_INT_t bandwidth = fwd_pass_tb.shape[1] + cdef np.ndarray[DTYPE_INT_t] seq_poss = np.empty( + n_bases + 1, dtype=DTYPE_INT) + cdef DTYPE_INT_t curr_event_pos = band_pos + event_starts[n_bases - 1] # last position is the end of the current looking window which is the # passed value seq_poss[n_bases] = curr_event_pos + 1 - cdef int curr_seq_pos + cdef DTYPE_INT_t curr_seq_pos for curr_seq_pos in range(n_bases, 0, -1): band_pos = curr_event_pos - event_starts[curr_seq_pos-1] # 0 indicates stay in the current base @@ -254,39 +260,54 @@ def c_banded_traceback(ndarray[int, ndim=2] fwd_pass_tb not None, return seq_poss +@cython.wraparound(False) +@cython.boundscheck(False) +def c_argmax(np.ndarray[DTYPE_t] vals): + cdef DTYPE_t val + cdef DTYPE_t max_val = vals[0] + cdef DTYPE_INT_t pos + cdef DTYPE_INT_t max_pos = 0 + + for pos in range(1, vals.shape[0]): + val = vals[pos] + if val > max_val: + max_val = val + max_pos = pos + return max_pos + +@cython.wraparound(False) +@cython.boundscheck(False) def c_adaptive_banded_forward_pass( - ndarray[DTYPE_t, ndim=2] fwd_pass not None, - ndarray[int, ndim=2] fwd_pass_tb not None, - ndarray[int] event_starts not None, - ndarray[DTYPE_t] event_means not None, - ndarray[DTYPE_t] r_ref_means not None, - ndarray[DTYPE_t] r_ref_sds not None, - float z_shift, float skip_pen, float stay_pen, - int start_seq_pos, bool return_z_scores=False): - cdef int n_bases = fwd_pass.shape[0] - 1 - cdef int bandwidth = fwd_pass.shape[1] - cdef int half_bandwidth = bandwidth / 2 - cdef int n_events = event_means.shape[0] - - cdef int max_from, event_pos, band_pos, seq_pos, prev_b_pos, \ - prev_band_start, curr_band_start - cdef float max_score, pos_z_score, 
skip_score, diag_score, ref_mean, ref_sd - # set min score to total sequence times the skip penalty (times 100 for - # good measure) should not be able to obtain a worse score without - # really bad z-score fits (probably from wrong normalization if this occurs - cdef float min_score = -skip_pen * n_bases * 100 - - cdef ndarray[DTYPE_t] shifted_z_scores = np.empty(bandwidth) - cdef ndarray[DTYPE_t, ndim=2] all_shifted_z_scores + np.ndarray[DTYPE_t, ndim=2] fwd_pass not None, + np.ndarray[DTYPE_INT_t, ndim=2] fwd_pass_tb not None, + np.ndarray[DTYPE_INT_t] event_starts not None, + np.ndarray[DTYPE_t] event_means not None, + np.ndarray[DTYPE_t] r_ref_means not None, + np.ndarray[DTYPE_t] r_ref_sds not None, + DTYPE_t z_shift, DTYPE_t skip_pen, DTYPE_t stay_pen, + DTYPE_INT_t start_seq_pos, DTYPE_t mask_fill_z_score, + bool return_z_scores=False): + cdef DTYPE_INT_t n_bases = fwd_pass.shape[0] - 1 + cdef DTYPE_INT_t bandwidth = fwd_pass.shape[1] + cdef DTYPE_INT_t half_bandwidth = bandwidth / 2 + cdef DTYPE_INT_t n_events = event_means.shape[0] + + cdef DTYPE_INT_t event_pos, seq_pos, prev_band_start, curr_band_start, \ + band_pos, prev_b_pos, max_from + cdef DTYPE_t pos_z_score, ref_mean, ref_sd, max_score, skip_score, diag_score + + cdef np.ndarray[DTYPE_t] shifted_z_scores = np.empty(bandwidth, dtype=DTYPE) + cdef np.ndarray[DTYPE_t, ndim=2] all_shifted_z_scores if return_z_scores: - all_shifted_z_scores = np.empty((n_bases - start_seq_pos, bandwidth)) + all_shifted_z_scores = np.empty((n_bases - start_seq_pos, bandwidth), + dtype=DTYPE) for seq_pos in range(start_seq_pos, n_bases): # determine adaptive location for this sequence position prev_band_start = event_starts[seq_pos - 1] - curr_band_start = (prev_band_start + np.argmax(fwd_pass[seq_pos,:]) - - half_bandwidth + 1) + curr_band_start = prev_band_start + c_argmax(fwd_pass[seq_pos]) \ + - half_bandwidth + 1 if curr_band_start < prev_band_start: - curr_band_start = event_starts[seq_pos - 1] + curr_band_start = prev_band_start if curr_band_start >= n_events: # if this isn't within one of the last sequence position # the read is forced to skip to the end and will likely @@ -319,50 +340,42 @@ def c_adaptive_banded_forward_pass( shifted_z_scores[ event_pos - curr_band_start] = z_shift - pos_z_score for event_pos in range(n_events - curr_band_start, bandwidth): - shifted_z_scores[event_pos] = -20.0 + shifted_z_scores[event_pos] = mask_fill_z_score if return_z_scores: all_shifted_z_scores[seq_pos - start_seq_pos,:] = shifted_z_scores # now perform dynamic programming fill for this seq position - for band_pos in range(bandwidth): + + # set first band position to skip score if the bands have the same start + if curr_band_start == prev_band_start: + fwd_pass[seq_pos + 1, 0] = fwd_pass[seq_pos, 0] - skip_pen + fwd_pass_tb[seq_pos + 1, 0] = 1 + # else use the match score + else: + fwd_pass[seq_pos + 1, 0] = fwd_pass[ + seq_pos, curr_band_start - prev_band_start - 1] + \ + shifted_z_scores[0] + fwd_pass_tb[seq_pos + 1, 0] = 2 + + # profiling shows that >60% of the time is spent here. 
Not + # functionalized now due to function call overheads + for band_pos in range(1, bandwidth): pos_z_score = shifted_z_scores[band_pos] - event_pos = band_pos + curr_band_start - prev_b_pos = (event_pos - prev_band_start - if seq_pos > 0 else band_pos) + prev_b_pos = band_pos + curr_band_start - prev_band_start - # stay score - max_score = (fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score - if band_pos > 0 else min_score) - max_from = 0 if band_pos > 0 else -1 - # then check skip score - if 0 <= prev_b_pos < bandwidth: - skip_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - if skip_score > max_score: - max_score = skip_score - max_from = 1 - # finally check diagonal score - if 0 <= prev_b_pos - 1 < bandwidth: + # first set to stay state + max_score = fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score + max_from = 0 + # then check diagonal score + if prev_b_pos - 1 < bandwidth: diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score if diag_score > max_score: max_score = diag_score max_from = 2 - - # invalid max_from indicates that the min_score was too high - # so just compute skip and diag scores - if max_from == -1: - if prev_b_pos == 0: - max_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - max_from = 1 - elif prev_b_pos == bandwidth: - max_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score - max_from = 2 - else: + # finally check skip score (note nested check to save some ops) + if prev_b_pos < bandwidth: skip_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score - if diag_score > skip_score: - max_score = diag_score - max_from = 2 - else: + if skip_score > max_score: max_score = skip_score max_from = 1 @@ -371,4 +384,5 @@ def c_adaptive_banded_forward_pass( if return_z_scores: return all_shifted_z_scores + return diff --git a/tombo/c_helper.pyx b/tombo/c_helper.pyx index 9e62bed..a1958f6 100644 --- a/tombo/c_helper.pyx +++ b/tombo/c_helper.pyx @@ -1,21 +1,24 @@ -from numpy cimport ndarray - import numpy as np cimport numpy as np + DTYPE = np.float64 ctypedef np.float64_t DTYPE_t + +DTYPE_INT = np.int64 +ctypedef np.int64_t DTYPE_INT_t + from libc.math cimport log cdef extern from "math.h": double sqrt(double m) -def c_mean_std(ndarray[DTYPE_t] values): +def c_mean_std(np.ndarray[DTYPE_t] values): """ More efficient method to get both mean and standard deviation """ cdef DTYPE_t v_mean, v_var - cdef int idx - cdef int v_len = values.shape[0] + cdef DTYPE_INT_t idx + cdef DTYPE_INT_t v_len = values.shape[0] v_mean = 0 for idx in range(v_len): v_mean += values[idx] @@ -25,13 +28,13 @@ def c_mean_std(ndarray[DTYPE_t] values): v_var += (values[idx] - v_mean)**2 return v_mean, sqrt(v_var / v_len) -def c_new_mean_stds(ndarray[DTYPE_t] norm_signal not None, - ndarray[int] new_segs not None): - cdef int n_segs = new_segs.shape[0] - 1 - cdef ndarray[DTYPE_t] means_arr = np.empty(n_segs, dtype=DTYPE) - cdef ndarray[DTYPE_t] stds_arr = np.empty(n_segs, dtype=DTYPE) +def c_new_mean_stds(np.ndarray[DTYPE_t] norm_signal not None, + np.ndarray[DTYPE_INT_t] new_segs not None): + cdef DTYPE_INT_t n_segs = new_segs.shape[0] - 1 + cdef np.ndarray[DTYPE_t] means_arr = np.empty(n_segs, dtype=DTYPE) + cdef np.ndarray[DTYPE_t] stds_arr = np.empty(n_segs, dtype=DTYPE) cdef DTYPE_t curr_sum, curr_var, seg_mean - cdef int idx, seg_idx, seg_len + cdef DTYPE_INT_t idx, seg_idx, seg_len for idx in range(n_segs): seg_len = new_segs[idx + 1] - new_segs[idx] curr_sum = 0 @@ -45,12 +48,12 @@ def c_new_mean_stds(ndarray[DTYPE_t] 
norm_signal not None, stds_arr[idx] = sqrt(curr_var / seg_len) return means_arr, stds_arr -def c_new_means(ndarray[DTYPE_t] norm_signal not None, - ndarray[int] new_segs not None): - cdef int n_segs = new_segs.shape[0] - 1 - cdef ndarray[DTYPE_t] means_arr = np.empty(n_segs, dtype=DTYPE) +def c_new_means(np.ndarray[DTYPE_t] norm_signal not None, + np.ndarray[DTYPE_INT_t] new_segs not None): + cdef DTYPE_INT_t n_segs = new_segs.shape[0] - 1 + cdef np.ndarray[DTYPE_t] means_arr = np.empty(n_segs, dtype=DTYPE) cdef DTYPE_t curr_sum - cdef int idx, seg_idx + cdef DTYPE_INT_t idx, seg_idx for idx in range(n_segs): curr_sum = 0 for seg_idx in range(new_segs[idx], new_segs[idx + 1]): @@ -59,10 +62,10 @@ def c_new_means(ndarray[DTYPE_t] norm_signal not None, return means_arr def c_apply_outlier_thresh( - ndarray[DTYPE_t] raw_signal, float lower_lim, float upper_lim): - cdef int raw_size = raw_signal.shape[0] - cdef ndarray[DTYPE_t] clipped_signal = np.empty(raw_size, dtype=np.float) - cdef int pos + np.ndarray[DTYPE_t] raw_signal, DTYPE_t lower_lim, DTYPE_t upper_lim): + cdef DTYPE_INT_t raw_size = raw_signal.shape[0] + cdef np.ndarray[DTYPE_t] clipped_signal = np.empty(raw_size, dtype=DTYPE) + cdef DTYPE_INT_t pos cdef DTYPE_t pos_sig for pos in range(raw_size): pos_sig = raw_signal[pos] @@ -75,27 +78,29 @@ def c_apply_outlier_thresh( return clipped_signal def c_valid_cpts_w_cap( - ndarray[DTYPE_t] raw_signal, int min_base_obs, int num_cpts): - cdef ndarray[DTYPE_t] raw_cumsum = np.cumsum( + np.ndarray[DTYPE_t] raw_signal, DTYPE_INT_t min_base_obs, + DTYPE_INT_t running_stat_width, DTYPE_INT_t num_cpts): + cdef np.ndarray[DTYPE_t] raw_cumsum = np.cumsum( np.concatenate([[0.0], raw_signal])) - # get difference between all neighboring min_base_obs regions - cdef ndarray[int] candidate_poss = np.argsort(np.abs( - (2 * raw_cumsum[min_base_obs:-min_base_obs]) - - raw_cumsum[:-2*min_base_obs] - - raw_cumsum[2*min_base_obs:])).astype(np.int32)[::-1] - - cdef ndarray[int] cpts = np.empty(num_cpts, dtype=np.int32) - cpts[0] = candidate_poss[0] + min_base_obs + # get difference between all neighboring running_stat_width regions + cdef np.ndarray[DTYPE_INT_t] candidate_poss = np.argsort(np.abs( + (2 * raw_cumsum[running_stat_width:-running_stat_width]) - + raw_cumsum[:-2*running_stat_width] - + raw_cumsum[2*running_stat_width:])).astype(DTYPE_INT)[::-1] + + cdef np.ndarray[DTYPE_INT_t] cpts = np.empty(num_cpts, dtype=DTYPE_INT) + cpts[0] = candidate_poss[0] + running_stat_width blacklist_pos = set(range( candidate_poss[0] - min_base_obs + 1, candidate_poss[0] + min_base_obs)) - cdef int cand_pos - cdef int num_cands = candidate_poss.shape[0] - cdef int cand_idx = 1 - cdef int added_cpts = 1 + cdef DTYPE_INT_t cand_pos + cdef DTYPE_INT_t num_cands = candidate_poss.shape[0] - ( + 2 * running_stat_width) + cdef DTYPE_INT_t cand_idx = 1 + cdef DTYPE_INT_t added_cpts = 1 while added_cpts < num_cpts: cand_pos = candidate_poss[cand_idx] if cand_pos not in blacklist_pos: - cpts[added_cpts] = cand_pos + min_base_obs + cpts[added_cpts] = cand_pos + running_stat_width added_cpts += 1 blacklist_pos.update(range( cand_pos - min_base_obs + 1, cand_pos + min_base_obs)) @@ -105,53 +110,55 @@ def c_valid_cpts_w_cap( return cpts -def c_valid_cpts(ndarray[DTYPE_t] raw_signal, int min_base_obs): - cdef ndarray[DTYPE_t] raw_cumsum = np.cumsum( +def c_valid_cpts(np.ndarray[DTYPE_t] raw_signal, DTYPE_INT_t min_base_obs, + DTYPE_INT_t running_stat_width): + cdef np.ndarray[DTYPE_t] raw_cumsum = np.cumsum( np.concatenate([[0.0], 
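The changepoint scoring in c_valid_cpts_w_cap rests on a cumulative-sum identity: for adjacent windows of width w, the difference of the two window sums can be read off three cumsum entries. A small self-check of that identity, assuming nothing beyond NumPy::

    import numpy as np

    x = np.random.randn(100)
    w = 5
    cs = np.cumsum(np.concatenate([[0.0], x]))
    scores = np.abs(2 * cs[w:-w] - cs[:-2*w] - cs[2*w:])
    i = 17
    direct = abs(x[i:i+w].sum() - x[i+w:i+2*w].sum())
    assert np.isclose(scores[i], direct)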
raw_signal])) - # get difference between all neighboring min_base_obs regions - cdef ndarray[int] candidate_poss = np.argsort(np.abs( - (2 * raw_cumsum[min_base_obs:-min_base_obs]) - - raw_cumsum[:-2*min_base_obs] - - raw_cumsum[2*min_base_obs:])).astype(np.int32)[::-1] + # get difference between all neighboring running_stat_width regions + cdef np.ndarray[DTYPE_INT_t] candidate_poss = np.argsort(np.abs( + (2 * raw_cumsum[running_stat_width:-running_stat_width]) - + raw_cumsum[:-2*running_stat_width] - + raw_cumsum[2*running_stat_width:])).astype(DTYPE_INT)[::-1] cpts = [candidate_poss[0]] blacklist_pos = set() - cdef int pos + cdef DTYPE_INT_t pos for pos in candidate_poss[1:]: if pos not in blacklist_pos: cpts.append(pos) blacklist_pos.update(range( pos-min_base_obs+1, pos+min_base_obs+1)) - return np.array(cpts) + min_base_obs + return np.array(cpts) + running_stat_width def c_valid_cpts_w_cap_t_test( - ndarray[DTYPE_t] raw_signal, int min_base_obs, int num_cpts): - cdef int pos, idx + np.ndarray[DTYPE_t] raw_signal, DTYPE_INT_t min_base_obs, + DTYPE_INT_t running_stat_width, DTYPE_INT_t num_cpts): + cdef DTYPE_INT_t pos, idx cdef DTYPE_t pos_diff, m1, m2, var1, var2 - cdef int num_cands = raw_signal.shape[0] - (min_base_obs * 2) + cdef DTYPE_INT_t num_cands = raw_signal.shape[0] - (running_stat_width * 2) # note these will not actually be t-scores, but will be a monotonic transform # so the rank order will be the same - cdef ndarray[DTYPE_t] t_scores = np.empty(num_cands, dtype=DTYPE) + cdef np.ndarray[DTYPE_t] t_scores = np.empty(num_cands, dtype=DTYPE) for pos in range(num_cands): # compute means m1 = 0 - for idx in range(min_base_obs): + for idx in range(running_stat_width): m1 += raw_signal[pos + idx] - m1 /= min_base_obs + m1 /= running_stat_width m2 = 0 - for idx in range(min_base_obs): - m2 += raw_signal[pos + min_base_obs + idx] - m2 /= min_base_obs + for idx in range(running_stat_width): + m2 += raw_signal[pos + running_stat_width + idx] + m2 /= running_stat_width # compute sum of variances var1 = 0 - for idx in range(min_base_obs): + for idx in range(running_stat_width): pos_diff = raw_signal[pos + idx] - m1 var1 += pos_diff * pos_diff var2 = 0 - for idx in range(min_base_obs): - pos_diff = raw_signal[pos + min_base_obs + idx] - m2 + for idx in range(running_stat_width): + pos_diff = raw_signal[pos + running_stat_width + idx] - m2 var2 += pos_diff * pos_diff if var1 + var2 == 0: @@ -161,20 +168,20 @@ def c_valid_cpts_w_cap_t_test( else: t_scores[pos] = (m2 - m1) / sqrt(var1 + var2) - cdef ndarray[int] candidate_poss = np.argsort( - t_scores).astype(np.int32)[::-1] + cdef np.ndarray[DTYPE_INT_t] candidate_poss = np.argsort( + t_scores).astype(DTYPE_INT)[::-1] - cdef ndarray[int] cpts = np.empty(num_cpts, dtype=np.int32) - cpts[0] = candidate_poss[0] + min_base_obs + cdef np.ndarray[DTYPE_INT_t] cpts = np.empty(num_cpts, dtype=DTYPE_INT) + cpts[0] = candidate_poss[0] + running_stat_width blacklist_pos = set(range( candidate_poss[0] - min_base_obs + 1, candidate_poss[0] + min_base_obs)) - cdef int cand_pos - cdef int cand_idx = 1 - cdef int added_cpts = 1 + cdef DTYPE_INT_t cand_pos + cdef DTYPE_INT_t cand_idx = 1 + cdef DTYPE_INT_t added_cpts = 1 while added_cpts < num_cpts: cand_pos = candidate_poss[cand_idx] if cand_pos not in blacklist_pos: - cpts[added_cpts] = cand_pos + min_base_obs + cpts[added_cpts] = cand_pos + running_stat_width added_cpts += 1 blacklist_pos.update(range( cand_pos - min_base_obs + 1, cand_pos + min_base_obs)) @@ -185,18 +192,18 @@ def 
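The "monotonic transform" comment above can be made concrete: with equal window widths w, Welch's t statistic is the computed score times the constant sqrt(w * (w - 1)), so both quantities rank candidate changepoints identically. A hedged numeric check::

    import numpy as np

    w = 10
    left, right = np.random.randn(w), np.random.randn(w) + 1.0
    m1, m2 = left.mean(), right.mean()
    var1 = ((left - m1) ** 2).sum()   # sums of squared deviations,
    var2 = ((right - m2) ** 2).sum()  # as in the loop above
    score = (m2 - m1) / np.sqrt(var1 + var2)
    welch_t = (m2 - m1) / np.sqrt((var1 + var2) / (w * (w - 1)))
    assert np.isclose(welch_t, score * np.sqrt(w * (w - 1)))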
c_valid_cpts_w_cap_t_test(
     return cpts
 
 def c_calc_llh_ratio(
-        ndarray[DTYPE_t] reg_means,
-        ndarray[DTYPE_t] reg_ref_means, ndarray[DTYPE_t] reg_ref_vars,
-        ndarray[DTYPE_t] reg_alt_means, ndarray[DTYPE_t] reg_alt_vars):
-    cdef float ref_z_sum, ref_log_var_sum, alt_z_sum, alt_log_var_sum
-    ref_z_sum, ref_log_var_sum, alt_z_sum, alt_log_var_sum = (0.0, 0.0 ,0.0 ,0.0)
-    cdef float ref_diff, alt_diff, log_lh_ratio
-    cdef int idx
-    cdef int reg_len = reg_means.shape[0]
-    for idx in range(reg_len):
+        np.ndarray[DTYPE_t] reg_means,
+        np.ndarray[DTYPE_t] reg_ref_means, np.ndarray[DTYPE_t] reg_ref_vars,
+        np.ndarray[DTYPE_t] reg_alt_means, np.ndarray[DTYPE_t] reg_alt_vars):
+    cdef DTYPE_t ref_z_sum, ref_log_var_sum, alt_z_sum, alt_log_var_sum
+    ref_z_sum, ref_log_var_sum, alt_z_sum, alt_log_var_sum = 0.0, 0.0, 0.0, 0.0
+    cdef DTYPE_t ref_diff, alt_diff, log_lh_ratio
+    cdef DTYPE_INT_t idx
+    for idx in range(reg_means.shape[0]):
         ref_diff = reg_means[idx] - reg_ref_means[idx]
         ref_z_sum += (ref_diff * ref_diff) / reg_ref_vars[idx]
         ref_log_var_sum += log(reg_ref_vars[idx])
+
         alt_diff = reg_means[idx] - reg_alt_means[idx]
         alt_z_sum += (alt_diff * alt_diff) / reg_alt_vars[idx]
         alt_log_var_sum += log(reg_alt_vars[idx])
diff --git a/tombo/dynamic_programming.py b/tombo/dynamic_programming.py
new file mode 100644
index 0000000..80523ba
--- /dev/null
+++ b/tombo/dynamic_programming.py
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals, absolute_import
+
+from builtins import int, range, dict
+
+import numpy as np
+np.seterr(all='raise')
+
+from .c_dynamic_programming import c_base_forward_pass, c_base_traceback
+
+def forward_pass(reg_z_scores, min_obs_per_base):
+    # dynamic programming algorithm to find modeled signal to base assignment
+
+    # fill banded path with cumulative probabilities from the previous signal
+    # either in the current base or the previous base (left or diagonal left
+    # from associated plotting)
+
+    # get the first row data
+    prev_b_data, (prev_b_start, prev_b_end) = reg_z_scores[0]
+    prev_b_fwd_data = np.cumsum(prev_b_data)
+    # store number of observations since last diagonal at each position
+    # - forces forward pass to allow legal traceback paths while
+    #   enforcing the minimum observations per base threshold
+    # - should also keep the optimization from pushing poor fitting bases
+    #   to assign only an observation or two
+    # - will also use this data to traceback all reasonable paths
+    prev_b_last_diag = np.ones(prev_b_end - prev_b_start,
+                               dtype=np.int64) * min_obs_per_base
+    # first row is just a cumsum since there is no previous row
+    reg_fwd_scores = [(prev_b_fwd_data, prev_b_last_diag,
+                       (prev_b_start, prev_b_end))]
+
+    for b_data, (b_start, b_end) in reg_z_scores[1:]:
+        b_fwd_data, prev_b_last_diag = c_base_forward_pass(
+            b_data, b_start, b_end,
+            prev_b_data, prev_b_start, prev_b_end,
+            prev_b_fwd_data, prev_b_last_diag, min_obs_per_base)
+
+        # consider storing data to form traceback in one go at the
+        # end of this loop
+        reg_fwd_scores.append((
+            b_fwd_data, prev_b_last_diag, (b_start, b_end)))
+        prev_b_data, prev_b_fwd_data, prev_b_start, prev_b_end = (
+            b_data, b_fwd_data, b_start, b_end)
+
+    return reg_fwd_scores
+
+def traceback(reg_fwd_scores, min_obs_per_base):
+    # traceback along maximally likely path
+
+    # initialize array to store new segments
+    new_segs = np.empty(len(reg_fwd_scores) - 1, dtype=np.int64)
+    # get first two bases of data for lookups
+    curr_base_sig = 1
+    curr_b_data, _, (curr_start, curr_end) = reg_fwd_scores[-1]
+    next_b_data, _, (next_start,
next_end) = reg_fwd_scores[-2] + new_segs[-1] = c_base_traceback( + curr_b_data, curr_start, next_b_data, next_start, next_end, + curr_end - 1, min_obs_per_base) + for base_pos in range(len(reg_fwd_scores) - 3, -1, -1): + curr_b_data, curr_start = next_b_data, next_start + next_b_data, _, (next_start, next_end) = reg_fwd_scores[base_pos] + new_segs[base_pos] = c_base_traceback( + curr_b_data, curr_start, next_b_data, next_start, next_end, + new_segs[base_pos+1] - 1, min_obs_per_base) + + return new_segs + + +if __name__ == '__main__': + raise NotImplementedError( + 'This is a module. See commands with `tombo -h`') diff --git a/tombo/plot_commands.py b/tombo/plot_commands.py index 90f1929..624fd11 100644 --- a/tombo/plot_commands.py +++ b/tombo/plot_commands.py @@ -1,19 +1,32 @@ -import os, sys +from __future__ import division, unicode_literals, absolute_import +from future.utils import native + +from builtins import int, range, dict, map, zip + +import os +import io import re -import Queue +import sys +import queue import numpy as np import multiprocessing as mp from time import sleep +from operator import itemgetter from collections import defaultdict from itertools import repeat, groupby from pkg_resources import resource_string +if sys.version_info[0] > 2: + unicode = str + # import tombo functions -import tombo_stats as ts -import tombo_helper as th +from . import tombo_stats as ts +from . import tombo_helper as th + +from ._default_parameters import SMALLEST_PVAL, ROC_PLOT_POINTS VERBOSE = False @@ -30,13 +43,6 @@ #### ggplot via rpy2 functions #### ################################### -GG_LOAD_ERROR=( - '*' * 60 + '\nERROR: Must have rpy2, R and ' + - 'R package ggplot2 installed in order to plot. If these ' + - 'packages are installed, run:\n\t\t`python -c "import rpy2.robjects; ' + - 'from rpy2.robjects.packages import importr; ' + - 'importr(\'ggplot2\');"`\n\t to see installation issues.\n' + \ - '*' * 60 + '\n\n') try: import rpy2.robjects as r from rpy2.robjects.packages import importr @@ -44,15 +50,146 @@ # pass here and raise error when main functions are actually called pass -############################################ -#### Kmer signal distribution functions #### -############################################ + +################### +#### ROC Curve #### +################### + +def get_stat_seq(motif, pos_stat, genome_index): + if pos_stat['strand'].decode() == '+': + stat_seq = genome_index.get_seq( + pos_stat['chrm'].decode(), + pos_stat['pos'] - motif.mod_pos + 1, + pos_stat['pos'] + motif.motif_len - motif.mod_pos + 1) + else: + stat_seq = th.rev_comp(genome_index.get_seq( + pos_stat['chrm'].decode(), + pos_stat['pos'] - motif.motif_len + motif.mod_pos, + pos_stat['pos'] + motif.mod_pos)) + + return stat_seq + +def get_motif_stats( + motif, stats, genome_index, num_plot_points=ROC_PLOT_POINTS): + stat_has_mod = [] + for pos_stat in stats: + stat_seq = get_stat_seq(motif, pos_stat, genome_index) + if motif.motif_pat.match(stat_seq) is not None: + stat_has_mod.append(True) + # don't include sites that aren't at the base of interest + elif stat_seq[motif.mod_pos - 1] == motif.mod_base: + stat_has_mod.append(False) + + tp_cumsum = np.cumsum(stat_has_mod) + tp_rate = tp_cumsum / tp_cumsum[-1] + fp_cumsum = np.cumsum(np.logical_not(stat_has_mod)) + fp_rate = fp_cumsum / fp_cumsum[-1] + + precision = tp_cumsum / np.arange(1, len(stat_has_mod) + 1, dtype=float) + + # trim to number of requested points + tp_rate = tp_rate[np.linspace(0, tp_rate.shape[0] - 1, + 
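Once sites are sorted by test statistic, the accuracy statistics in get_motif_stats reduce to cumulative sums over motif-match labels. A toy illustration with made-up labels::

    import numpy as np

    site_is_mod = np.array([True, True, False, True, False, False])
    tp_rate = np.cumsum(site_is_mod, dtype=float) / site_is_mod.sum()
    fp_rate = np.cumsum(~site_is_mod, dtype=float) / (~site_is_mod).sum()
    precision = np.cumsum(site_is_mod, dtype=float) / np.arange(
        1, site_is_mod.size + 1)
    # tp_rate[-1] == fp_rate[-1] == 1.0 by construction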
num_plot_points).astype(np.int64)]
+    fp_rate = fp_rate[np.linspace(0, fp_rate.shape[0] - 1,
+                                  num_plot_points).astype(np.int64)]
+    precision = precision[np.linspace(0, precision.shape[0] - 1,
+                                      num_plot_points + 1).astype(np.int64)][1:]
+
+    return tp_rate, fp_rate, precision
+
+def parse_motif_descs(stat_motif_descs):
+    parsed_motif_descs = []
+    try:
+        for motif_desc in stat_motif_descs.split('::'):
+            raw_motif, mod_pos, mod_name = motif_desc.split(':')
+            motif = th.TomboMotif(raw_motif, int(mod_pos))
+            parsed_motif_descs.append((motif, mod_name))
+    except:
+        th._error_message_and_exit(
+            'Invalid motif descriptions format. Format descriptions as: ' +
+            '"motif:mod_pos:name[::motif2:mod_pos2:name2...]".')
+
+    return parsed_motif_descs
+
+def plot_roc(stats_fns, motif_descs, fasta_fn, min_reads, pdf_fn):
+    if len(motif_descs) != len(stats_fns):
+        th._error_message_and_exit(
+            'Must provide exactly one set of motif descriptions for ' +
+            'each statistics file.')
+
+    if VERBOSE: sys.stderr.write('Parsing motifs.\n')
+    motif_descs = [parse_motif_descs(stat_motif_descs)
+                   for stat_motif_descs in motif_descs]
+    mod_names = [mod_name for stat_mds in motif_descs
+                 for _, mod_name in stat_mds]
+    if len(mod_names) != len(set(mod_names)):
+        th._error_message_and_exit('Modified base names are not unique.')
+
+    if VERBOSE: sys.stderr.write('Parsing genome.\n')
+    genome_index = th.Fasta(fasta_fn)
+
+    if VERBOSE: sys.stderr.write('Computing accuracy statistics.\n')
+    tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], []
+    for stats_fn, stat_motif_descs in zip(stats_fns, motif_descs):
+        if not os.path.isfile(stats_fn):
+            th._warning_message('Statistics file does not exist. Skipping: ' +
+                                stats_fn)
+            continue
+        stats, stat_type = ts.parse_stats(stats_fn)
+        stats = stats[np.logical_and(
+            stats['valid_cov'] >= min_reads,
+            np.logical_or(stat_type != ts.SAMP_COMP_TXT,
+                          stats['control_cov'] >= min_reads))]
+        if stats.shape[0] == 0:
+            th._warning_message(
+                'No locations pass coverage threshold. Skipping: ' + stats_fn)
+            continue
+        stats.sort(order=str('frac'))
+
+        for motif, mod_name in stat_motif_descs:
+            if (stat_type == ts.ALT_MODEL_TXT and
+                get_stat_seq(motif, stats[0], genome_index)[motif.mod_pos - 1] !=
+                motif.mod_base):
+                th._warning_message(
+                    'Cannot assess modified base accuracy with alternative ' +
+                    'model testing to another canonical base.
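As a usage note for the parser above, a description packs motif:mod_pos:name triples separated by double colons (so names may not contain ':'). A hypothetical invocation using the usual E. coli dam/dcm motifs::

    # mod_pos is 1-based within the motif
    motif_descs = parse_motif_descs('CCWGG:2:dcm 5mC::GATC:2:dam 6mA')
    # -> [(TomboMotif('CCWGG', 2), 'dcm 5mC'),
    #     (TomboMotif('GATC', 2), 'dam 6mA')]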
Skipping: ' + + mod_name) + continue + mod_tp_rate, mod_fp_rate, mod_precision = get_motif_stats( + motif, stats, genome_index) + # print auc and average precision + auc = np.sum(mod_tp_rate[:-1] * (mod_fp_rate[1:] - mod_fp_rate[:-1])) + # TODO compute precision recall summary stat + if VERBOSE: sys.stderr.write('\t'.join(( + '', mod_name.ljust(30), 'AUC:', + '{:.4f}'.format(auc))) + '\n') + tp_rates.extend(mod_tp_rate) + fp_rates.extend(mod_fp_rate) + precisions.extend(mod_precision) + mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) + + if VERBOSE: sys.stderr.write('Plotting.\n') + rocDat = r.DataFrame({ + 'TP':r.FloatVector(tp_rates), + 'FP':r.FloatVector(fp_rates), + 'Precision':r.FloatVector(precisions), + 'Comparison':r.StrVector(mod_names_for_r)}) + r.r(resource_string(__name__, 'R_scripts/plotROC.R').decode()) + r.r('pdf("' + pdf_fn + '", height=4, width=6)') + r.globalenv[str('plotROC')](rocDat) + r.r('dev.off()') + + return + + +################################### +#### K-mer Signal Distribution #### +################################### def plot_kmer_dist( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, read_mean, upstrm_bases, dnstrm_bases, kmer_thresh, num_reads, r_struct_fn, dont_plot): - if VERBOSE: sys.stderr.write('Parsing files and tabulating k-mers.\n') kmer_width = upstrm_bases + dnstrm_bases + 1 reads_added = 0 all_kmers = defaultdict(list) @@ -64,16 +201,16 @@ def plot_kmer_dist( for r_data in cs_r_data] np.random.shuffle(files) for r_data in files: - means, seq = th.get_multiple_slots_read_centric( + r_means, r_seq = th.get_multiple_slots_read_centric( r_data, ['norm_mean', 'base']) - if means is None: continue - seq = ''.join(seq) + if r_means is None: continue + r_seq = b''.join(r_seq).decode() read_kmers = defaultdict(list) for kmer, event_mean in zip( - [seq[i:i+kmer_width] - for i in range(len(seq)-kmer_width+1)], - means[upstrm_bases:]): + [r_seq[i:i+kmer_width] + for i in range(len(r_seq)-kmer_width+1)], + r_means[upstrm_bases:]): read_kmers[kmer].append(event_mean) # if every k-mer is present (unless kmer is greater than 4) and # each k-mer has the requested number of occurences @@ -94,42 +231,41 @@ def plot_kmer_dist( break if reads_added in (0,1): - sys.stderr.write( - '****** ERROR ******\n\tOnly zero or one valid reads present. ' + + th._error_message_and_exit( + 'Only zero or one valid reads present. ' + 'Check corrected group used in resquiggle as well as ' + '[--num-kmer-threshold] parameter especially if requested ' + 'k-mer length is greater than 3 or 4. Consider setting ' + - 'to 0 for k-mer lengths > 4.\n') - sys.exit() + 'to 0 for k-mer lengths > 4.') if reads_added < num_reads: - sys.stderr.write( - '****** WARNING ******\tFewer valid reads present than ' + + th._warning_message( + 'Fewer valid reads present than ' + 'requested. Check corrected group used in ' + 'resquiggle as well as [--num-kmer-threshold] ' + 'parameter especially if requested k-mer length is ' + 'greater than 3 or 4. 
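The AUC computed above is a left-endpoint rectangle rule over the (FP, TP) curve; at ROC_PLOT_POINTS-scale sampling it tracks the trapezoidal estimate closely. A quick sketch on a toy concave curve::

    import numpy as np

    fp = np.linspace(0, 1, 1000)
    tp = np.sqrt(fp)  # toy ROC curve
    rect = np.sum(tp[:-1] * (fp[1:] - fp[:-1]))  # rule used above
    trap = np.trapz(tp, fp)
    assert abs(rect - trap) < 1e-3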
Consider setting to 0 for k-mer ' + - 'legnths > 4.\n') + 'legnths > 4.') if VERBOSE: sys.stderr.write('Preparing plot data.\n') kmer_levels = [kmer for means, kmer in sorted([ - (np.mean(zip(*means)[0]), kmer) + (np.mean(list(map(itemgetter(0), means))), kmer) for kmer, means in all_kmers.items()])] - plot_data = [ + plot_kmers, plot_bases, plot_means, plot_r_ids = zip(*[ (kmer, kmer[upstrm_bases], sig_mean, read_i) for kmer in kmer_levels - for sig_mean, read_i in all_kmers[kmer]] + for sig_mean, read_i in all_kmers[kmer]]) kmerDat = r.DataFrame({ 'Kmer':r.FactorVector( - r.StrVector(zip(*plot_data)[0]), + r.StrVector(plot_kmers), ordered=True, levels=r.StrVector(kmer_levels)), - 'Base':r.StrVector(zip(*plot_data)[1]), - 'Signal':r.FloatVector(zip(*plot_data)[2]), - 'Read':r.StrVector(zip(*plot_data)[3])}) - # df to plot kmers as tile of colors requires cowplot R package + 'Base':r.StrVector(plot_bases), + 'Signal':r.FloatVector(plot_means), + 'Read':r.StrVector(plot_r_ids)}) + # df to plot kmers as tile of colors requires gridExtra R package try: - cowplot = importr("cowplot") + importr(str('gridExtra')) baseDat = r.DataFrame({ 'Kmer':r.FactorVector( r.StrVector([kmer for kmer in kmer_levels @@ -141,9 +277,9 @@ def plot_kmer_dist( i - upstrm_bases for kmer in kmer_levels for i in range(kmer_width)])}) except: - sys.stderr.write( - '********* WARNING: Install R package `cowplot` for ' + - 'visual kmer display. Using text kmer display. ********\n') + th._warning_message( + 'Install R package `gridExtra` for ' + + 'visual kmer display. Using text kmer display.') baseDat = r.NA_Character if r_struct_fn is None: @@ -153,13 +289,13 @@ def plot_kmer_dist( dont_plot_r = r.BoolVector([dont_plot,]) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotKmerDist.R')) + r.r(resource_string(__name__, 'R_scripts/plotKmerDist.R').decode()) if not dont_plot: r.r('pdf("' + pdf_fn + '", height=7, width=10)') if read_mean: - r.globalenv['plotKmerDistWReadPath']( + r.globalenv[str('plotKmerDistWReadPath')]( kmerDat, baseDat, r_struct_fn, dont_plot_r) else: - r.globalenv['plotKmerDist']( + r.globalenv[str('plotKmerDist')]( kmerDat, baseDat, r_struct_fn, dont_plot_r) if not dont_plot: r.r('dev.off()') @@ -181,20 +317,21 @@ def get_read_correction_data( old_segs, old_align_vals, new_align_vals, events_end, new_segs) = read_corr_data - if reg_type == 'start': + if np.issubdtype(type(native(reg_type)), np.integer): + if r_strand == '+': + reg_start = int(new_segs[reg_type - r_start] - (num_obs / 2) - 1) + else: + reg_start = int((new_segs[len(new_segs) - (reg_type - r_start) - 1] + - num_obs) + (num_obs / 2)) + elif reg_type == 'start': reg_start = 0 elif reg_type == 'end': reg_start = events_end - num_obs elif reg_type == 'random': reg_start = np.random.randint(0, events_end - num_obs) else: - # reg_type should be an integer which is the raw start position - assert isinstance(reg_type, int) - if r_strand == '+': - reg_start = int(new_segs[reg_type - r_start] - (num_obs / 2) - 1) - else: - reg_start = int((new_segs[len(new_segs) - (reg_type - r_start) - 1] - - num_obs) + (num_obs / 2)) + raise NotImplementedError( + 'Invalid reg_type (int or str) to extract read correction data') norm_reg_signal, scale_values = th.normalize_raw_signal( signal_data, raw_offset + reg_start, num_obs, @@ -223,8 +360,7 @@ def get_read_correction_data( i_new_segs = iter(new_segs) align_vals = [((old_b, next(i_old_segs) if old_b != '-' else -1), (new_b, next(i_new_segs) if new_b != '-' else -1)) - 
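The zip above pairs each event mean with the k-mer centered on its base, which is why the means are offset by upstrm_bases. A toy example assuming one upstream and one downstream base (kmer_width = 3)::

    r_seq, r_means = 'ACGTA', [1.0, 2.0, 3.0, 4.0, 5.0]
    upstrm_bases, kmer_width = 1, 3
    pairs = list(zip([r_seq[i:i + kmer_width]
                      for i in range(len(r_seq) - kmer_width + 1)],
                     r_means[upstrm_bases:]))
    # pairs == [('ACG', 2.0), ('CGT', 3.0), ('GTA', 4.0)]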
for old_b, new_b in zip( - old_align_vals, new_align_vals)] + for old_b, new_b in zip(old_align_vals, new_align_vals)] reg_align_vals = [ ((old_b, old_pos, old_pos in old_reg_segs), (new_b, new_pos, new_pos in new_reg_segs)) @@ -253,25 +389,27 @@ def get_read_correction_data( old_is_del.append(False) old_is_mismatch.append(old_b != new_b) - old_bases, old_reg_segs = zip(*[ - (b, pos) for b, pos, in_reg in zip(*reg_align_vals)[0] - if in_reg]) if ( - len(reg_align_vals) > 0 and - sum(zip(*zip(*reg_align_vals)[0])[2]) > 0) else ([], []) - new_bases, new_reg_segs = zip(*[ - (b, pos) for b, pos, in_reg in zip(*reg_align_vals)[1] - if in_reg]) if ( - len(reg_align_vals) > 0 and - sum(zip(*zip(*reg_align_vals)[1])[2]) > 0) else ([], []) + old_bases, old_reg_segs = [], [] + if (len(reg_align_vals) > 0 and + sum(map(itemgetter(2), map(itemgetter(0), reg_align_vals))) > 0): + old_bases, old_reg_segs = zip(*[ + (b, pos) for b, pos, in_reg in map(itemgetter(0), reg_align_vals) + if in_reg]) + new_bases, new_reg_segs = [], [] + if (len(reg_align_vals) > 0 and + sum(map(itemgetter(2), map(itemgetter(1), reg_align_vals))) > 0): + new_bases, new_reg_segs = zip(*[ + (b, pos) for b, pos, in_reg in map(itemgetter(1), reg_align_vals) + if in_reg]) # bring positions to zero start if aligning multiple sequences - sig_range = range(reg_start, reg_start + num_obs) + sig_range = list(range(reg_start, reg_start + num_obs)) if start_at_zero: old_reg_segs = [ old_seg_pos - reg_start for old_seg_pos in old_reg_segs] new_reg_segs = [ new_seg_pos - reg_start for new_seg_pos in new_reg_segs] - sig_range = range(0, num_obs) + sig_range = list(range(0, num_obs)) old_dat = { 'Position':r.FloatVector(old_reg_segs), @@ -511,11 +649,11 @@ def get_raw_signal_data( except: if not_warned: not_warned = False - sys.stderr.write( - '********** WARNING *********\n\tGenome resolved ' + - 'raw signal could not be retrieved for some reads. ' + - 'Ensure that reads have been re-squiggled and that ' + - 'all data slot corresponding accordingly.\n') + th._warning_message( + 'Genome resolved raw signal could not be retrieved ' + + 'for some reads. 
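The iterator trick above pairs aligned bases with their segment starts while letting gap characters fall through to a -1 placeholder; a compact sketch with made-up values::

    old_bases = ['A', '-', 'C']
    i_old_segs = iter([10, 20])
    paired = [(b, next(i_old_segs) if b != '-' else -1) for b in old_bases]
    # paired == [('A', 10), ('-', -1), ('C', 20)]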
Ensure that reads have been ' + + 're-squiggled and that all data slot corresponding ' + + 'accordingly.') continue for base_i, (b_start, b_end) in enumerate(zip( overlap_seg_data[:-1], overlap_seg_data[1:])): @@ -525,7 +663,7 @@ def get_raw_signal_data( Signal.extend(r_sig[b_start-overlap_seg_data[0]: b_end-overlap_seg_data[0]]) Read.extend(list(repeat( - str(r_num) + '_' + group_num, b_end - b_start))) + unicode(r_num) + '_' + group_num, b_end - b_start))) Strand.extend(list(repeat( FWD_STRAND if r_data.strand == '+' else REV_STRAND, b_end - b_start))) @@ -552,14 +690,17 @@ def get_plot_types_data(plot_args, quant_offset=0): def get_base_r_data(all_reg_data, zero_start=False, is_rna=False): BaseStart, Bases, BaseRegion, BaseStrand = [], [], [], [] for reg_data in all_reg_data: + # skip regions without sequence data + if reg_data.seq is None: + continue if reg_data.strand == '+' or reg_data.strand is None: for i, base in enumerate(reg_data.seq): if is_rna and base == 'T': base = 'U' if zero_start: - BaseStart.append(str(i)) + BaseStart.append(unicode(i)) else: - BaseStart.append(str(i + reg_data.start)) + BaseStart.append(unicode(i + reg_data.start)) Bases.append(base) BaseRegion.append(reg_data.reg_id) BaseStrand.append(FWD_STRAND) @@ -569,9 +710,9 @@ def get_base_r_data(all_reg_data, zero_start=False, is_rna=False): if is_rna and base == 'T': base = 'U' if zero_start: - BaseStart.append(str(i)) + BaseStart.append(unicode(i)) else: - BaseStart.append(str(i + reg_data.start)) + BaseStart.append(unicode(i + reg_data.start)) Bases.append(base) BaseRegion.append(reg_data.reg_id) BaseStrand.append(REV_STRAND) @@ -589,14 +730,14 @@ def get_model_r_data(all_reg_model_data): Position, Strand, Mean, SD, Region = [], [], [], [], [] for reg_id, strand, fwd_model_data, rev_model_data in all_reg_model_data: if strand == '+' or strand is None: - for pos, (base_model_mean, base_model_sd) in fwd_model_data: + for pos, base_model_mean, base_model_sd in fwd_model_data: Position.append(pos) Strand.append(FWD_STRAND) Mean.append(base_model_mean) SD.append(base_model_sd) Region.append(reg_id) if strand == '-' or strand is None: - for pos, (base_model_mean, base_model_sd) in rev_model_data: + for pos, base_model_mean, base_model_sd in rev_model_data: Position.append(pos) Strand.append(REV_STRAND) Mean.append(base_model_mean) @@ -646,6 +787,8 @@ def get_reg_r_stats(all_reg_stats, are_pvals=True): def plot_corrections( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, reg_type, num_obs, num_reads): + th._warning_message('The plot_correction command may be deprecated in ' + + 'future versions of Tombo.') if VERBOSE: sys.stderr.write('Preparing plot data.\n') OldSegDat, NewSegDat, SigDat, DiffDat = [], [], [], [] raw_read_coverage = th.parse_fast5s( @@ -666,26 +809,25 @@ def plot_corrections( if len(OldSegDat) >= num_reads: break if len(OldSegDat) == 0: - sys.stderr.write( - 'ERROR: No reads were able to be processed. This command is ' + + th._error_message_and_exit( + 'No reads were able to be processed. This command is ' + 'only applicable to reads processed with event_resquiggle. ' + 'Also check that --corrected-group and --basecall-subgroup ' + - 'match the event_resquiggle command.\n') - sys.exit() + 'match the event_resquiggle command.') if VERBOSE and len(OldSegDat) < num_reads: - sys.stderr.write( - 'WARNING: Fewer reads than requested were able to ' + + th._warning_message( + 'Fewer reads than requested were able to ' + 'be processed. 
Likely too few reads provided or ' + - 'those provided were not corrected.\n') + 'those provided were not corrected.') OldSegDat = r.DataFrame.rbind(*OldSegDat) NewSegDat = r.DataFrame.rbind(*NewSegDat) SigDat = r.DataFrame.rbind(*SigDat) DiffDat = r.DataFrame.rbind(*DiffDat) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotReadCorr.R')) + r.r(resource_string(__name__, 'R_scripts/plotReadCorr.R').decode()) r.r('pdf("' + pdf_fn + '", height=7, width=11)') - r.globalenv['plotReadCorr'](OldSegDat, NewSegDat, SigDat, DiffDat) + r.globalenv[str('plotReadCorr')](OldSegDat, NewSegDat, SigDat, DiffDat) r.r('dev.off()') return @@ -694,37 +836,37 @@ def plot_multi_corrections( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, num_reads_per_plot, num_regions, num_obs, include_orig_bcs, genome_locations): + th._warning_message('The plot_multi_correction command may be deprecated ' + + 'in future versions of Tombo.') num_regions = num_regions if num_regions % 2 == 0 else \ num_regions + 1 raw_read_coverage = th.parse_fast5s( f5_dirs1, corrected_group, basecall_subgroups) read_coverage = th.get_coverage(raw_read_coverage) - coverage_regions = [] - for chrom_strand, chrom_coverage in read_coverage.items(): - chrm_coverage_regions = [ - (x, len(list(y))) for x, y in groupby(chrom_coverage)] - chrm_reg_starts = np.cumsum(np.insert( - zip(*chrm_coverage_regions)[1], 0, 0)) - coverage_regions.extend(zip( - zip(*chrm_coverage_regions)[0], - [start + (reg_len / 2) for start, reg_len in - zip(chrm_reg_starts, zip(*chrm_coverage_regions)[1])], - repeat(chrom_strand[0]), repeat(chrom_strand[1]))) if genome_locations is None: + coverage_regions = [] + for (chrm, strand), cs_coverage in read_coverage.items(): + reg_covs, reg_lens = zip(*[ + (x, len(list(y))) for x, y in groupby(cs_coverage)]) + coverage_regions.extend(zip( + reg_covs, [start + (reg_len // 2) for start, reg_len in + zip(np.cumsum(np.insert(reg_lens, 0, 0)), reg_lens)], + repeat(chrm), repeat(strand))) + # randomly select regions with at least num_reads_to_plot regions coverage_regions = [ (chrm, reg_center, strand) for stat, reg_center, chrm, strand in coverage_regions if stat >= num_reads_per_plot] np.random.shuffle(coverage_regions) - plot_locs = zip( + plot_locs = list(zip( ['{:03d}'.format(rn) for rn in range(num_regions)], - coverage_regions[:num_regions]) + coverage_regions[:num_regions])) if len(plot_locs) < num_regions: - sys.stderr.write( - '*' * 60 + '\nWarning: Fewer regions contain minimum ' + - 'number of reads than requested.\n' + '*' * 60 + '\n') + th._warning_message( + 'Fewer regions contain minimum ' + + 'number of reads than requested.') else: if VERBOSE: sys.stderr.write('Parsing genome locations.\n') parsed_locations = [] @@ -747,15 +889,12 @@ def plot_multi_corrections( if (chrm, strand) in read_coverage and read_coverage[(chrm, strand)][start] > 0] if len(plot_locs) < len(parsed_locations): - sys.stderr.write( - '*' * 60 + '\nWarning: Some regions did not contain ' + - 'read coverage.\n' + '*' * 60 + '\n') + th._warning_message( + 'Some regions did not contain read coverage.') if len(plot_locs) == 0: - sys.stderr.write( - '*' * 60 + '\nERROR: No regions contain minimum ' + - 'number of reads.\n' + '*' * 60 + '\n') - sys.exit() + th._error_message_and_exit( + 'No regions contain minimum number of reads.') if VERBOSE: sys.stderr.write('Preparing plot data.\n') OldSegDat, NewSegDat, SigDat = [], [], [] @@ -798,12 +937,12 @@ def plot_multi_corrections( SigDat = 
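The groupby/cumsum rewrite above collapses a per-position coverage array into (coverage, length) runs and takes each run's start plus half its length as a representative center. A toy run::

    import numpy as np
    from itertools import groupby

    cs_coverage = np.array([0, 0, 5, 5, 5, 5, 2, 2])
    reg_covs, reg_lens = zip(*[
        (x, len(list(y))) for x, y in groupby(cs_coverage)])
    starts = np.cumsum(np.insert(reg_lens, 0, 0))
    centers = [start + (reg_len // 2)
               for start, reg_len in zip(starts, reg_lens)]
    # reg_covs == (0, 5, 2); centers == [1, 4, 7]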
r.DataFrame.rbind(*SigDat) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotMultiReadCorr.R')) + r.r(resource_string(__name__, 'R_scripts/plotMultiReadCorr.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') if include_orig_bcs and OldSegDat is not None: - r.globalenv['plotMultiReadCorr'](OldSegDat, NewSegDat, SigDat) + r.globalenv[str('plotMultiReadCorr')](OldSegDat, NewSegDat, SigDat) else: - r.globalenv['plotMultiReadCorrNoOrig'](NewSegDat, SigDat) + r.globalenv[str('plotMultiReadCorrNoOrig')](NewSegDat, SigDat) r.r('dev.off()') return @@ -840,11 +979,11 @@ def get_plots_titles(all_reg_data, all_reg_data2, overplot_type, if all_reg_data2 is None: if int_i.strand is None: reg_title = int_i.chrm + ' ' + int_i.reg_text + \ - " ::: Coverage: " + str(r_cov[0]) + r_ovp[0] + \ - " + " + str(r_cov[1]) + r_ovp[1] + " -" + " ::: Coverage: " + unicode(r_cov[0]) + r_ovp[0] + \ + " + " + unicode(r_cov[1]) + r_ovp[1] + " -" else: - cov_str = str(r_cov[0]) + r_ovp[0] if int_i.strand == '+' \ - else str(r_cov[1]) + r_ovp[1] + cov_str = unicode(r_cov[0]) + r_ovp[0] if int_i.strand == '+' \ + else unicode(r_cov[1]) + r_ovp[1] reg_title = int_i.chrm + ( ":" + int_i.strand if int_i.strand else '') + \ ' ' + int_i.reg_text + " ::: Coverage: " + cov_str @@ -857,17 +996,17 @@ def get_plots_titles(all_reg_data, all_reg_data2, overplot_type, titles.append( int_i.chrm + ' ' + int_i.reg_text + " ::: Coverage: Sample (Red): " + - str(r_cov[0]) + r_ovp[0] + " + " + - str(r_cov[1]) + r_ovp[1] + " -; Control (Black): " + - str(r_cov[2]) + r_ovp[2] + " + " + - str(r_cov[3]) + r_ovp[3] + " -") + unicode(r_cov[0]) + r_ovp[0] + " + " + + unicode(r_cov[1]) + r_ovp[1] + " -; Control (Black): " + + unicode(r_cov[2]) + r_ovp[2] + " + " + + unicode(r_cov[3]) + r_ovp[3] + " -") else: cov_str = ( - 'Sample (Red): ' + str(r_cov[0]) + r_ovp[0] + - '; Control (Black): ' + str(r_cov[2]) + r_ovp[2] + 'Sample (Red): ' + unicode(r_cov[0]) + r_ovp[0] + + '; Control (Black): ' + unicode(r_cov[2]) + r_ovp[2] ) if int_i.strand == '+' else ( - 'Sample (Red): ' + str(r_cov[1]) + r_ovp[1] + - '; Control (Black): ' + str(r_cov[3]) + r_ovp[3]) + 'Sample (Red): ' + unicode(r_cov[1]) + r_ovp[1] + + '; Control (Black): ' + unicode(r_cov[3]) + r_ovp[3]) titles.append( int_i.chrm + ":" + int_i.strand + ' ' + int_i.reg_text + " ::: Coverage: " + cov_str) @@ -884,10 +1023,7 @@ def plot_single_sample( if VERBOSE: sys.stderr.write('Preparing plot data.\n') all_reg_data = th.get_region_reads(plot_intervals, raw_read_coverage) if len(all_reg_data) == 0: - sys.stderr.write( - '*' * 60 + '\nERROR: No reads in any selected regions.\n' - + '*' * 60 + '\n') - sys.exit() + th._error_message_and_exit('No reads in any selected regions.') rna = th.is_rna(raw_read_coverage) Titles, plot_types = get_plots_titles( @@ -898,10 +1034,10 @@ def plot_single_sample( (all_reg_data, plot_types, overplot_thresh, 'Group1')) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotSingleRun.R')) + r.r(resource_string(__name__, 'R_scripts/plotSingleRun.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') - r.globalenv['plotSingleRun'](SignalData, QuantData, BoxData, - EventData, BasesData, Titles) + r.globalenv[str('plotSingleRun')](SignalData, QuantData, BoxData, + EventData, BasesData, Titles) r.r('dev.off()') return @@ -911,25 +1047,20 @@ def filter_and_merge_group_regs(g1_data, g2_data): for r1, r2 in zip(g1_data, g2_data): both_reads = r1.reads + r2.reads if len(both_reads) > 
0:
-            merged_reg_data.append(th.intervalData(
-                r1.reg_id, r1.chrm, r1.start, r1.end,
-                r1.strand, r1.reg_text, both_reads))
+            merged_reg_data.append(r1._replace(reads=both_reads))
             filt_g1.append(r1)
             filt_g2.append(r2)
         else:
             both_no_cov.append(':'.join(map(str, (
-                r1.chrm, str(r1.start) + '-' + str(r1.end), r1.strand))))
+                r1.chrm, unicode(r1.start) + '-' + unicode(r1.end),
+                r1.strand))))
 
     if len(both_no_cov) > 0 and VERBOSE:
-        sys.stderr.write(
-            '*' * 60 + '\nWarning: Some regions include no reads: ' +
-            '\t'.join(both_no_cov) + '\n' + '*' * 60 + '\n')
+        th._warning_message(
+            'Some regions include no reads: ' + '\t'.join(both_no_cov))
 
     if len(merged_reg_data) == 0:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: No reads in any selected regions.\n'
-            + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit('No reads in any selected regions.')
 
     return merged_reg_data, filt_g1, filt_g2
 
@@ -960,9 +1091,9 @@ def plot_two_samples(
         (all_reg_data2, plot_types, overplot_thresh, 'Group2'), 0.5)
 
     if VERBOSE: sys.stderr.write('Plotting.\n')
-    r.r(resource_string(__name__, 'R_scripts/plotGroupComp.R'))
+    r.r(resource_string(__name__, 'R_scripts/plotGroupComp.R').decode())
     r.r('pdf("' + pdf_fn + '", height=5, width=11)')
-    r.globalenv['plotGroupComp'](
+    r.globalenv[str('plotGroupComp')](
         r.DataFrame.rbind(SignalData1, SignalData2),
         r.DataFrame.rbind(QuantData1, QuantData2),
         r.DataFrame.rbind(BoxData1, BoxData2),
@@ -972,7 +1103,7 @@
 
     if seqs_fn is not None:
         if VERBOSE: sys.stderr.write('Outputting region sequences.\n')
-        with open(seqs_fn, 'w') as seqs_fp:
+        with io.open(seqs_fn, 'wt') as seqs_fp:
             for int_i in merged_reg_data:
                 # get the interval from the base data struct
                 reg_seq = int_i.seq if int_i.strand == '+' else th.rev_comp(
@@ -990,26 +1121,24 @@ def filter_reads(reads, int_start, int_end):
         """
         return [r_data for r_data in reads
                 if not (r_data.start >= int_end or r_data.end <= int_start)]
-    kmer_ref, upstrm_bases, _, _ = ts.parse_tombo_model(tb_model_fn)
+    std_ref = ts.TomboModel(tb_model_fn)
     # compute kmer values to make strand specific calculations easier
-    kmer_width = len(next(kmer_ref.iterkeys()))
-    dnstrm_bases = kmer_width - upstrm_bases - 1
-    expand_width = max(upstrm_bases, dnstrm_bases)
+    dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1
+    expand_width = max(std_ref.central_pos, dnstrm_bases)
     filt_width = expand_width if min_reg_overlap is None else \
                  expand_width + min_reg_overlap
     if alt_model_fn is not None:
-        alt_ref, alt_upstrm_bases, _, _ = ts.parse_tombo_model(
-            alt_model_fn)
-        assert (alt_upstrm_bases == upstrm_bases and
-                kmer_width == len(next(alt_ref.iterkeys()))), (
-            '********* ERROR *********\n\tStandard model not based on ' +
-            'the same kmer position as alternative model.')
+        alt_ref = ts.TomboModel(alt_model_fn)
+        if (alt_ref.central_pos != std_ref.central_pos or
+            alt_ref.kmer_width != std_ref.kmer_width):
+            th._error_message_and_exit(
+                'Standard model not based on the same kmer position ' +
+                'as alternative model.')
 
     # expand regions to get kmers at first and last positions
-    expanded_intervals = [
-        th.intervalData(p_int.reg_id, p_int.chrm, p_int.start - expand_width,
-                        p_int.end + expand_width, p_int.strand, p_int.reg_text)
-        for p_int in plot_intervals]
+    expanded_intervals = [p_int._replace(start=p_int.start - expand_width,
+                                         end=p_int.end + expand_width)
+                          for p_int in plot_intervals]
     # get reads and region sequence
     expanded_intervals = th.get_region_reads(
         expanded_intervals, raw_read_coverage)
@@ -1017,12 +1146,11 @@ def
filter_reads(reads, int_start, int_end): rev_expand_seqs = [th.rev_comp(int_i.seq) for int_i in expanded_intervals] # convert back to original plot_intervals with seq from exanded intervals all_reg_data = [ - th.intervalData( - int_i.reg_id, int_i.chrm, int_i.start + expand_width, - int_i.end - expand_width, int_i.strand, int_i.reg_text, - filter_reads(int_i.reads, int_i.start + filt_width, - int_i.end - filt_width), - int_i.seq[expand_width:-expand_width]) + int_i._replace(start=int_i.start + expand_width, + end=int_i.end - expand_width, + reads=filter_reads(int_i.reads, int_i.start + filt_width, + int_i.end - filt_width), + seq=int_i.seq[expand_width:-expand_width]) for int_i in expanded_intervals] all_reg_model_data, all_reg_alt_model_data = [], [] @@ -1030,41 +1158,44 @@ def filter_reads(reads, int_start, int_end): all_reg_data, expand_seqs, rev_expand_seqs): clipped_reg_seq = reg_seq clipped_rev_seq = rev_seq - if upstrm_bases > dnstrm_bases: - clipped_reg_seq = reg_seq[:dnstrm_bases-upstrm_bases] - clipped_rev_seq = rev_seq[:dnstrm_bases-upstrm_bases] - elif dnstrm_bases > upstrm_bases: - clipped_reg_seq = reg_seq[dnstrm_bases-upstrm_bases:] - clipped_rev_seq = rev_seq[dnstrm_bases-upstrm_bases:] + if std_ref.central_pos > dnstrm_bases: + clipped_reg_seq = reg_seq[:dnstrm_bases-std_ref.central_pos] + clipped_rev_seq = rev_seq[:dnstrm_bases-std_ref.central_pos] + elif dnstrm_bases > std_ref.central_pos: + clipped_reg_seq = reg_seq[dnstrm_bases-std_ref.central_pos:] + clipped_rev_seq = rev_seq[dnstrm_bases-std_ref.central_pos:] + fwd_kmers = [ + clipped_reg_seq[i:i + std_ref.kmer_width] + for i in range(len(clipped_reg_seq) - std_ref.kmer_width + 1)] + rev_kmers = [ + clipped_rev_seq[i:i + std_ref.kmer_width] + for i in range(len(clipped_rev_seq) - std_ref.kmer_width + 1)] all_reg_model_data.append(( reg_data.reg_id, reg_data.strand, - [(reg_data.start + pos, kmer_ref[''.join(bs)]) - for pos, bs in enumerate(zip(*[ - clipped_reg_seq[i:] for i in range(kmer_width)])) - if not any(b in ('-', 'N') for b in bs)], - [(reg_data.end - pos - 1, kmer_ref[''.join(bs)]) - for pos, bs in enumerate(zip(*[ - clipped_rev_seq[i:] for i in range(kmer_width)])) - if not any(b in ('-', 'N') for b in bs)])) + [(reg_data.start + pos, std_ref.means[kmer], + std_ref.sds[kmer]) + for pos, kmer in enumerate(fwd_kmers) if not th.invalid_seq(kmer)], + [(reg_data.end - pos - 1, std_ref.means[kmer], + std_ref.sds[kmer]) for pos, kmer in enumerate(rev_kmers) + if not th.invalid_seq(kmer)])) # if alternative model is supplied add info if alt_model_fn is not None: all_reg_alt_model_data.append(( reg_data.reg_id, reg_data.strand, - [(reg_data.start + pos, alt_ref[''.join(bs)]) - for pos, bs in enumerate(zip(*[ - clipped_reg_seq[i:] for i in range(kmer_width)])) - if not any(b in ('-', 'N') for b in bs)], - [(reg_data.end - pos - 1, alt_ref[''.join(bs)]) - for pos, bs in enumerate(zip(*[ - clipped_rev_seq[i:] for i in range(kmer_width)])) - if not any(b in ('-', 'N') for b in bs)])) + [(reg_data.start + pos, alt_ref.means[kmer], + alt_ref.sds[kmer]) + for pos, kmer in enumerate(fwd_kmers) + if not th.invalid_seq(kmer)], + [(reg_data.end - pos - 1, alt_ref.means[kmer], + alt_ref.sds[kmer]) + for pos, kmer in enumerate(rev_kmers) + if not th.invalid_seq(kmer)])) return all_reg_data, all_reg_model_data, all_reg_alt_model_data def plot_motif_centered_with_stats( raw_read_coverage1, raw_read_coverage2, plot_intervals, - stat_locs, overplot_thresh, pdf_fn, frac_order, - tb_model_fn, alt_model_fn=None): + stat_locs, 
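get_reg_kmers now leans on namedtuple._replace to widen intervals for k-mer context and clip them back afterwards, rather than rebuilding th.intervalData field by field. A minimal sketch of the pattern with a hypothetical Interval type::

    from collections import namedtuple

    Interval = namedtuple('Interval', ('chrm', 'start', 'end'))
    p_int = Interval('chr1', 100, 200)
    expand_width = 2
    expanded = p_int._replace(start=p_int.start - expand_width,
                              end=p_int.end + expand_width)
    # expanded == Interval(chrm='chr1', start=98, end=202)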
overplot_thresh, pdf_fn, tb_model_fn, alt_model_fn=None): if VERBOSE: sys.stderr.write('Preparing plot data.\n') ModelData = r.r('NULL') @@ -1109,16 +1240,21 @@ def plot_motif_centered_with_stats( BasesData = get_base_r_data(merged_reg_data) + plot_poss, plot_stats = zip(*stat_locs) # stat lists StatsData = r.DataFrame({ - 'Position':r.FloatVector(zip(*stat_locs)[0]), - 'Stat':r.FloatVector(zip(*stat_locs)[1])}) + 'Position':r.FloatVector(plot_poss), + 'Stat':r.FloatVector(plot_stats)}) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotMotifStats.R')) + r.r(resource_string(__name__, 'R_scripts/plotMotifStats.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=8)') - r.globalenv['plotMotifStats']( - SignalData, BasesData, StatsData, frac_order, ModelData) + if alt_model_fn is None: + r.globalenv[str('plotMotifStats')]( + SignalData, BasesData, StatsData, ModelData) + else: + r.globalenv[str('plotMotifStats')]( + SignalData, BasesData, StatsData, ModelData, AltModelData) r.r('dev.off()') return @@ -1143,19 +1279,21 @@ def plot_model_single_sample( (all_reg_data, plot_types, overplot_thresh, 'Group1')) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotModelComp.R')) + r.r(resource_string(__name__, 'R_scripts/plotModelComp.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') if alt_model_fn is None: - r.globalenv['plotModelComp'](SignalData, QuantData, BoxData, - EventData, BasesData, Titles, ModelData) + r.globalenv[str('plotModelComp')]( + SignalData, QuantData, BoxData, EventData, + BasesData, Titles, ModelData) else: - r.globalenv['plotModelComp'](SignalData, QuantData, BoxData, EventData, - BasesData, Titles, ModelData, AltModelData) + r.globalenv[str('plotModelComp')]( + SignalData, QuantData, BoxData, EventData, + BasesData, Titles, ModelData, AltModelData) r.r('dev.off()') if seqs_fn is not None: if VERBOSE: sys.stderr.write('Outputting region seqeuences.\n') - with open(seqs_fn, 'w') as seqs_fp: + with io.open(seqs_fn, 'wt') as seqs_fp: for int_i in all_reg_data: reg_seq = int_i.seq if int_i.strand == '+' else th.rev_comp( int_i.seq) @@ -1166,84 +1304,21 @@ def plot_model_single_sample( return def plot_per_read_modification( - plot_intervals, raw_read_coverage, tb_model_fn, alt_model_fn, pdf_fn, - fm_lag, num_reads, box_center): - if alt_model_fn is not None: - alt_ref, upstrm_bases, alt_base, _ = ts.parse_tombo_model(alt_model_fn) - dnstrm_bases = len(next(alt_ref.iterkeys())) - upstrm_bases - 1 - - def calc_read_lh(r_means, alt_base_locs, begin_lag, end_lag, reg_ref_means, - reg_ref_vars, reg_alt_means, reg_alt_vars): - r_stats = np.empty(r_means.shape) - r_stats[:] = np.NAN - for alt_base_pos in alt_base_locs: - alt_pos = alt_base_pos + end_lag - r_stats[alt_pos] = ts.calc_llh_ratio( - r_means[alt_pos-end_lag:alt_pos+begin_lag], - reg_ref_means[alt_pos-end_lag:alt_pos+begin_lag], - reg_ref_vars[alt_pos-end_lag:alt_pos+begin_lag], - reg_alt_means[alt_pos-end_lag:alt_pos+begin_lag], - reg_alt_vars[alt_pos-end_lag:alt_pos+begin_lag]) - - return r_stats - + all_reg_data, all_reg_stats, are_pvals, box_center, pdf_fn): if VERBOSE: sys.stderr.write('Preparing plot data.\n') - # get reads overlapping each region along with all kmers - min_reg_overlap = (fm_lag * 2) + 1 if fm_lag > 0 else None - all_reg_data, all_reg_model_data, all_reg_alt_model_data = get_reg_kmers( - tb_model_fn, plot_intervals, raw_read_coverage, min_reg_overlap, - alt_model_fn) - all_reg_stats = [] - for reg_i, (reg_data, 
(reg_id, strand, fwd_model_data, - rev_model_data)) in enumerate(zip( - all_reg_data, all_reg_model_data)): - model_data = (fwd_model_data if reg_data.strand == "+" else - rev_model_data[::-1]) - ref_means, ref_sds = zip(*zip(*model_data)[1]) - reg_events = get_reg_events( - reg_data.reads, reg_data.start, reg_data.end, reg_data.strand, - read_rows=True, num_reads=num_reads) - if alt_model_fn is None: - with np.errstate(invalid='ignore'): - pvals = ts.z_score_to_p_value( - -np.abs(reg_events - ref_means) / ref_sds) * 2.0 - if fm_lag > 0: - all_reg_stats.append(( - reg_data.reg_id, - ts.calc_window_fishers_method(pvals, fm_lag))) - else: - all_reg_stats.append((reg_data.reg_id, pvals)) - else: - fwd_alt_model, rev_alt_model = all_reg_alt_model_data[reg_i][2:] - alt_model = (fwd_alt_model if reg_data.strand == "+" else - rev_alt_model[::-1]) - begin_lag = upstrm_bases if reg_data.strand =='+' else dnstrm_bases - end_lag = dnstrm_bases if reg_data.strand =='+' else upstrm_bases - alt_means, alt_sds = zip(*zip(*alt_model)[1]) - ref_vars = np.square(ref_sds) - alt_vars = np.square(alt_sds) - alt_base_locs = [alt_pos.start() for alt_pos in re.finditer( - alt_base, reg_data.seq[end_lag:-begin_lag])] - all_reg_stats.append((reg_data.reg_id, np.apply_along_axis( - calc_read_lh, 1, reg_events, - alt_base_locs, begin_lag, end_lag, ref_means, - ref_vars, alt_means, alt_vars))) - - are_pvals = alt_model_fn is None StatData, OrdData = get_reg_r_stats(all_reg_stats, are_pvals) BasesData = get_base_r_data(all_reg_data, zero_start=True) if VERBOSE: sys.stderr.write('Plotting.\n') - r.r(resource_string(__name__, 'R_scripts/plotPerReadStats.R')) + r.r(resource_string(__name__, 'R_scripts/plotPerReadStats.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') - r.globalenv['plotPerReadStats'](StatData, OrdData, BasesData, box_center, - are_pvals) + r.globalenv[str('plotPerReadStats')]( + StatData, OrdData, BasesData, box_center, are_pvals) r.r('dev.off()') return - ################################# #### Plot processing methods #### ################################# @@ -1263,10 +1338,10 @@ def get_valid_model_fns( plot_default_alt, raw_read_coverage) if f5_dirs2 is not None and tb_model_fn is not None: - sys.stderr.write( - '********* WARNING ******** Both a second set of FAST5s and a ' + - 'tombo model were provided. Two samples with model plotting is not ' + - 'currently available. Models requested will be ignored.\n') + th._warning_message( + 'Both a second set of FAST5s and a tombo model were ' + + 'provided. Two samples with model plotting is not ' + + 'currently available. 
Models requested will be ignored.') return tb_model_fn, alt_model_fn @@ -1283,14 +1358,12 @@ def plot_max_coverage( raw_read_coverage, f5_dirs2) if f5_dirs2 is None: coverage_regions = [] - for (chrom, strand), chrom_coverage in read_coverage.items(): - chrm_coverage_regions = [ - (x, len(list(y))) for x, y in groupby(chrom_coverage)] + for (chrm, strand), cs_coverage in read_coverage.items(): + reg_covs, reg_lens = zip(*[ + (x, len(list(y))) for x, y in groupby(cs_coverage)]) coverage_regions.extend(zip( - zip(*chrm_coverage_regions)[0], - np.cumsum(np.insert( - zip(*chrm_coverage_regions)[1], 0, 0)), - repeat(chrom), repeat(strand))) + reg_covs, np.cumsum(np.insert(reg_lens, 0, 0)), + repeat(chrm), repeat(strand))) # max coverage plots both strands coverage plot_intervals = [ @@ -1312,28 +1385,26 @@ def plot_max_coverage( read_coverage2 = th.get_coverage(raw_read_coverage2) coverage_regions = [] # only process chromosomes in both read groups - for (chrom, strand) in set(read_coverage).intersection( + for (chrm, strand) in set(read_coverage).intersection( read_coverage2): - chrom_coverage = read_coverage[(chrom, strand)] - chrom_coverage2 = read_coverage2[(chrom, strand)] - if chrom_coverage.shape[0] >= chrom_coverage2.shape[0]: - merged_chrom_cov = np.pad( - chrom_coverage2, (0,chrom_coverage.shape[0] - - chrom_coverage2.shape[0]), - 'constant', constant_values=0) + chrom_coverage + chrm_coverage = read_coverage[(chrm, strand)] + chrm_coverage2 = read_coverage2[(chrm, strand)] + if chrm_coverage.shape[0] >= chrm_coverage2.shape[0]: + merged_chrm_cov = np.pad( + chrm_coverage2, (0, chrm_coverage.shape[0] - + chrm_coverage2.shape[0]), + 'constant', constant_values=0) + chrm_coverage else: - merged_chrom_cov = np.pad( - chrom_coverage, (0, chrom_coverage2.shape[0] - - chrom_coverage.shape[0]), - 'constant', constant_values=0) + chrom_coverage2 + merged_chrm_cov = np.pad( + chrm_coverage, (0, chrm_coverage2.shape[0] - + chrm_coverage.shape[0]), + 'constant', constant_values=0) + chrm_coverage2 - chrm_coverage_regions = [ - (x, len(list(y))) for x, y in groupby(merged_chrom_cov)] + reg_covs, reg_lens = zip(*[ + (x, len(list(y))) for x, y in groupby(merged_chrm_cov)]) coverage_regions.extend(zip( - zip(*chrm_coverage_regions)[0], - np.cumsum(np.insert( - zip(*chrm_coverage_regions)[1], 0, 0)), - repeat(chrom), repeat(strand))) + reg_covs, np.cumsum(np.insert(reg_lens, 0, 0)), + repeat(chrm), repeat(strand))) # max coverage plots both strands coverage plot_intervals = [ @@ -1396,23 +1467,13 @@ def plot_genome_locations( return def plot_per_read_mods_genome_location( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - num_bases, genome_locations, tb_model_fn, alt_model_fn, - fm_lag, num_reads, box_center, plot_default_stnd, - plot_default_alt): - if (tb_model_fn is None and not plot_default_stnd and - alt_model_fn is None and not plot_default_alt): - sys.stderr.write('*********** WARNING ********\n\tNo model ' + - 'indicated, so loading default standard model.\n') - plot_default_stnd = True - - if VERBOSE: sys.stderr.write( - 'Parsing per read modifications at genome locations.\n') + f5_dirs, corrected_group, basecall_subgroups, pdf_fn, + per_read_stats_fn, genome_locations, num_bases, num_reads, box_center, + fasta_fn): + if VERBOSE: sys.stderr.write('Parsing genome locations.\n') genome_locations = [ chrm_pos.replace('"', '').replace("'", "").split(':')[:3] for chrm_pos in genome_locations] - # minus one here as all python internal coords are 0-based, but - # genome is generally 1-based 
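The np.pad calls above handle two samples whose coverage arrays for the same (chromosome, strand) differ in length: the shorter array is zero-padded on the right before the element-wise sum. A sketch with made-up coverage::

    import numpy as np

    cov1 = np.array([3, 3, 1])
    cov2 = np.array([2, 2, 2, 4, 4])
    merged = np.pad(cov1, (0, cov2.shape[0] - cov1.shape[0]),
                    'constant', constant_values=0) + cov2
    # merged == array([5, 5, 3, 4, 4])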
plot_intervals = [] for i, chrm_pos_strand in enumerate(genome_locations): if len(chrm_pos_strand) == 2: @@ -1425,15 +1486,52 @@ def plot_per_read_mods_genome_location( plot_intervals.append(th.intervalData( '{:03d}'.format(i), chrm, int_start, int_start + num_bases, strand)) - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - tb_model_fn, alt_model_fn = get_valid_model_fns( - tb_model_fn, plot_default_stnd, alt_model_fn, - plot_default_alt, raw_read_coverage) + # add sequence to each region if fast5s or fasta are provided + if fasta_fn is not None: + genome_index = th.Fasta(fasta_fn) + plot_intervals_w_seq = [] + for int_data in plot_intervals: + plot_intervals_w_seq.append( + int_data._replace(seq=genome_index.get_seq( + int_data.chrm, int_data.start, int_data.end))) + plot_intervals = plot_intervals_w_seq + elif f5_dirs is not None: + raw_read_coverage = th.parse_fast5s( + f5_dirs, corrected_group, basecall_subgroups) + plot_intervals = th.get_region_reads(plot_intervals, raw_read_coverage) + else: + th._warning_message( + 'No read FAST5 directory or genome FASTA file provided. ' + + 'Plotting without sequence.') + + if VERBOSE: sys.stderr.write('Parsing per read statistics.\n') + per_read_stats = ts.PerReadStats(per_read_stats_fn) + interval_stats = [] + for int_data in plot_intervals: + int_stats = per_read_stats.get_region_stats(int_data, num_reads) + if int_stats is not None: + # convert long form stats to matrix form (so they can be clustered) + int_stats.sort(order=str('read_id')) + # use interval data instead of stats dimensions since regDat is + # used to compute some window distances in R, so it must be full + # matrix for the region with NAs + int_len = int_data.end - int_data.start + 1 + all_read_stats = np.split( + int_stats, np.where(int_stats['read_id'][:-1] != + int_stats['read_id'][1:])[0] + 1) + read_stats_mat = np.empty((len(all_read_stats), int_len)) + read_stats_mat[:] = np.NAN + for read_i, read_int_stats in enumerate(all_read_stats): + np.put(read_stats_mat[read_i,:], + read_int_stats['pos'] - int_data.start, + read_int_stats['stat']) + interval_stats.append((int_data.reg_id, read_stats_mat)) + + are_pvals = per_read_stats.are_pvals + per_read_stats.close() plot_per_read_modification( - plot_intervals, raw_read_coverage, tb_model_fn, alt_model_fn, pdf_fn, - fm_lag, num_reads, box_center) + plot_intervals, interval_stats, are_pvals, box_center, pdf_fn) return @@ -1443,32 +1541,52 @@ def plot_motif_centered( motif, fasta_fn, deepest_coverage, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): if VERBOSE: sys.stderr.write('Identifying genomic k-mer locations.\n') - fasta_records = th.parse_fasta(fasta_fn) - motif_pat = th.parse_motif(motif) - motif_len = len(motif) + genome_index = th.Fasta(fasta_fn) + motif = th.TomboMotif(motif) + def get_motif_locs(covered_chrms): - # TODO: search over negative strand as well motif_locs = [] - for chrm, seq in fasta_records.iteritems(): + for chrm in genome_index.iter_chrms(): if chrm not in covered_chrms: continue - for motif_loc in motif_pat.finditer(seq): - motif_locs.append((chrm, motif_loc.start())) + seq = genome_index.get_seq(chrm) + for motif_loc in motif.motif_pat.finditer(seq): + motif_locs.append((chrm, motif_loc.start(), '+' + if not motif.is_palindrome else None)) + # search over negative strand as well if not palindromic + if not motif.is_palindrome: + for motif_loc in motif.rev_comp_pat.finditer(seq): + motif_locs.append((chrm, motif_loc.start(), '-')) if 
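The long-form to matrix conversion above deserves a small worked example (records made up): reads become rows, interval positions become columns, and unobserved cells stay NaN so downstream clustering sees the full region width::

    import numpy as np

    recs = np.array([(0, 3, 0.1), (0, 5, 0.2), (1, 4, 0.9)],
                    dtype=[('read_id', 'i4'), ('pos', 'i8'), ('stat', 'f8')])
    int_start, int_len = 3, 4
    reads = np.split(recs, np.where(
        recs['read_id'][:-1] != recs['read_id'][1:])[0] + 1)
    mat = np.full((len(reads), int_len), np.nan)
    for read_i, read_recs in enumerate(reads):
        np.put(mat[read_i, :], read_recs['pos'] - int_start,
               read_recs['stat'])
    # row 0: [0.1, nan, 0.2, nan]; row 1: [nan, 0.9, nan, nan]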
len(motif_locs) == 0: - sys.stderr.write( - 'Motif (' + motif_pat.pattern + - ') not found in genome.\n') - sys.exit() + th._error_message_and_exit( + 'Motif (' + motif.raw_motif + ') not found in genome.') elif len(motif_locs) < num_regions: - sys.stderr.write( - 'WARNING: Motif (' + motif_pat.pattern + - ') only found ' + str(len(motif_locs)) + - 'times in genome.\n') + th._warning_message( + 'Motif (' + motif.raw_motif + ') only found ' + + unicode(len(motif_locs)) + ' times in genome.') num_region = len(motif_locs) np.random.shuffle(motif_locs) return motif_locs + def get_pos_cov(chrm, pos, strand, read_coverage, read_coverage2=None): + def get_strand_cov(cov_strand): + try: + if read_coverage2 is None: + return read_coverage[(chrm, cov_strand)][pos] + else: + return min(read_coverage[(chrm, cov_strand)][pos], + read_coverage2[(chrm, cov_strand)][pos]) + except (IndexError, KeyError): + return 0 + + # if strand is not specified get max coverage over both strands + if strand is None: + return max(get_strand_cov('+'), get_strand_cov('-')) + # else get coverage for strand with motif + return get_strand_cov(strand) + + raw_read_coverage = th.parse_fast5s( f5_dirs1, corrected_group, basecall_subgroups) tb_model_fn, alt_model_fn = get_valid_model_fns( @@ -1477,127 +1595,120 @@ def get_motif_locs(covered_chrms): if deepest_coverage: read_coverage = th.get_coverage(raw_read_coverage) - if f5_dirs2 is not None: - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, basecall_subgroups) - - covered_chrms = set(zip(*raw_read_coverage)[0]).intersection( - zip(*raw_read_coverage2)[0]) + if f5_dirs2 is None: + covered_chrms = set(map(itemgetter(0), raw_read_coverage)) # filter out motif_locs to chromosomes not covered motif_locs = get_motif_locs(covered_chrms) if deepest_coverage: - read_coverage2 = th.get_coverage(raw_read_coverage2) - if VERBOSE: sys.stderr.write( - 'Finding deepest coverage regions.\n') - def get_cov(chrm, pos): - try: - plus_cov = min(read_coverage[(chrm, '+')][pos], - read_coverage2[(chrm, '+')][pos]) - except (IndexError, KeyError): - plus_cov = 0 - try: - minus_cov = min(read_coverage[(chrm, '-')][pos], - read_coverage2[(chrm, '-')][pos]) - except (IndexError, KeyError): - minus_cov = 0 - return max(plus_cov, minus_cov) - + if VERBOSE: sys.stderr.write('Finding deepest coverage regions.\n') motif_locs_cov = sorted([ - (get_cov(chrm, pos), chrm, pos) - for chrm, pos in motif_locs], reverse=True) - if motif_locs_cov[0][0] == 0: - sys.stderr.write( - '*' * 60 + '\nERROR: Motif not covered ' + - 'by both groups at any positions.\n' - + '*' * 60 + '\n') - sys.exit() - + (get_pos_cov(chrm, pos, strand, read_coverage), + chrm, pos, strand) + for chrm, pos, strand in motif_locs], reverse=True) plot_intervals = [] - for i, (cov, chrm, pos) in enumerate(motif_locs_cov): + for i, (cov, chrm, pos, strand) in enumerate(motif_locs_cov): int_start = max( - 0, pos - int((num_bases - motif_len + 1) / 2.0)) + 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) + int_end = int_start + num_bases plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, - int_start + num_bases, '+')) + '{:03d}'.format(i), chrm, int_start, int_end, strand)) if len(plot_intervals) >= num_regions: break + # plot random covered regions else: # iterate over regions and check if they have any coverage plot_intervals = [] - for i, (chrm, pos) in enumerate(motif_locs): + for i, (chrm, pos, strand) in enumerate(motif_locs): int_start = max( - 0, pos - int((num_bases - motif_len + 1) / 
2.0)) + 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) int_end = int_start + num_bases - for strand in ('+', '-'): - if ((chrm, strand) in raw_read_coverage and - (chrm, strand) in raw_read_coverage2 and - any(r_data.start < pos < r_data.end - for r_data in raw_read_coverage[(chrm, strand)]) and - any(r_data2.start < pos < r_data2.end - for r_data2 in raw_read_coverage2[(chrm, strand)])): - plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, - int_end, strand)) + if strand is None and any( + ((chrm, s) in raw_read_coverage and + any(r_data.start < pos < r_data.end + for r_data in raw_read_coverage[(chrm, s)])) + for s in ('+', '-')): + plot_intervals.append(th.intervalData( + '{:03d}'.format(i), chrm, int_start, int_end, strand)) + elif ((chrm, strand) in raw_read_coverage and + any(r_data.start < pos < r_data.end + for r_data in raw_read_coverage[(chrm, strand)])): + plot_intervals.append(th.intervalData( + '{:03d}'.format(i), chrm, int_start, int_end, strand)) if len(plot_intervals) >= num_regions: break - plot_two_samples( - plot_intervals, raw_read_coverage, raw_read_coverage2, - overplot_thresh, overplot_type, pdf_fn) + if tb_model_fn is None: + plot_single_sample( + plot_intervals, raw_read_coverage, overplot_thresh, + overplot_type, pdf_fn) + else: + plot_model_single_sample( + plot_intervals, raw_read_coverage, tb_model_fn, + overplot_type, overplot_thresh, pdf_fn, alt_model_fn) + # two sample plot else: - covered_chrms = set(zip(*raw_read_coverage)[0]) + raw_read_coverage2 = th.parse_fast5s( + f5_dirs2, corrected_group, basecall_subgroups) + + covered_chrms = set(map(itemgetter(0), raw_read_coverage)).intersection( + map(itemgetter(0), raw_read_coverage2)) # filter out motif_locs to chromosomes not covered motif_locs = get_motif_locs(covered_chrms) if deepest_coverage: - if VERBOSE: sys.stderr.write( - 'Finding deepest coverage regions.\n') - def get_cov(chrm, pos): - try: - plus_cov = read_coverage[(chrm, '+')][pos] - except (IndexError, KeyError): - plus_cov = 0 - try: - minus_cov = read_coverage[(chrm, '-')][pos] - except (IndexError, KeyError): - minus_cov = 0 - return max(plus_cov, minus_cov) - + read_coverage2 = th.get_coverage(raw_read_coverage2) + if VERBOSE: sys.stderr.write('Finding deepest coverage regions.\n') motif_locs_cov = sorted([ - (get_cov(chrm, pos), chrm, pos) - for chrm, pos in motif_locs], reverse=True) + (get_pos_cov(chrm, pos, strand, read_coverage, read_coverage2), + chrm, pos, strand) + for chrm, pos, strand in motif_locs], reverse=True) + if motif_locs_cov[0][0] == 0: + th._error_message_and_exit( + 'Motif not covered by both groups at any positions.') plot_intervals = [] - for i, (cov, chrm, pos)in enumerate(motif_locs_cov): + for i, (cov, chrm, pos, strand) in enumerate(motif_locs_cov): int_start = max( - 0, pos - int((num_bases - motif_len + 1) / 2.0)) - int_end = int_start + num_bases + 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_end, '+')) + '{:03d}'.format(i), chrm, int_start, + int_start + num_bases, strand)) if len(plot_intervals) >= num_regions: break + # plot random covered regions else: # iterate over regions and check if they have any coverage plot_intervals = [] - for i, (chrm, pos) in enumerate(motif_locs): + for i, (chrm, pos, strand) in enumerate(motif_locs): int_start = max( - 0, pos - int((num_bases - motif_len + 1) / 2.0)) + 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) int_end = int_start + num_bases 
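# A minimal runnable sketch (not part of the patch; all names and values
# hypothetical) of two ideas used in this function: get_pos_cov treats a
# missing (chrm, strand) key or an out-of-range position as zero coverage
# via try/except, and plot windows of num_bases are placed so the motif
# sits roughly centered, clipped at the chromosome start.
import numpy as np

toy_cov = {('chr1', '+'): np.array([0, 3, 5, 5, 2]),
           ('chr1', '-'): np.array([1, 1, 4, 0, 0])}

def toy_pos_cov(chrm, pos, strand):
    # missing keys and positions past the array end count as no coverage
    try:
        return toy_cov[(chrm, strand)][pos]
    except (IndexError, KeyError):
        return 0

# unknown strand (palindromic motif): take the deeper of the two strands
assert max(toy_pos_cov('chr1', 2, '+'), toy_pos_cov('chr1', 2, '-')) == 5

# center a 21 base window on a 4 base motif found at position 100
num_bases, motif_len, pos = 21, 4, 100
int_start = max(0, pos - int((num_bases - motif_len + 1) / 2.0))
assert (int_start, int_start + num_bases) == (91, 112)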
- for strand in ('+', '-'): - if ((chrm, strand) in raw_read_coverage and + if strand is None and any(( + (chrm, s) in raw_read_coverage and + (chrm, s) in raw_read_coverage2 and any(r_data.start < pos < r_data.end - for r_data in raw_read_coverage[(chrm, strand)])): - plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, - int_end, strand)) + for r_data in raw_read_coverage[(chrm, s)]) and + any(r_data2.start < pos < r_data2.end + for r_data2 in raw_read_coverage2[(chrm, s)])) + for s in ('+', '-')): + plot_intervals.append(th.intervalData( + '{:03d}'.format(i), chrm, int_start, int_end, strand)) + elif ((chrm, strand) in raw_read_coverage and + (chrm, strand) in raw_read_coverage2 and + any(r_data.start < pos < r_data.end + for r_data in raw_read_coverage[(chrm, strand)]) and + any(r_data2.start < pos < r_data2.end + for r_data2 in raw_read_coverage2[(chrm, strand)])): + plot_intervals.append(th.intervalData( + '{:03d}'.format(i), chrm, int_start, int_end, strand)) + if len(plot_intervals) >= num_regions: break - if tb_model_fn is None: - plot_single_sample( - plot_intervals, raw_read_coverage, overplot_thresh, - overplot_type, pdf_fn) - else: - plot_model_single_sample( - plot_intervals, raw_read_coverage, tb_model_fn, - overplot_type, overplot_thresh, pdf_fn, alt_model_fn) + if len(plot_intervals) == 0: + th._error_message_and_exit( + 'Motif not covered by both groups at any positions.') + + plot_two_samples( + plot_intervals, raw_read_coverage, raw_read_coverage2, + overplot_thresh, overplot_type, pdf_fn) return @@ -1650,16 +1761,15 @@ def plot_max_diff( def plot_most_signif( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, f5_dirs2, num_regions, overplot_thresh, seqs_fn, num_bases, - overplot_type, qval_thresh, stats_fn, tb_model_fn, alt_model_fn, - plot_default_stnd, plot_default_alt, stat_order): + overplot_type, stats_fn, tb_model_fn, alt_model_fn, + plot_default_stnd, plot_default_alt): if VERBOSE: sys.stderr.write('Loading statistics from file.\n') all_stats, stat_type = ts.parse_stats(stats_fn) raw_read_coverage = th.parse_fast5s( f5_dirs1, corrected_group, basecall_subgroups) plot_intervals = ts.get_most_signif_regions( - all_stats, num_bases, num_regions, qval_thresh, - fraction_order=not stat_order) + all_stats, num_bases, num_regions) tb_model_fn, alt_model_fn = get_valid_model_fns( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, raw_read_coverage, f5_dirs2) @@ -1682,17 +1792,18 @@ def plot_most_signif( return -def get_unique_intervals(plot_intervals, covered_poss, num_regions=None): +def get_unique_intervals(plot_intervals, covered_poss=None, num_regions=None): # unique genomic regions filter uniq_p_intervals = [] used_intervals = defaultdict(set) for int_i in plot_intervals: # could have significant region immediately next to # beginning/end of reads - interval_poss = range(int_i.start, int_i.end) - if int_i.start not in used_intervals[(int_i.chrm, int_i.strand)] and all( - pos in covered_poss[(int_i.chrm, int_i.strand)] - for pos in interval_poss): + interval_poss = list(range(int_i.start, int_i.end)) + if int_i.start not in used_intervals[(int_i.chrm, int_i.strand)] and ( + covered_poss is None or all( + pos in covered_poss[(int_i.chrm, int_i.strand)] + for pos in interval_poss)): uniq_p_intervals.append(int_i) used_intervals[(int_i.chrm, int_i.strand)].update(interval_poss) if num_regions is not None and len(uniq_p_intervals) >= num_regions: @@ -1703,31 +1814,23 @@ def get_unique_intervals(plot_intervals, covered_poss, 
num_regions=None):
 def plot_motif_centered_signif(
         f5_dirs1, corrected_group, basecall_subgroups, pdf_fn,
         f5_dirs2, num_regions, overplot_thresh, motif, stats_fn,
-        context_width, num_stats, stat_order, tb_model_fn):
+        context_width, num_stats, tb_model_fn, alt_model_fn,
+        plot_default_stnd, plot_default_alt, fasta_fn):
     try:
-        cowplot = importr("cowplot")
+        importr(str('gridExtra'))
     except:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must have R packge `cowplot` ' +
-            'installed in order to create motif centered plots ' +
-            '(install via `install.packages(cowplot)` from ' +
-            'an R prompt).\n' + '*' * 60 + '\n\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'Must have R package `gridExtra` installed in order to ' +
+            'create motif centered plots.')
 
-    motif_pat = th.parse_motif(motif)
-    motif_len = len(motif)
+    motif = th.TomboMotif(motif)
+
+    if fasta_fn is not None:
+        genome_index = th.Fasta(fasta_fn)
 
     if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
     all_stats, stat_type = ts.parse_stats(stats_fn)
-    # check that fraction un-modified is included in stats data
-    if not stat_order and all(np.isnan(all_stats['frac'][:50])):
-        sys.stderr.write(
-            '********** WARNING *********\tFraction requested for plotting, ' +
-            'but does not appear to be included in statistics file, so ' +
-            'plotting q-values.')
-        stat_order = True
-    if not stat_order:
-        all_stats.sort(order='frac')
+    all_stats.sort(order=str('frac'))
 
     raw_read_coverage1 = th.parse_fast5s(
         f5_dirs1, corrected_group, basecall_subgroups)
@@ -1735,78 +1838,94 @@ def plot_motif_centered_signif(
         f5_dirs2, corrected_group, basecall_subgroups) \
         if f5_dirs2 is not None else None
 
-    def log_max_stat(pval):
-        return -np.log10(max(th.SMALLEST_PVAL, pval))
-    def get_stats(stat):
-        if not stat_order:
-            return 1 - stat['frac']
-        return log_max_stat(stat['mt_stat'])
+    tb_model_fn, alt_model_fn = get_valid_model_fns(
+        tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt,
+        raw_read_coverage1, f5_dirs2)
+
     all_stats_dict = dict(
-        ((str(stat['chrm']), str(stat['strand']), stat['pos']), get_stats(stat))
-        for stat in all_stats)
-    covered_poss = defaultdict(set)
-    for stat in all_stats:
-        covered_poss[(str(stat['chrm']), str(stat['strand']))].add(stat['pos'])
+        ((stat[str('chrm')].decode(), stat[str('strand')].decode(),
+          stat[str('pos')]), 1 - stat[str('frac')]) for stat in all_stats)
 
-    if VERBOSE: sys.stderr.write(
-        'Finding signficant regions with motif.\n')
+    if VERBOSE: sys.stderr.write('Finding significant regions with motif.\n')
    motif_regions_data = []
+    search_width = ((context_width + motif.motif_len) * 2) - 1
     for stat in all_stats:
-        reg_data = th.get_region_sequences(
-            [th.intervalData(
-                '0', str(stat['chrm']), stat['pos'] - motif_len + 1,
-                stat['pos'] + motif_len, str(stat['strand']))],
-            raw_read_coverage1, raw_read_coverage2)[0]
+        chrm, strand, start, end = (
+            stat['chrm'].decode(), stat['strand'].decode(),
+            max(stat['pos'] - motif.motif_len - context_width + 1, 0),
+            stat['pos'] + motif.motif_len + context_width)
+        if fasta_fn is None:
+            reg_seq = th.get_region_sequences(
+                [th.intervalData('0', chrm, start, end, strand)],
+                raw_read_coverage1, raw_read_coverage2)[0].seq
+        else:
+            reg_seq = genome_index.get_seq(chrm, start, end)
+
+        if strand == '-':
+            reg_seq = th.rev_comp(reg_seq)
 
-        reg_match = motif_pat.search(reg_data.seq)
+        reg_match = motif.motif_pat.search(reg_seq)
         if reg_match:
-            motif_regions_data.append((
-                stat['pos'], str(stat['chrm']), str(stat['strand']),
-                reg_match.start()))
+            offset = 
reg_match.start() + if strand == '-': + offset = search_width - offset - motif.motif_len + reg_start = (stat['pos'] - motif.motif_len + offset - + (context_width * 2) + 1) + if (reg_start, chrm, strand) not in motif_regions_data: + motif_regions_data.append((reg_start, chrm, strand)) if len(motif_regions_data) >= num_stats: break - # get plot intervals for all stat regions then trim to - # num_regions after getting all p-values for plotting - plot_width = motif_len + (context_width * 2) - plot_intervals = [] - for i, (pos, chrm, strand, offset) in enumerate(motif_regions_data): - int_start = pos - motif_len + offset - context_width + 1 - plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_start + plot_width, - strand)) - if len(plot_intervals) >= num_stats: break - # need to handle forward and reverse strand stats separately since - # reverse strand stats are in reverse order wrt motif - # note check for key in stats dict as significant position - # may lie next to region with coverage below the threshold + if len(motif_regions_data) == 0: + th._error_message_and_exit( + 'No covered and tested sites contain motif of interest.') + if len(motif_regions_data) < num_stats: + th._warning_message( + 'Fewer covered and tested motif sites found than requested.') + + plot_width = motif.motif_len + (context_width * 2) + def get_stat_pos(start, chrm, strand): + # need to handle forward and reverse strand stats separately since + # reverse strand stats are in reverse order wrt motif + # note try-except for key in stats dict as significant position + # may lie next to region with coverage below the threshold + reg_pos_stats = [] + for pos in range(start, start + plot_width): + try: + stat = all_stats_dict[(chrm, strand, pos)] + except KeyError: + stat = 0.0 + if strand == '+': + plot_pos = pos - start + else: + plot_pos = -1 * (pos - start - plot_width + 1) + reg_pos_stats.append((plot_pos, stat)) + + return reg_pos_stats + stat_locs = [ - (pos - int_i.start, all_stats_dict[(int_i.chrm, int_i.strand, pos)] - if (chrm, strand, pos) in all_stats_dict else 0.0) - for int_i in plot_intervals for pos in range(int_i.start, int_i.end) - if int_i.strand == '+'] + [ - (-1 * (pos - int_i.end + 1), - all_stats_dict[(int_i.chrm, int_i.strand, pos)] - if (int_i.chrm, int_i.strand, pos) in all_stats_dict - else 0.0) for int_i in plot_intervals - for pos in range(int_i.start, int_i.end) if int_i.strand == '-'] + loc_stat for motif_loc in motif_regions_data + for loc_stat in get_stat_pos(*motif_loc)] + # TODO: Fix so that negative strand reads are plotted too. 
- # requires adding "don't reverse signal" option in getting plot - # data - plot_intervals = [ - int_i for int_i in plot_intervals if int_i.strand == '+'] - plot_intervals = get_unique_intervals( - plot_intervals, covered_poss, num_regions) + # requires adding "don't reverse signal" option in getting plot data + plot_intervals = [] + for i, (reg_start, chrm, strand) in enumerate(motif_regions_data): + if strand == '-': continue + plot_intervals.append(th.intervalData( + '{:03d}'.format(i), chrm, reg_start, reg_start + plot_width, strand)) + if len(plot_intervals) >= num_regions: + break plot_motif_centered_with_stats( raw_read_coverage1, raw_read_coverage2, plot_intervals, - stat_locs, overplot_thresh, pdf_fn, not stat_order, tb_model_fn) + stat_locs, overplot_thresh, pdf_fn, tb_model_fn, alt_model_fn) return def cluster_most_signif( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_regions, qval_thresh, num_bases, + f5_dirs2, num_regions, num_bases, r_struct_fn, num_processes, fasta_fn, stats_fn, slide_span): if VERBOSE: sys.stderr.write('Loading statistics from file.\n') all_stats, stat_type = ts.parse_stats(stats_fn) @@ -1827,7 +1946,7 @@ def cluster_most_signif( read_coverage2)) plot_intervals = ts.get_most_signif_regions( - all_stats, num_bases + (slide_span * 2), num_regions, qval_thresh) + all_stats, num_bases + (slide_span * 2), num_regions) # unique genomic regions filter plot_intervals = get_unique_intervals(plot_intervals, covered_poss) @@ -1839,19 +1958,18 @@ def cluster_most_signif( # the exact range found expand_pos = 2 seq_intervals = [ - th.intervalData( - int_i.reg_id, int_i.chrm, int_i.start - expand_pos, - int_i.start + expand_pos + num_bases + (slide_span * 2), - int_i.strand, int_i.reg_text) + int_i._replace( + start=int_i.start - expand_pos, + end=int_i.start + expand_pos + num_bases + (slide_span * 2)) for int_i in plot_intervals] if fasta_fn is None: # add region sequences to column names for saved dist matrix reg_seqs = [reg_data.seq for reg_data in th.get_region_sequences( seq_intervals, raw_read_coverage1, raw_read_coverage2)] else: - fasta_records = th.parse_fasta(fasta_fn) + genome_index = th.Fasta(fasta_fn) reg_seqs = [ - fasta_records[int_i.chrm][int_i.start:int_i.end] + genome_index.get_seq(int_i.chrm, int_i.start, int_i.end) for int_i in seq_intervals] if VERBOSE: sys.stderr.write('Getting base signal.\n') @@ -1870,14 +1988,6 @@ def cluster_most_signif( int_i.start:int_i.start+num_bases+(slide_span_val*2)]) for int_i in plot_intervals] - # some code to output reg signal for discovery plotting - """sys.stdout.write('\n'.join( - '\t'.join(('\t'.join(map(str, reg_diff)), reg_seq, - int_i.chrm, str(int_i.start), int_i.strand)) - for reg_seq, int_i, reg_diff in - zip(reg_seqs, plot_intervals reg_sig_diffs)) + '\n') - sys.exit()""" - if VERBOSE: sys.stderr.write('Getting distance between signals.\n') manager = mp.Manager() index_q = manager.Queue() @@ -1888,7 +1998,7 @@ def cluster_most_signif( args = (reg_sig_diffs, index_q, dists_q, slide_span) processes = [] - for p_id in xrange(num_processes): + for p_id in range(num_processes): p = mp.Process(target=ts.get_pairwise_dists, args=args) p.start() @@ -1899,7 +2009,7 @@ def cluster_most_signif( try: row_dists = dists_q.get(block=False) reg_sig_diff_dists.append(row_dists) - except Queue.Empty: + except queue.Empty: sleep(1) continue # empty any entries left in queue after processes have finished @@ -1907,7 +2017,7 @@ def cluster_most_signif( row_dists = dists_q.get(block=False) 
reg_sig_diff_dists.append(row_dists) - reg_sig_diff_dists = zip(*sorted(reg_sig_diff_dists))[1] + reg_sig_diff_dists = list(map(itemgetter(1), sorted(reg_sig_diff_dists))) reg_sig_diff_dists = r.r.matrix( r.FloatVector(np.concatenate(reg_sig_diff_dists)), @@ -1915,16 +2025,16 @@ def cluster_most_signif( if r_struct_fn is not None: reg_sig_diff_dists.colnames = r.StrVector( - ['::'.join((seq, int_i.chrm, int_i.strand, str(int_i.start))) + ['::'.join((seq, int_i.chrm, int_i.strand, unicode(int_i.start))) for seq, int_i in zip(reg_seqs, plot_intervals)]) r_struct_fn = r.StrVector([r_struct_fn,]) else: r_struct_fn = r.NA_Character if VERBOSE: sys.stderr.write('Plotting (and saving data).\n') - r.r(resource_string(__name__, 'R_scripts/plotSigMDS.R')) + r.r(resource_string(__name__, 'R_scripts/plotSigMDS.R').decode()) r.r('pdf("' + pdf_fn + '", height=7, width=7)') - r.globalenv['plotSigMDS'](reg_sig_diff_dists, r_struct_fn) + r.globalenv[str('plotSigMDS')](reg_sig_diff_dists, r_struct_fn) r.r('dev.off()') return @@ -1943,20 +2053,29 @@ def plot_main(args): ts.VERBOSE = VERBOSE try: - ggplot = importr("ggplot2") + ggplot = importr(str('ggplot2')) except: - sys.stderr.write(GG_LOAD_ERROR) - sys.exit() - - base_args = [args.fast5_basedirs, args.corrected_group, - args.basecall_subgroups, args.pdf_filename] + th._error_message_and_exit( + 'Must have rpy2, R and R package ggplot2 installed in ' + + 'order to plot. If these packages are installed, ' + + 'run:\n\t\t`python -c "import rpy2.robjects; from ' + + 'rpy2.robjects.packages import importr; ' + + 'importr(str(\'ggplot2\'));"`\n\t to see installation issues.') + + # roc plotting doesn't use read dirs + try: + base_args = [args.fast5_basedirs, args.corrected_group, + args.basecall_subgroups, args.pdf_filename] + except: + pass try: genome_opts = [ ('overplot_thresh', args.overplot_threshold), ('overplot_type', args.overplot_type)] except: pass - nbase_opt = [('num_bases', args.num_bases if 'num_bases' in args else None),] + nbase_opt = [('num_bases', args.num_bases + if 'num_bases' in args else None),] nreg_opt = [('num_regions', args.num_regions if 'num_regions' in args else None),] nobs_opt = [('num_obs', args.num_obs @@ -1982,10 +2101,9 @@ def plot_main(args): motif_opt = [('motif', args.motif if 'motif' in args else None),] seqfn_opt = [('seqs_fn', args.sequences_filename if 'sequences_filename' in args else None),] - qval_opt = [('qval_thresh', args.q_value_threshold - if 'q_value_threshold' in args else None),] statfn_opt = [('stats_fn', args.statistics_filename if 'statistics_filename' in args else None),] + if args.subcmd == 'plot_max_coverage': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt) @@ -2006,16 +2124,15 @@ def plot_main(args): plot_max_diff(*base_args, **kwargs) elif args.subcmd == 'plot_most_significant': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + - seqfn_opt + qval_opt + statfn_opt + tbmod_opt + atbmod_opt + - dtbmod_opt + datbmod_opt + - [('stat_order', args.statistic_order)]) + seqfn_opt + statfn_opt + tbmod_opt + atbmod_opt + + dtbmod_opt + datbmod_opt) plot_most_signif(*base_args, **kwargs) elif args.subcmd == 'plot_motif_with_stats': kwargs = dict(f5dirs2_opt + nreg_opt + motif_opt + statfn_opt + - tbmod_opt + + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt + + fasta_opt + [('overplot_thresh', args.overplot_threshold), ('context_width', args.num_context), - ('stat_order', args.statistic_order), ('num_stats', 
args.num_statistics)]) plot_motif_centered_signif(*base_args, **kwargs) elif args.subcmd == 'plot_correction': @@ -2027,11 +2144,17 @@ def plot_main(args): ('include_orig_bcs', args.include_original_basecalls)]) plot_multi_corrections(*base_args, **kwargs) elif args.subcmd == 'cluster_most_significant': - kwargs = dict(f5dirs2_opt + nreg_opt + qval_opt + nbase_opt + + kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + fasta_opt + statfn_opt + rdata_opt + [('num_processes', args.processes), ('slide_span', args.slide_span)]) cluster_most_signif(*base_args, **kwargs) + elif args.subcmd == 'plot_per_read': + kwargs = dict(glocs_opt + fasta_opt + nbase_opt + + [('per_read_stats_fn', args.per_read_statistics_filename), + ('num_reads', args.num_reads), + ('box_center', args.box_center)]) + plot_per_read_mods_genome_location(*base_args, **kwargs) elif args.subcmd == 'plot_kmer': kwargs = dict(nread_opt + rdata_opt + [('read_mean', args.read_mean), @@ -2040,13 +2163,13 @@ def plot_main(args): ('kmer_thresh', args.num_kmer_threshold), ('dont_plot', args.dont_plot)]) plot_kmer_dist(*base_args, **kwargs) - elif args.subcmd == 'plot_per_read': - kwargs = dict(nbase_opt + glocs_opt + tbmod_opt + atbmod_opt + - dtbmod_opt + datbmod_opt + - [('fm_lag', args.fishers_method_context), - ('num_reads', args.num_reads), - ('box_center', args.box_center)]) - plot_per_read_mods_genome_location(*base_args, **kwargs) + elif args.subcmd == 'plot_roc': + kwargs = dict(fasta_opt + + [('pdf_fn', args.pdf_filename), + ('motif_descs', args.motif_descriptions), + ('stats_fns', args.statistics_filenames), + ('min_reads', args.minimum_test_reads)]) + plot_roc(**kwargs) else: sys.stderr.write('ERROR: Invalid tombo sub-command entered. ' + 'Should have been caught by argparse.\n') @@ -2055,5 +2178,5 @@ def plot_main(args): if __name__ == '__main__': - raise NotImplementedError, ( + raise NotImplementedError( 'This is a module. See commands with `tombo -h`') diff --git a/tombo/resquiggle.py b/tombo/resquiggle.py index 52439cc..f611b5a 100644 --- a/tombo/resquiggle.py +++ b/tombo/resquiggle.py @@ -1,70 +1,73 @@ -import os, sys +from __future__ import division, unicode_literals, absolute_import +from builtins import int, range, dict, map, zip + +import os +import io import re +import sys +import mappy +import queue + +# Future warning from cython in h5py +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) import h5py -import Queue -import pkg_resources import numpy as np np.seterr(all='raise') import multiprocessing as mp from time import sleep -from subprocess import call -from tempfile import NamedTemporaryFile -from collections import defaultdict, namedtuple +from operator import itemgetter +from collections import defaultdict + +if sys.version_info[0] > 2: + unicode = str # import tombo modules/functions -import tombo_stats as ts -import tombo_helper as th +from . import tombo_stats as ts +from . 
import tombo_helper as th -from c_helper import c_new_means, c_valid_cpts_w_cap_t_test -from _model_resquiggle import forward_pass, traceback -from dynamic_programming import c_reg_z_scores, c_banded_forward_pass, \ - c_banded_traceback, c_base_z_scores, c_adaptive_banded_forward_pass +from ._default_parameters import ( + SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, EXTRA_SIG_FACTOR, MASK_FILL_Z_SCORE, + MASK_BASES, START_BANDWIDTH, START_SEQ_WINDOW, BAND_BOUNDARY_THRESH, + DEL_FIX_WINDOW, MAX_DEL_FIX_WINDOW, MIN_EVENT_TO_SEQ_RATIO, MAX_RAW_CPTS) + +from .dynamic_programming import traceback, forward_pass +from .c_helper import ( + c_new_means, c_valid_cpts_w_cap, c_valid_cpts_w_cap_t_test) +from .c_dynamic_programming import ( + c_reg_z_scores, c_banded_forward_pass, c_banded_traceback, + c_base_z_scores, c_adaptive_banded_forward_pass) VERBOSE = False +PROGRESS_INTERVAL = 1000 _PROFILE_RSQGL = False -_PROFILE_ALIGN = False _DEBUG_FIT = False _DEBUG_FULL = False _DEBUG_MIDDLE = False +_DEBUG_PARAMS = False +_DRY_RUN = any((_DEBUG_PARAMS, _DEBUG_FIT, _DEBUG_FULL, _DEBUG_MIDDLE)) _NUM_DEBUG_ENDS = 250 -# allow this many times the alignment batch size into the queue of -# reads to be resquiggled -ALIGN_BATCH_MULTIPLIER = 5 -PROGRESS_INTERVAL = 500 - -# table containing the number of observations per event and the running -# difference width for different conditions this will need to be updated -# particularly for RNA motor updates most likely -OBS_PER_EVENT_TABLE = {'RNA':(60, 20), 'DNA':(7, 3)} - -readInfo = namedtuple( - 'readInfo', - ('ID', 'Subgroup', 'ClipStart', 'ClipEnd', - 'Insertions', 'Deletions', 'Matches', 'Mismatches')) -mapperData = namedtuple('mapperData', ('exe', 'type', 'index')) -# set default index to None -mapperData.__new__.__defaults__ = (None,) - -M5_FIELDS = ( - 'qName', 'qLength', 'qStart', 'qEnd', 'qStrand', - 'tName', 'tLength', 'tStart', 'tEnd', 'tStrand', - 'score', 'numMatch', 'numMismatch', 'numIns', 'numDel', - 'mapQV', 'qAlignedSeq', 'matchPattern', 'tAlignedSeq') -SAM_FIELDS = ( - 'qName', 'flag', 'rName', 'pos', 'mapq', - 'cigar', 'rNext', 'pNext', 'tLen', 'seq', 'qual') -CIGAR_PAT = re.compile('(\d+)([MIDNSHP=X])') +############################################### +########## Read Segmentation Scoring ########## +############################################### -############################################ -########## Debug output functions ########## -############################################ +def get_read_seg_score(norm_signal, segs, r_ref_means, r_ref_sds): + return np.mean([ + np.abs((b_m - b_ref_m) / b_ref_s) + for b_m, b_ref_m, b_ref_s in + zip(c_new_means(norm_signal, segs), r_ref_means, r_ref_sds)]) + + +################################## +########## Debug Output ########## +################################## def _write_middle_debug(z_scores, fwd_pass, band_event_starts, debug_fp, reg_id, debug_num_seq=_NUM_DEBUG_ENDS, @@ -75,12 +78,12 @@ def _write_middle_debug(z_scores, fwd_pass, band_event_starts, debug_end_start = len(band_event_starts) - debug_num_seq debug_fp.write('\n'.join( '\t'.join(map(str, (band_pos + band_event_starts[seq_pos], seq_pos, - score, str(reg_id) + 'z_begin'))) + score, unicode(reg_id) + 'z_begin'))) for seq_pos, s_data in enumerate(z_scores[:debug_num_seq]) for band_pos, score in enumerate(s_data)) + '\n') debug_fp.write('\n'.join( '\t'.join(map(str, (band_pos + band_event_starts[seq_pos], seq_pos, - score, str(reg_id) + 'fwd_begin'))) + score, unicode(reg_id) + 'fwd_begin'))) for seq_pos, s_data in 
enumerate(fwd_pass[:debug_num_seq]) for band_pos, score in enumerate(s_data)) + '\n') if short: return @@ -88,13 +91,13 @@ def _write_middle_debug(z_scores, fwd_pass, band_event_starts, debug_fp.write('\n'.join( '\t'.join(map(str, ( band_pos + band_event_starts[debug_end_start + seq_pos], seq_pos, - score, str(reg_id) + 'z_end'))) + score, unicode(reg_id) + 'z_end'))) for seq_pos, s_data in enumerate(z_scores[-debug_num_seq:]) for band_pos, score in enumerate(s_data)) + '\n') debug_fp.write('\n'.join( '\t'.join(map(str, ( band_pos + band_event_starts[debug_end_start + seq_pos], seq_pos, - score, str(reg_id) + 'fwd_end'))) + score, unicode(reg_id) + 'fwd_end'))) for seq_pos, s_data in enumerate(fwd_pass[-debug_num_seq:]) for band_pos, score in enumerate(s_data)) + '\n') @@ -107,20 +110,20 @@ def _write_full_debug(fwd_pass_move, band_event_starts, top_max_pos, band_poss = [] event_scores = [] for seq_pos, event_pos in enumerate(read_tb[1:]): - seq_e_poss = range(prev_event_pos, event_pos) seq_band_poss = [e_pos - band_event_starts[seq_pos] - for e_pos in seq_e_poss] + for e_pos in range(prev_event_pos, event_pos)] band_poss.extend(seq_band_poss) - event_scores.extend([z_scores[seq_pos][b_pos] for b_pos in seq_band_poss]) + event_scores.extend([z_scores[seq_pos][b_pos] + for b_pos in seq_band_poss]) prev_event_pos = event_pos debug_fp.write('\n'.join( - '\t'.join(map(str, (e_pos, b_pos, e_score, str(reg_id)))) + '\t'.join(map(str, (e_pos, b_pos, e_score, unicode(reg_id)))) for e_pos, (b_pos, e_score) in enumerate(zip( band_poss, event_scores))) + '\n') - fail_str = (('Failed ' if final_score < 0 else 'Pass ') + str(final_score) + - ' ' + str(float(final_score) / len(read_tb))) - failed_fp.write(fail_str + '\t' + str(reg_id) + '\n') + fail_str = (('Failed ' if final_score < 0 else 'Pass ') + + unicode(final_score) + ' ' + unicode(final_score / len(read_tb))) + failed_fp.write(fail_str + '\t' + unicode(reg_id) + '\n') return @@ -129,313 +132,223 @@ def _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, read_tb = c_banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) debug_fp.write('\n'.join( '\t'.join(map(str, (event_pos, seq_pos, - str(reg_id) + 'fwd_begin'))) + unicode(reg_id) + 'fwd_begin'))) for seq_pos, event_pos in enumerate(read_tb[:debug_num_seq])) + '\n') debug_fp.write('\n'.join( '\t'.join(map(str, (event_pos, seq_pos, - str(reg_id) + 'fwd_end'))) + unicode(reg_id) + 'fwd_end'))) for seq_pos, event_pos in enumerate(read_tb[-debug_num_seq:])) + '\n') return +def _write_fit_debug( + norm_signal, segs, r_ref_means, r_ref_sds, genome_seq): + norm_means = c_new_means(norm_signal, segs) + with io.open('eventless_testing.model.txt', 'wt') as fp: + fp.write('Position\tMean\tSD\n' + '\n'.join( + '\t'.join(map(str, (pos, p_mean, p_std))) + for pos, (p_mean, p_std) in enumerate(zip( + r_ref_means, r_ref_sds))) + '\n') + with io.open('eventless_testing.seq.txt', 'wt') as fp: + fp.write('Base\tPosition\tSignalMean\n' + '\n'.join( + '\t'.join(map(str, (b, pos, p_mean))) for pos, (b, p_mean) in + enumerate(zip(genome_seq, norm_means))) + '\n') + Position, Signal = [], [] + for base_i, (b_start, b_end) in enumerate(zip(segs[:-1], segs[1:])): + Position.extend( + base_i + np.linspace(0, 1, b_end - b_start, endpoint=False)) + Signal.extend(norm_signal[b_start:b_end]) + with io.open('eventless_testing.signal.txt', 'wt') as fp: + fp.write('Position\tSignal\n' + '\n'.join( + '\t'.join(map(str, (pos, sig))) + for pos, sig in zip(Position, Signal)) + '\n') + + return 
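# The mean half-normal segmentation score computed by get_read_seg_score
# above (and printed by _write_params_debug below) is just the mean
# absolute z-score of the observed per-base signal means against the
# expected reference model, so lower is a better fit. A toy recomputation
# with hypothetical numbers:
import numpy as np

def toy_seg_score(base_means, ref_means, ref_sds):
    # mean absolute z-score over all bases
    return np.mean(np.abs((base_means - ref_means) / ref_sds))

assert np.isclose(toy_seg_score(np.array([0.1, -0.4, 1.2]),
                                np.array([0.0, -0.5, 1.0]),
                                np.array([0.2, 0.2, 0.4])), 0.5)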
-################################################# -########## Raw Signal Re-squiggle Code ########## -################################################# +def _write_params_debug( + norm_signal, segs, r_ref_means, r_ref_sds, + running_stat_width, min_obs_per_base, mean_obs_per_event, + match_evalue, skip_pen, bandwidth, fast5_fn): + mean_half_z_score = get_read_seg_score( + norm_signal, segs, r_ref_means, r_ref_sds) + sys.stdout.write( + '\t'.join(map(str, ( + running_stat_width, min_obs_per_base, mean_obs_per_event, + match_evalue, skip_pen, bandwidth, fast5_fn, + mean_half_z_score))) + '\n') + + return + +def _open_debug_fps(): + score_fp = io.open('debug_event_align.txt', 'wt') + score_fp.write('EventPos\tSeqPos\tScore\tRegion\n') + tb_fp = io.open('debug_event_align.traceback.txt', 'wt') + tb_fp.write('EventPos\tSeqPos\tRegion\n') + full_fit_fp = io.open('debug_event_align.full_fit.txt', 'wt') + full_fit_fp.write('EventPos\tBandPos\tEventScore\tRegion\n') + full_failed_fp = io.open('debug_event_align.full_failed.txt', 'wt') + full_failed_fp.write('DidFail\tRegion\n') + debug_fps = [score_fp, tb_fp, full_fit_fp, full_failed_fp] + + return debug_fps + + +############################################ +########## Raw Signal Re-squiggle ########## +############################################ def get_model_fit_segs( - segs, norm_signal, r_ref_means, r_ref_sds, running_diff_width, - del_fix_window, max_new_cpts, extra_sig_factor=1.1): + segs, norm_signal, r_ref_means, r_ref_sds, min_obs_per_base, + max_raw_cpts=None, del_fix_window=DEL_FIX_WINDOW, + max_del_fix_window=MAX_DEL_FIX_WINDOW, + extra_sig_factor=EXTRA_SIG_FACTOR): """ Find new segments at skipped bases during dynamic programming segmentation. - :param all_del_ranges: List of start stop tuple ranges of deletion locations :param segs: current Read segment locations :param norm_signal: Normalized read siganl :param r_ref_means: Read refererence means from genomic sequence :param r_ref_sds: Read refererence standard deviations from genomic sequence - :param running_diff_width: Width of moving neighboring windows over which to compute segment locations - :param del_fix_window: amount to extend skipped base windows - :param max_new_cpts: Maximum new changepoints to find - :param extra_sig_factor: Amount of extra signal to require in order to perform signal space re-squiggle + :param min_obs_per_base: Minimum raw observations to assign to each base + :param max_raw_cpts: Maximum new changepoints to find from raw signal + :param del_fix_window: initial bases to extend skipped base windows + :param max_del_fix_window: max bases to extend skipped base windows + :param extra_sig_factor: Amount of extra signal to require in order to + perform signal space re-squiggle :returns: New segments with skipped bases resolved """ - def get_deletion_ranges(): - all_del_ranges = [] - for del_pos in np.where(np.diff(segs) == 0)[0]: - if (all_del_ranges and - del_pos <= all_del_ranges[-1][1] + del_fix_window + 2): - all_del_ranges[-1] = (all_del_ranges[-1][0], - del_pos + del_fix_window + 1) + def merge_del_windows(all_del_windows): + merged_del_windows = [] + for start, end in all_del_windows: + if (len(merged_del_windows) > 0 and + start < merged_del_windows[-1][1]): + merged_del_windows[-1] = (merged_del_windows[-1][0], end) else: - all_del_ranges.append((del_pos - del_fix_window, - del_pos + del_fix_window + 1)) - if len(all_del_ranges) == 0: - return + merged_del_windows.append((start, end)) + return merged_del_windows + + def window_too_small(start, 
end): + n_events = end - start + sig_start, sig_end = segs[start], segs[end] + sig_len = sig_end - sig_start + # windows are expanded by one base and the extra signal factor + # to allow some room to search for best path + return sig_len <= ((n_events + 1) * min_obs_per_base) * extra_sig_factor + + def expand_small_windows(all_del_windows): + expanded_del_windows = [] + windows_expanded = False + for start, end in all_del_windows: + if window_too_small(start, end): + windows_expanded = True + start -= 1 + end += 1 + expanded_del_windows.append((start, end)) + + return expanded_del_windows, windows_expanded + + def trim_del_window_ends(all_del_windows): # potentially trim first and last windows - if all_del_ranges[0][0] < 0: - all_del_ranges[0] = (0, all_del_ranges[0][1]) - if all_del_ranges[-1][1] > len(segs) - 1: - all_del_ranges[-1] = (all_del_ranges[-1][0], len(segs) - 1) + if all_del_windows[0][0] < 0: + all_del_windows[0] = (0, all_del_windows[0][1]) + if all_del_windows[-1][1] > len(segs) - 1: + all_del_windows[-1] = (all_del_windows[-1][0], len(segs) - 1) - if max_new_cpts is not None and max([ - end - start for start, end in all_del_ranges]) > max_new_cpts: - raise NotImplementedError, ( - 'Too many changepoints required for re-squiggle algorithm') + return all_del_windows - return all_del_ranges + def get_deletion_windows(): + # get initial windows around deletions/skipped bases + all_del_windows = [] + for del_pos in np.where(np.diff(segs) == 0)[0]: + if (len(all_del_windows) > 0 and + del_pos < all_del_windows[-1][1] + del_fix_window): + all_del_windows[-1] = (all_del_windows[-1][0], + del_pos + del_fix_window + 1) + else: + all_del_windows.append((del_pos - del_fix_window, + del_pos + del_fix_window + 1)) + if len(all_del_windows) == 0: + return - all_del_ranges = get_deletion_ranges() - if all_del_ranges is None: + windows_expanded = False + all_del_windows = merge_del_windows(all_del_windows) + all_del_windows = trim_del_window_ends(all_del_windows) + # expand small windows until there are no more or the max del window + # expansions have been attempted. 
+ for _ in range(max_del_fix_window - del_fix_window): + all_del_windows, windows_expanded = expand_small_windows( + all_del_windows) + if not windows_expanded: break + all_del_windows = merge_del_windows(all_del_windows) + all_del_windows = trim_del_window_ends(all_del_windows) + + if windows_expanded and any( + window_too_small(start, end) for start, end in all_del_windows): + raise NotImplementedError( + 'Not enough raw signal around potential genomic deletion(s)') + + if max_raw_cpts is not None and max([ + end - start for start, end in all_del_windows]) > max_raw_cpts: + raise NotImplementedError( + 'Read contains too many potential genomic deletions') + + return all_del_windows + + + all_del_windows = get_deletion_windows() + if all_del_windows is None: return segs - for start, end in all_del_ranges: + for start, end in all_del_windows: n_events = end - start sig_start, sig_end = segs[start], segs[end] sig_len = sig_end - sig_start - if sig_len <= (n_events * running_diff_width) * extra_sig_factor: - raise NotImplementedError, ( - 'Too little signal around event-aligned genomic deletion') - # since there are no read starts to start from - pseudo_starts = np.linspace(0, sig_len, n_events + 1, dtype=np.int32) # find signal space z-scores mapping without real banding by allowing - # entire window to be searched + # entire window to be searched (c_reg_z_scores will clip base search + # windows to enforce min_obs_per_base) + pseudo_starts = np.linspace(0, sig_len, n_events + 1, dtype=np.int64) reg_z_scores = c_reg_z_scores( norm_signal[sig_start:sig_end], r_ref_means[start:end], r_ref_sds[start:end], pseudo_starts, - 0, n_events, n_events, running_diff_width) - reg_fwd_scores = forward_pass(reg_z_scores, running_diff_width) + 0, n_events, n_events, min_obs_per_base) + reg_fwd_scores = forward_pass(reg_z_scores, min_obs_per_base) # perform signal based scoring segmentation # - it is ~60X faster than base space - reg_segs = traceback(reg_fwd_scores, running_diff_width) + sig_start + reg_segs = traceback(reg_fwd_scores, min_obs_per_base) + sig_start + assert reg_segs.shape[0] == end - start - 1 segs[start+1:end] = reg_segs if np.diff(segs).min() < 1: - raise NotImplementedError, ( - 'New segments include zero length events') + raise NotImplementedError('New segments include zero length events') if segs[0] < 0: - raise NotImplementedError, ( - 'New segments start with negative index') + raise NotImplementedError('New segments start with negative index') if segs[-1] > norm_signal.shape[0]: - raise NotImplementedError, ( - 'New segments end past raw signal values') + raise NotImplementedError('New segments end past raw signal values') return segs - -########################################################## -########## Standard banding dynamic programming ########## -########################################################## - -def _get_masked_event_mapping( - event_means, r_ref_means, r_ref_sds, - mapped_start_offset, mapped_end_offset, skip_pen, stay_pen, z_shift, - bandwidth, band_boundary_thresh=5, score_thresh=0.0, - mask_fill_z_score=-10, mask_bases=50, end_event_gap=0, - reg_id=None, debug_fps=None): - """ - Perform banded dynamic programming sequence to event alignment forcing - the path to start and end at the previously discovered locations. - This is performed by masking the z-scores outside a "cone" extended - mask_bases from the beginning and end of the middle of the read. 
- """ - half_bandwidth = bandwidth / 2 - seq_len = r_ref_means.shape[0] - events_len = event_means.shape[0] - - # check if the mapped start and end positions are too close to the end of - # the events array and extend the bandwidth window if so - band_events_start_pos = ( - 0 if half_bandwidth <= mapped_start_offset else - mapped_start_offset - half_bandwidth) - band_events_end_pos = ( - events_len - bandwidth if half_bandwidth <= mapped_end_offset - else events_len - half_bandwidth - mapped_end_offset) - band_event_starts = np.linspace( - band_events_start_pos, band_events_end_pos, seq_len).astype(np.int32) - - # figure out how many bases on each end contain masking to only compute - # masked z-scores here - start_mask_seq_len = max( - mask_bases, next(i for i, bes in enumerate(band_event_starts) - if bes >= mapped_start_offset)) - end_mask_seq_len = max( - mask_bases, next( - i for i, bes in enumerate(band_event_starts[::-1]) - if bes + bandwidth <= events_len - mapped_end_offset)) - assert start_mask_seq_len + end_mask_seq_len < seq_len, ( - 'Invalid masking encountered in dynamic sequence to events mapping') - - # get masked z-scores at the beginning of the read - mask_start_pos = np.linspace( - mapped_start_offset + 1 + end_event_gap, - band_event_starts[mask_bases - 1] + bandwidth, - mask_bases).astype(np.int32) - def get_start_mask_z_score(seq_pos, event_pos): - start_mask_len = max(mapped_start_offset - event_pos, 0) - end_mask_len = (0 if seq_pos >= mask_bases else - bandwidth - (mask_start_pos[seq_pos] - event_pos)) - event_vals = event_means[event_pos + start_mask_len: - event_pos + bandwidth - end_mask_len] - b_z_scores = c_base_z_scores( - event_vals, r_ref_means[seq_pos], r_ref_sds[seq_pos]) - masked_z_scores = np.concatenate([ - [mask_fill_z_score] * start_mask_len, b_z_scores, - [mask_fill_z_score] * end_mask_len]) - return masked_z_scores - start_scores = [get_start_mask_z_score(seq_pos, event_pos) - for seq_pos, event_pos in enumerate( - band_event_starts[:start_mask_seq_len])] - - # now the same for the end masked positions - mask_end_pos = np.linspace( - events_len - mapped_end_offset - end_event_gap - 1, - band_event_starts[-mask_bases], - mask_bases).astype(np.int32) - def get_end_mask_z_score(seq_pos, event_pos): - start_mask_len = ( - 0 if seq_len - seq_pos - 1 >= mask_bases else - mask_end_pos[seq_len - seq_pos - 1] - event_pos) - end_mask_len = max( - event_pos + bandwidth - events_len + mapped_end_offset - 1, 0) - event_vals = event_means[event_pos + start_mask_len: - event_pos + bandwidth - end_mask_len] - b_z_scores = c_base_z_scores( - event_vals, r_ref_means[seq_pos], r_ref_sds[seq_pos]) - masked_z_scores = np.concatenate([ - [mask_fill_z_score] * start_mask_len, b_z_scores, - [mask_fill_z_score] * end_mask_len]) - return masked_z_scores - end_scores = [get_end_mask_z_score( - seq_pos + seq_len - end_mask_seq_len, event_pos) - for seq_pos, event_pos in enumerate( - band_event_starts[-end_mask_seq_len:])] - - # compute middle z_scores, combine and shift - unmasked_z_scores = [ - c_base_z_scores(event_means[event_pos:event_pos + bandwidth], - r_ref_means[seq_pos + start_mask_seq_len], - r_ref_sds[seq_pos + start_mask_seq_len]) - for seq_pos, event_pos in enumerate(band_event_starts[ - start_mask_seq_len:-end_mask_seq_len])] - shifted_z_scores = z_shift + np.row_stack( - start_scores + unmasked_z_scores + end_scores) - fwd_pass, fwd_pass_move = c_banded_forward_pass( - shifted_z_scores, band_event_starts, skip_pen, stay_pen) - - # perform traceback - top_max_pos 
= np.argmax(fwd_pass[-1,:]) - - if _DEBUG_FULL: - _write_full_debug(fwd_pass_move, band_event_starts, top_max_pos, - shifted_z_scores, debug_fps[2], debug_fps[3], reg_id, - fwd_pass[-1,top_max_pos]) - if _DEBUG_MIDDLE: - _write_middle_debug(shifted_z_scores, fwd_pass, band_event_starts, - debug_fps[0], reg_id) - _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, - debug_fps[1], reg_id) - - if fwd_pass[-1,top_max_pos] < score_thresh: - raise NotImplementedError, ( - 'No valid path found through raw signal of long read') - - read_tb = c_banded_traceback(fwd_pass_move, band_event_starts, top_max_pos, - band_boundary_thresh) - start_trim_i = 0 - while read_tb[start_trim_i] < 0: - read_tb[start_trim_i] = 0 - start_trim_i += 1 - end_trim_i = 1 - while read_tb[-end_trim_i] > events_len: - read_tb[-end_trim_i] = events_len - end_trim_i += 1 - - # TODO: barrow code from debug_full to add strict filter for reads - # with significant portion (500 events) not matching model well - - return read_tb - -def _get_mapping_ends( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - seq_window, event_window, score_thresh, reg_id=None, debug_fps=None): - if event_means.shape[0] < event_window: - raise NotImplementedError, ( - 'Read too short for eventless start/end discovery') - if r_ref_means.shape[0] < seq_window: - raise NotImplementedError, ( - 'Genomic mapping too short for eventless start/end discovery') - - # banded z-scores (moving up one event per base for start/end discovery - start_z_scores = z_shift - np.row_stack([ - np.abs(event_means[seq_pos:seq_pos + event_window] - - r_ref_means[seq_pos]) / r_ref_sds[seq_pos] - for seq_pos in range(seq_window)]) - start_band_event_starts = np.arange(seq_window, dtype=np.int32) - - start_fwd_pass, start_fwd_pass_move = c_banded_forward_pass( - start_z_scores, start_band_event_starts, skip_pen, stay_pen) - - #print '\n'.join( - # '\t'.join(map(str, (pos, np.max(fwd_pass[pos-1,:]), 'start', reg_id))) - # for pos in [100,200,300,400]) - - # find max along the top and right edges to start traceback - top_max_pos = np.argmax(start_fwd_pass[-1,:]) - if start_fwd_pass[-1,top_max_pos] < score_thresh: - raise NotImplementedError, ( - 'No valid path found through start of raw signal') - - # perform traceback - start_tb = c_banded_traceback( - start_fwd_pass_move, start_band_event_starts, top_max_pos) - - - # Now identify the end of the read - n_events = event_means.shape[0] - n_bases = r_ref_means.shape[0] - end_band_event_starts = np.arange(n_events - event_window - seq_window, - n_events - event_window, dtype=np.int32) - end_z_scores = z_shift - np.row_stack([ - np.abs(event_means[end_band_event_starts[seq_pos]: - end_band_event_starts[seq_pos] + event_window] - - r_ref_means[n_bases - seq_window + seq_pos]) / r_ref_sds[ - n_bases - seq_window + seq_pos] - for seq_pos in range(seq_window)]) - - end_fwd_pass, end_fwd_pass_move = c_banded_forward_pass( - end_z_scores, end_band_event_starts, skip_pen, stay_pen) - #print '\n'.join( - # '\t'.join(map(str, (pos, np.max(fwd_pass[pos-1,:]), 'end', reg_id))) - # for pos in [100,200,300,400]) - - # find max along the top and right edges to start traceback - top_max_pos = np.argmax(end_fwd_pass[-1,:]) - if end_fwd_pass[-1,top_max_pos] < score_thresh: - raise NotImplementedError, 'No valid path found through end of raw signal' - - # perform traceback - end_tb = c_banded_traceback( - end_fwd_pass_move, end_band_event_starts, top_max_pos) - - return start_tb[0], end_tb[-1] 
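# Both the code removed above and its adaptive replacement below clip the
# events around a discovered read start so the start sits mid-band whenever
# the bandwidth allows. A minimal sketch of that clipping arithmetic
# (function name and values hypothetical):
def toy_clip_events(mapped_start, bandwidth):
    half_bw = bandwidth // 2
    if mapped_start < half_bw:
        # too close to the signal start: anchor the band at event 0
        return 0, mapped_start  # (events_start_clip, mapped_start_offset)
    return mapped_start - half_bw, half_bw

assert toy_clip_events(10, 500) == (0, 10)
assert toy_clip_events(900, 500) == (650, 250)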
+##################################################### +########## Static Band Dynamic Programming ########## +##################################################### def get_short_read_event_mapping( event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - score_thresh, reg_id=None, debug_fps=None): + reg_id=None, debug_fps=None): """ - Perform banded dynamic programming sequence to event alignment without masking + Perform banded dynamic programming sequence to event alignment + without masking :param event_means: Numpy array with read base means :param r_ref_means: Numpy array with read reference means :param r_ref_sds: Numpy array with read reference standard deviations :param skip_pen: Penalty applied to skipped genomic bases - :param stay_pen: Penalty applied to stay states (should shift to 0 expected value) - :param z_shift: Shift z-scores by this amount (includes matching positive expected value) - :param score_thresh: Threshold for a read to pass + :param stay_pen: Penalty applied to stay states (should shift to 0 + expected value) + :param z_shift: Shift z-scores by this amount (includes matching + positive expected value) :returns: Event to sequence mapping for full length of short read """ @@ -444,10 +357,10 @@ def get_short_read_event_mapping( # create read starts in order to clip just the corners of the full events to # seqeunce matrix - mask_len = min(seq_len, events_len) / 4 + mask_len = min(seq_len, events_len) // 4 band_event_starts = np.concatenate([ np.zeros(seq_len - mask_len * 2), - np.linspace(0, mask_len, mask_len * 2)]).astype(np.int32) + np.linspace(0, mask_len, mask_len * 2)]).astype(np.int64) bandwidth = events_len - mask_len shifted_z_scores = z_shift - np.row_stack([ @@ -469,101 +382,20 @@ def get_short_read_event_mapping( _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, debug_fps[1], reg_id) - if fwd_pass[-1,top_max_pos] < score_thresh: - raise NotImplementedError, ( - 'No valid path found through raw signal of short read') read_tb = c_banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) return read_tb -def _find_base_assignment( - norm_signal, min_base_obs, num_events, tb_model, - genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, - ends_bandwidth=2000, ends_seq_window=300, ends_thresh=0.0, - reg_id=None, debug_fps=None): - # get events before clipping - valid_cpts = th.get_valid_cpts(norm_signal, min_base_obs, num_events) - #valid_cpts = c_valid_cpts_w_cap_t_test( - # norm_signal, min_base_obs, num_events) - valid_cpts.sort() - event_means = c_new_means(norm_signal, valid_cpts) - kmer_ref, kmer_width, upstrm_bases, dnstrm_bases = tb_model - r_ref_means, r_ref_sds = map(np.array, zip(*[ - kmer_ref[kmer] for kmer in [''.join(bs) for bs in zip(*[ - genome_seq[i:] for i in range(kmer_width)])]])) - # trim genome seq to match model-able positions - genome_seq = genome_seq[upstrm_bases:-dnstrm_bases] - if genome_loc.Strand == '+': - genome_loc = th.genomeLoc( - genome_loc.Start + upstrm_bases, '+', genome_loc.Chrom) - else: - genome_loc = th.genomeLoc( - genome_loc.Start + dnstrm_bases, '-', genome_loc.Chrom) - - # for short reads, just search the whole read with a larger bandwidth - if (event_means.shape[0] < (ends_bandwidth + ends_seq_window) * 2 or - r_ref_means.shape[0] < ends_seq_window * 2): - seq_events = get_short_read_event_mapping( - event_means, r_ref_means, r_ref_sds, - skip_pen, stay_pen, z_shift, sr_bandwidth, ends_thresh, - reg_id=reg_id, debug_fps=debug_fps) - seq_segs = valid_cpts[seq_events] - 
read_start_rel_to_raw = seq_segs[0] - seq_segs = seq_segs - read_start_rel_to_raw - return (seq_segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, - genome_seq, genome_loc) - - # identify the start and end of the read within the signal using a larger - # bandwidth - mapped_start, mapped_end = _get_mapping_ends( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - ends_seq_window, ends_bandwidth, ends_thresh, - reg_id=reg_id, debug_fps=debug_fps) - - # get number of events to clip and how far into the events the - # discovered start is located - hald_bandwidth = bandwidth / 2 - if mapped_start < hald_bandwidth: - events_start_clip = 0 - mapped_start_offset = mapped_start - else: - events_start_clip = mapped_start - hald_bandwidth - mapped_start_offset = hald_bandwidth - - if mapped_end + hald_bandwidth > event_means.shape[0]: - events_end_clip = event_means.shape[0] - mapped_end_offset = event_means.shape[0] - mapped_end - else: - events_end_clip = mapped_end + hald_bandwidth - mapped_end_offset = hald_bandwidth - - # now find full sequence to events path using a smaller bandwidth - event_means = event_means[events_start_clip:events_end_clip] - valid_cpts = valid_cpts[events_start_clip:events_end_clip + 1] - read_tb = _get_masked_event_mapping( - event_means, r_ref_means, r_ref_sds, - mapped_start_offset, mapped_end_offset, - skip_pen, stay_pen, z_shift, bandwidth, - reg_id=reg_id, debug_fps=debug_fps) - seq_segs = valid_cpts[read_tb] - read_start_rel_to_raw = seq_segs[0] - seq_segs = seq_segs - read_start_rel_to_raw - - return (seq_segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, - genome_seq, genome_loc) - - - -########################################################## -########## Adaptive banding dynamic programming ########## -########################################################## +####################################################### +########## Adaptive Band Dynamic Programming ########## +####################################################### def get_masked_start_fwd_pass( event_means, r_ref_means, r_ref_sds, mapped_start_offset, skip_pen, stay_pen, z_shift, bandwidth, events_per_base, - mask_fill_z_score=-10, mask_bases=50, end_event_gap=0, - reg_id=None, debug_fps=None): + mask_fill_z_score=MASK_FILL_Z_SCORE, + mask_bases=MASK_BASES, reg_id=None, debug_fps=None): """ Perform banded dynamic programming sequence to event alignment forcing the path to start and end at the previously discovered locations. 
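# A sketch of the z-score masking described in the docstring above: band
# positions outside the allowed start/end "cone" are filled with a large
# negative constant (MASK_FILL_Z_SCORE in this patch) so the optimal path
# cannot pass through them. Names and values here are hypothetical.
import numpy as np

def toy_mask_band(b_z_scores, start_mask_len, end_mask_len, fill=-10.0):
    # flanking fill values dominate any real z-score in the band
    return np.concatenate([np.full(start_mask_len, fill),
                           b_z_scores,
                           np.full(end_mask_len, fill)])

print(toy_mask_band(np.array([0.2, 0.5, 0.1]), 2, 1))
# -> roughly [-10., -10., 0.2, 0.5, 0.1, -10.]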
@@ -573,17 +405,22 @@ def get_masked_start_fwd_pass(
     :param event_means: Numpy array with read base means
     :param r_ref_means: Numpy array with read reference means
     :param r_ref_sds: Numpy array with read reference standard deviations
-    :param mapped_start_offset: Previously identified start of genomic sequence within events
+    :param mapped_start_offset: Previously identified start of genomic
+        sequence within events
     :param skip_pen: Penalty applied to skipped genomic bases
-    :param stay_pen: Penalty applied to stay states (should shift to 0 expected value)
-    :param z_shift: Shift z-scores by this amount (includes matching positive expected value)
-    :param bandwidth: Bandwidth over which to search for sequence to event mapping
+    :param stay_pen: Penalty applied to stay states (should shift to 0
+        expected value)
+    :param z_shift: Shift z-scores by this amount (includes matching positive
+        expected value)
+    :param bandwidth: Bandwidth over which to search for sequence to
+        event mapping
     :param events_per_base: Average events per base for the start mapping
 
-    :returns: Event to sequence mapping for start of read including forward pass scores, forward pass move
+    :returns: Event to sequence mapping for start of read including forward
+        pass scores, forward pass move
         values, band starts within the events vector and z-scores
     """
-    half_bandwidth = bandwidth / 2
+    half_bandwidth = bandwidth // 2
 
     # check if the mapped start position is too close to the end of
     # the events array and extend the bandwidth window if so
@@ -596,7 +433,7 @@
     band_event_starts = np.linspace(
         band_events_start_pos,
         band_events_start_pos + (tmp_seq_len * events_per_base),
-        tmp_seq_len).astype(np.int32)
+        tmp_seq_len).astype(np.int64)
     mask_seq_len = max(
         mask_bases, next(i + 2 for i, bes in enumerate(band_event_starts)
                          if bes >= mapped_start_offset))
@@ -604,9 +441,9 @@
 
     # get masked z-scores at the beginning of the read
     mask_start_pos = np.linspace(
-        mapped_start_offset + 1 + end_event_gap,
+        mapped_start_offset + 1,
         band_event_starts[mask_bases - 1] + bandwidth,
-        mask_bases).astype(np.int32)
+        mask_bases).astype(np.int64)
     def get_start_mask_z_score(seq_pos, event_pos):
         start_mask_len = max(mapped_start_offset - event_pos, 0)
         end_mask_len = (0 if seq_pos >= mask_bases else
@@ -618,6 +455,7 @@ def get_start_mask_z_score(seq_pos, event_pos):
         masked_z_scores = np.concatenate([
             [mask_fill_z_score] * start_mask_len, b_z_scores,
             [mask_fill_z_score] * end_mask_len])
+        del b_z_scores
         return masked_z_scores
     shifted_z_scores = z_shift + np.row_stack([
         get_start_mask_z_score(seq_pos, event_pos)
@@ -629,28 +467,38 @@ def get_start_mask_z_score(seq_pos, event_pos):
 
 def get_mapping_start(
         event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift,
-        seq_window, bandwidth, score_thresh, reg_id=None, debug_fps=None):
+        seq_window, bandwidth, norm_signal, valid_cpts, score_thresh,
+        min_obs_per_base, reg_id=None, debug_fps=None):
     """
     Perform banded dynamic programming sequence to event alignment through
-    The beginning of an read to identify the start of genome sequence to event matching
+    the beginning of a read to identify the start of genome sequence to
+    event matching
 
     :param event_means: Numpy array with read base means
     :param r_ref_means: Numpy array with read reference means
     :param r_ref_sds: Numpy array with read reference standard deviations
     :param skip_pen: Penalty applied to skipped genomic bases
-    :param stay_pen: Penalty applied to stay states (should shift 
to 0 expected value)
-    :param z_shift: Shift z-scores by this amount (includes matching positive expected value)
-    :param seq_window: Number of genomic bases to search over for the start of the read
-    :param bandwidth: Bandwidth over which to search for sequence to event mapping
-    :param score_thresh: Threshold for a read to pass
-
-    :returns: Start of seqeunce to event alignment and the mean events_per_base through the queried portion of a read
+    :param stay_pen: Penalty applied to stay states (should shift to 0
+        expected value)
+    :param z_shift: Shift z-scores by this amount (includes matching positive
+        expected value)
+    :param seq_window: Number of genomic bases to search over for the start of
+        the read
+    :param bandwidth: Bandwidth over which to search for sequence to
+        event mapping
+    :param norm_signal: Normalized raw signal vector
+    :param valid_cpts: Segmentation positions within norm_signal
+    :param score_thresh: Read mean half-normal signal segmentation score
+        threshold
+
+    :returns: Start position (0-based) of sequence to event alignment within
+        events and the mean events_per_base through the queried portion of
+        the read
     """
     if event_means.shape[0] < bandwidth:
-        raise NotImplementedError, (
+        raise NotImplementedError(
             'Read too short for eventless start/end discovery')
     if r_ref_means.shape[0] < seq_window:
-        raise NotImplementedError, (
+        raise NotImplementedError(
             'Genomic mapping too short for eventless start/end discovery')
 
     # banded z-scores (moving up one event per base for start/end discovery
@@ -659,86 +507,106 @@ def get_mapping_start(
             r_ref_means[seq_pos]) / r_ref_sds[seq_pos]
         for seq_pos in range(seq_window)])
     start_band_event_starts = np.linspace(
-        0, seq_window, seq_window).astype(np.int64)
-    np.arange(seq_window, dtype=np.int32)
+    np.arange(seq_window, dtype=np.int64)
 
     start_fwd_pass, start_fwd_pass_move = c_banded_forward_pass(
         start_z_scores, start_band_event_starts, skip_pen, stay_pen)
 
     # find max along the top and right edges to start traceback
     top_max_pos = np.argmax(start_fwd_pass[-1,:])
-    if start_fwd_pass[-1,top_max_pos] < score_thresh:
-        # TODO: Add iterative start search with set number of iterations
-        # for reads that start further into the read. 
 def find_adaptive_base_assignment(
-        norm_signal, min_base_obs, num_events, tb_model,
+        norm_signal, running_stat_width, min_obs_per_base, num_events, std_ref,
         genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, is_rna,
-        start_bandwidth=2000, start_seq_window=500, start_thresh=0.0,
-        band_boundary_thresh=5, reg_id=None, debug_fps=None):
+        score_thresh, start_bandwidth=START_BANDWIDTH,
+        start_seq_window=START_SEQ_WINDOW,
+        band_boundary_thresh=BAND_BOUNDARY_THRESH, reg_id=None, debug_fps=None):
     """
-    Perform banded dynamic programming sequence to event alignment by first identifying the start of the
-    sequence to event matching and then performing banded matching through the whole read
+    Perform banded dynamic programming sequence to event alignment by first
+    identifying the start of the sequence to event matching and then
+    performing banded matching through the whole read

     :param norm_signal: Numpy array with normalized read signal
-    :param min_base_obs: Minimum number of raw observations per base
+    :param running_stat_width: Width of neighboring windows over which to
+        compute changepoint stats
+    :param min_obs_per_base: Minimum number of raw observations per base
     :param num_events: Number of events to identify in this read
-    :param tb_model: A Tombo model
+    :param std_ref: A TomboModel object
     :param genome_seq: Genomic sequence for this read
     :param genome_loc: Mapped genomic location for this read
     :param skip_pen: Penalty applied to skipped genomic bases
-    :param stay_pen: Penalty applied to stay states (should shift to 0 expected value)
-    :param z_shift: Shift z-scores by this amount (includes matching positive expected value)
-    :param bandwidth: Bandwidth over which to search for sequence to event mapping
+    :param stay_pen: Penalty applied to stay states (should shift to 0
+        expected value)
+    :param z_shift: Shift z-scores by this amount (includes matching positive
+        expected value)
+    :param bandwidth: Bandwidth over which to search for sequence to
+        event mapping
     :param is_rna: Is this an RNA read
+    :param score_thresh: Read mean half-normal segmentation score threshold

-    :returns: Start of seqeunce to event alignment and the mean events_per_base through the queried portion of a read
+    :returns: Start of sequence to event alignment and the mean
+        events_per_base through the queried portion of a read
     """
     # get events
-    # for RNA evenly smaller events could be detrimental to the fit
-    # so perform slower segmentation which does not 
allow small events if is_rna: + # RNA bases show consistent variable spread so use t-test segmentation valid_cpts = c_valid_cpts_w_cap_t_test( - norm_signal, min_base_obs, num_events) + norm_signal, min_obs_per_base, running_stat_width, num_events) else: - valid_cpts = th.get_valid_cpts(norm_signal, min_base_obs, num_events) + valid_cpts = c_valid_cpts_w_cap( + norm_signal, min_obs_per_base, running_stat_width, num_events) + #valid_cpts = th.get_valid_cpts( + # norm_signal, running_stat_width, num_events) valid_cpts.sort() event_means = c_new_means(norm_signal, valid_cpts) - kmer_ref, kmer_width, upstrm_bases, dnstrm_bases = tb_model - r_ref_means, r_ref_sds = map(np.array, zip(*[ - kmer_ref[kmer] for kmer in [''.join(bs) for bs in zip(*[ - genome_seq[i:] for i in range(kmer_width)])]])) + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(genome_seq, std_ref) # trim genome seq to match model-able positions - genome_seq = genome_seq[upstrm_bases:-dnstrm_bases] + genome_seq = genome_seq[std_ref.central_pos:-dnstrm_bases] seq_len = len(genome_seq) if genome_loc.Strand == '+': - genome_loc = th.genomeLoc( - genome_loc.Start + upstrm_bases, '+', genome_loc.Chrom) + genome_loc = genome_loc._replace( + Start=genome_loc.Start + std_ref.central_pos) else: - genome_loc = th.genomeLoc( - genome_loc.Start + dnstrm_bases, '-', genome_loc.Chrom) + genome_loc = genome_loc._replace(Start=genome_loc.Start + dnstrm_bases) # for short reads, just search the whole read with a larger bandwidth if (event_means.shape[0] < start_bandwidth + start_seq_window or seq_len < start_seq_window): seq_events = get_short_read_event_mapping( event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, - z_shift, start_thresh, reg_id=reg_id, debug_fps=debug_fps) + z_shift, reg_id=reg_id, debug_fps=debug_fps) seq_segs = valid_cpts[seq_events] read_start_rel_to_raw = seq_segs[0] seq_segs = seq_segs - read_start_rel_to_raw + return (seq_segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, genome_seq, genome_loc) @@ -746,12 +614,13 @@ def find_adaptive_base_assignment( # bandwidth mapped_start, events_per_base = get_mapping_start( event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - start_seq_window, start_bandwidth, start_thresh, - reg_id=reg_id, debug_fps=debug_fps) + start_seq_window, start_bandwidth, norm_signal, valid_cpts, + score_thresh, min_obs_per_base, reg_id=reg_id, + debug_fps=debug_fps) # get number of events to clip and how far into the events the # discovered start is located - half_bandwidth = bandwidth / 2 + half_bandwidth = bandwidth // 2 if mapped_start < half_bandwidth: events_start_clip = 0 mapped_start_offset = mapped_start @@ -768,30 +637,29 @@ def find_adaptive_base_assignment( mapped_start_offset, skip_pen, stay_pen, z_shift, bandwidth, events_per_base, reg_id=reg_id, debug_fps=debug_fps) start_seq_len = start_event_starts.shape[0] - fwd_pass = np.row_stack([ - start_fwd_pass, - np.empty((seq_len - start_seq_len, bandwidth))]) - fwd_pass_move = np.row_stack([ - start_fwd_pass_move, - np.empty((seq_len - start_seq_len, bandwidth), dtype=np.int32)]) - band_event_starts = np.concatenate( - [start_event_starts, - np.empty(seq_len - start_seq_len, dtype=np.int32)]) + fwd_pass = np.empty((seq_len+1, bandwidth), dtype=np.float64) + fwd_pass[:start_seq_len+1] = start_fwd_pass + fwd_pass_move = np.empty((seq_len+1, bandwidth), dtype=np.int64) + fwd_pass_move[:start_seq_len+1] = start_fwd_pass_move + band_event_starts = np.empty((seq_len,), 
dtype=np.int64) + band_event_starts[:start_seq_len] = start_event_starts #fwd_pass[start_seq_len+1:,:] = np.NAN #fwd_pass_move[start_seq_len+1:,:] = np.NAN #band_event_starts[start_seq_len:] = np.NAN if _DEBUG_FULL or _DEBUG_MIDDLE: - shifted_z_scores = c_adaptive_banded_forward_pass( + rest_z_scores = c_adaptive_banded_forward_pass( fwd_pass, fwd_pass_move, band_event_starts, event_means, r_ref_means, r_ref_sds, z_shift, skip_pen, stay_pen, - start_seq_len, True) - shifted_z_scores = np.row_stack([start_z_scores, shifted_z_scores]) + start_seq_len, MASK_FILL_Z_SCORE, True) + shifted_z_scores = np.empty((seq_len, bandwidth), dtype=np.float64) + shifted_z_scores[:start_seq_len] = start_z_scores + shifted_z_scores[start_seq_len:] = rest_z_scores else: c_adaptive_banded_forward_pass( fwd_pass, fwd_pass_move, band_event_starts, event_means, r_ref_means, r_ref_sds, z_shift, skip_pen, stay_pen, - start_seq_len) + start_seq_len, MASK_FILL_Z_SCORE) top_max_pos = np.argmax(fwd_pass[-1,:]) if _DEBUG_FULL: @@ -804,9 +672,6 @@ def find_adaptive_base_assignment( _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, debug_fps[1], reg_id) - if fwd_pass[-1,top_max_pos] < start_thresh: - raise NotImplementedError, ( - 'No valid path found through raw signal of long read') read_tb = c_banded_traceback( fwd_pass_move, band_event_starts, top_max_pos, band_boundary_thresh) @@ -828,69 +693,78 @@ def find_adaptive_base_assignment( genome_seq, genome_loc) - -################################################### -########## Resquiggle initial processing ########## -################################################### +###################################### +########## Re-squiggle Read ########## +###################################### def resquiggle_read( - fast5_fn, genome_seq, tb_model, outlier_thresh, genome_loc, read_info, - basecall_group, corrected_group, compute_sd, skip_pen, stay_pen, z_shift, - bandwidth, obs_filter, bio_samp_type, del_fix_window=5, - min_event_to_seq_ratio=1.1, max_new_cpts=None, in_place=True, - skip_index=False, reg_id=None, debug_fps=None, const_scale=None): + fast5_data, fast5_fn, genome_seq, genome_loc, align_info, std_ref, + outlier_thresh, bc_grp, corr_grp, bio_samp_type, compute_sd, + seg_params, sig_aln_params, obs_filter, max_raw_cpts=MAX_RAW_CPTS, + min_event_to_seq_ratio=MIN_EVENT_TO_SEQ_RATIO, + in_place=True, skip_index=False, reg_id=None, debug_fps=None, + const_scale=None): """ Perform banded dynamic programming sequence to event alignment for this read - :param fast5_fn: Filename for a read + :param fast5_data: Open h5py object containing read information + :param fast5_fn: Relative path to filename for index creation :param genome_seq: Genomic sequence for this read - :param tb_model: A Tombo model + :param genome_loc: Mapped genomic location named tuple for this read + :param align_info: A alignInfo named tuple for this read + :param std_ref: A TomboModel object :param outlier_thresh: Outlier threshold for raw signal normalization - :param genome_loc: Mapped genomic location for this read - :param read_info: A read info named tuple for this read - :param basecall_group: The basecalled read group to analyze - :param corrected_group: The tombo corrected group to write results + :param bc_grp: The basecalled read group to analyze + :param corr_grp: The tombo corrected group to write results + :param bio_samp_type: Biological sample type (either 'DNA' or 'RNA' or + None to determine from read) :param compute_sd: Should SD computations be computed and saved to file - 
:param skip_pen: Penalty applied to skipped genomic bases
-    :param stay_pen: Penalty applied to stay states (should shift to 0 expected value)
-    :param z_shift: Shift z-scores by this amount (includes matching positive expected value)
-    :param bandwidth: Bandwidth over which to search for sequence to event mapping
-    :param obs_filter: Obervations per base filter to apply for filtered slot in FAST5
+    :param seg_params: 3 segmentation parameters (mean_obs_per_event,
+        running_stat_width and min_obs_per_base)
+    :param sig_aln_params: Signal align parameters (match_evalue, skip_pen
+        and bandwidth)
+    :param obs_filter: Observations per base filter to apply for filtered slot
+        in FAST5
     """
-    # errors should not happen here since these slotes were checked
-    # in alignment function, but old zombie processes might cause
-    # problems here
     try:
-        fast5_data = h5py.File(fast5_fn, 'r')
         channel_info = th.get_channel_info(fast5_data)

         # extract raw data for this read
-        all_raw_signal = fast5_data['/Raw/Reads/'].values()[0]['Signal'].value
-
-        if bio_samp_type is None:
-            is_rna = th.is_read_rna(fast5_data)
-            bio_samp_type = 'RNA' if is_rna else 'DNA'
-        else:
-            is_rna = bio_samp_type == 'RNA'
-
-        fast5_data.close()
+        all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'].value
     except:
         #raise
-        raise NotImplementedError, (
-            'Error opening file for re-squiggle. This should have ' +
-            'been caught during the alignment phase. Check that there ' +
-            'are no other tombo processes or processes accessing ' +
-            'these HDF5 files running simultaneously')
+        raise NotImplementedError(
+            'Channel or raw signal information not found in FAST5 file')

     # flip raw signal for re-squiggling
+    is_rna = bio_samp_type == 'RNA'
     if is_rna:
         all_raw_signal = all_raw_signal[::-1]

-    mean_obs_per_base, running_diff_width = OBS_PER_EVENT_TABLE[bio_samp_type]
-    num_events = max(all_raw_signal.shape[0] / mean_obs_per_base,
+    if sig_aln_params is None:
+        match_evalue, skip_pen, bandwidth, score_thresh = ALGN_PARAMS_TABLE[
+            bio_samp_type]
+    else:
+        # unpack signal alignment parameters
+        match_evalue, skip_pen, bandwidth, score_thresh = sig_aln_params
+        bandwidth = int(bandwidth)
+    z_shift, stay_pen = ts.get_dynamic_prog_params(match_evalue)
+
+    if seg_params is None:
+        (running_stat_width, min_obs_per_base,
+         mean_obs_per_event) = SEG_PARAMS_TABLE[bio_samp_type]
+    else:
+        (running_stat_width, min_obs_per_base,
+         mean_obs_per_event) = seg_params
+
+    # compute number of events to find
+    # ensure at least a minimal number of events per mapped sequence is found
+    num_events = max(all_raw_signal.shape[0] // mean_obs_per_event,
                      int(len(genome_seq) * min_event_to_seq_ratio))
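Segmentation and signal alignment parameters are now resolved per sample type
from module-level tables (or from the new ``seg_params`` and ``sig_aln_params``
arguments) rather than being hard-coded. A sketch of the lookup and the
event-count computation above, with placeholder table values (the real
``SEG_PARAMS_TABLE`` lives at the top of this module)::

    # hypothetical parameter values, for illustration only
    SEG_PARAMS_TABLE = {'DNA': (3, 3, 5), 'RNA': (12, 4, 15)}

    running_stat_width, min_obs_per_base, mean_obs_per_event = \
        SEG_PARAMS_TABLE['DNA']
    raw_len, seq_len, min_event_to_seq_ratio = 50000, 8000, 1.1
    num_events = max(raw_len // mean_obs_per_event,           # 10000
                     int(seq_len * min_event_to_seq_ratio))   # 8800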
+    # ensure that there isn't *far* too much signal for the mapped sequence
+    # i.e. one adaptive bandwidth per base is too much to find a good mapping
     if num_events / bandwidth > len(genome_seq):
-        raise NotImplementedError, 'Too much raw signal for short mapped sequence'
+        raise NotImplementedError('Too much raw signal for mapped sequence')

     # normalize signal
     if const_scale is not None:
         norm_signal, scale_values = th.normalize_raw_signal(
@@ -904,9 +778,9 @@ def resquiggle_read(

     (segs, r_ref_means, r_ref_sds, read_start_rel_to_raw,
      genome_seq, genome_loc) = find_adaptive_base_assignment(
-        norm_signal, running_diff_width, num_events, tb_model,
+        norm_signal, running_stat_width, min_obs_per_base, num_events, std_ref,
         genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, is_rna,
-        reg_id=reg_id, debug_fps=debug_fps)
+        score_thresh, reg_id=reg_id, debug_fps=debug_fps)
     norm_signal = norm_signal[read_start_rel_to_raw:
                               read_start_rel_to_raw + segs[-1]]

@@ -914,553 +788,69 @@
     # to be fixed.
     segs = get_model_fit_segs(
         segs, norm_signal, r_ref_means, r_ref_sds,
-        running_diff_width, del_fix_window, max_new_cpts)
+        min_obs_per_base, max_raw_cpts)
+    if get_read_seg_score(
+            norm_signal, segs, r_ref_means, r_ref_sds) > score_thresh:
+        raise NotImplementedError('Poor raw to expected signal matching')
     if segs.shape[0] != len(genome_seq) + 1:
-        raise ValueError, ('Aligned sequence does not match number ' +
-                           'of segments produced')
+        raise ValueError('Aligned sequence does not match number ' +
+                         'of segments produced')

     # Output for testing/visualization of event-less re-squiggle
+    if _DEBUG_PARAMS:
+        _write_params_debug(
+            norm_signal, segs, r_ref_means, r_ref_sds,
+            running_stat_width, min_obs_per_base, mean_obs_per_event,
+            match_evalue, skip_pen, bandwidth, fast5_fn)
     if _DEBUG_FIT:
-        norm_means = c_new_means(norm_signal, segs)
-        #r_mean_z = np.mean([np.abs((b_m - b_ref_m) / b_ref_s)
-        #                    for b_m, b_ref_m, b_ref_s in
-        #                    zip(norm_means, r_ref_means, r_ref_sds)])
-        #if r_mean_z > 1:
-        #    print fast5_fn
-        with open('eventless_testing.model.txt', 'w') as fp:
-            fp.write('Position\tMean\tSD\n' + '\n'.join(
-                '\t'.join(map(str, (pos, p_mean, p_std)))
-                for pos, (p_mean, p_std) in enumerate(zip(
-                        r_ref_means, r_ref_sds))) + '\n')
-        with open('eventless_testing.seq.txt', 'w') as fp:
-            fp.write('Base\tPosition\tSignalMean\n' + '\n'.join(
-                '\t'.join(map(str, (b, pos, p_mean))) for pos, (b, p_mean) in
-                enumerate(zip(genome_seq, norm_means))) + '\n')
-        Position, Signal = [], []
-        for base_i, (b_start, b_end) in enumerate(zip(segs[:-1], segs[1:])):
-            Position.extend(
-                base_i + np.linspace(0, 1, b_end - b_start, endpoint=False))
-            Signal.extend(norm_signal[b_start:b_end])
-        with open('eventless_testing.signal.txt', 'w') as fp:
-            fp.write('Position\tSignal\n' + '\n'.join(
-                '\t'.join(map(str, (pos, sig)))
-                for pos, sig in zip(Position, Signal)) + '\n')
+        _write_fit_debug(
+            norm_signal, segs, r_ref_means, r_ref_sds, genome_seq)

     if in_place:
-        # write re-squiggle event assignment to the read FAST5 file
-        th.write_new_fast5_group(
-            fast5_fn, genome_loc, read_start_rel_to_raw, segs, genome_seq,
-            norm_signal, scale_values, corrected_group, read_info.Subgroup,
-            'median', outlier_thresh, compute_sd, align_info=read_info,
-            rna=is_rna)
+        if not _DRY_RUN:
+            # write re-squiggle event assignment to the read FAST5 file
+            th.write_new_fast5_group(
+                fast5_data, genome_loc, read_start_rel_to_raw, segs,
+                genome_seq, norm_signal, scale_values, corr_grp,
+                align_info.Subgroup, 'median', outlier_thresh,
+                compute_sd, align_info=align_info, rna=is_rna)
     else:
         # create new hdf5 
file to hold corrected read events pass if not skip_index: return th.prep_index_data( - fast5_fn, genome_loc, read_start_rel_to_raw, segs, - corrected_group, read_info.Subgroup, is_rna, obs_filter) - - return - -def _resquiggle_worker( - basecalls_q, progress_q, failed_reads_q, index_q, basecall_group, - corrected_group, tb_model_fn, outlier_thresh, compute_sd, skip_pen, - match_evalue, bandwidth, obs_filter, const_scale, bio_samp_type): - num_processed = 0 - skip_index = index_q is None - if not skip_index: proc_index_data = [] - debug_fps = None - if _DEBUG_MIDDLE or _DEBUG_FULL: - score_fp = open('debug_event_align.txt', 'w') - score_fp.write('EventPos\tSeqPos\tScore\tRegion\n') - tb_fp = open('debug_event_align.traceback.txt', 'w') - tb_fp.write('EventPos\tSeqPos\tRegion\n') - full_fit_fp = open('debug_event_align.full_fit.txt', 'w') - full_fit_fp.write('EventPos\tBandPos\tEventScore\tRegion\n') - full_failed_fp = open('debug_event_align.full_failed.txt', 'w') - full_failed_fp.write('DidFail\tRegion\n') - debug_fps = [score_fp, tb_fp, full_fit_fp, full_failed_fp] - - # parse tombo model (ignore alt_base and model_name) - kmer_ref, upstrm_bases, _, _ = ts.parse_tombo_model(tb_model_fn) - kmer_width = len(next(kmer_ref.iterkeys())) - dnstrm_bases = kmer_width - upstrm_bases - 1 - tb_model = (kmer_ref, kmer_width, upstrm_bases, dnstrm_bases) - - # get dynamic programming parameters - z_shift, stay_pen = ts.get_dynamic_prog_params(match_evalue) - - while True: - try: - fast5_fn, sgs_align_data = basecalls_q.get(block=False) - # None values placed in queue when all files have - # been processed - if fast5_fn is None: break - except Queue.Empty: - sleep(1) - continue - - num_processed += 1 - if num_processed % int(PROGRESS_INTERVAL / 5) == 0: - progress_q.put(int(PROGRESS_INTERVAL / 5)) - # process different read subgroups sequentially so that the same - # file is never open simultaneously - for genome_seq, genome_loc, read_info in sgs_align_data: - try: - index_data = resquiggle_read( - fast5_fn, genome_seq, tb_model, outlier_thresh, - genome_loc, read_info, basecall_group, corrected_group, - compute_sd, skip_pen, stay_pen, z_shift, bandwidth, - obs_filter, bio_samp_type, skip_index=skip_index, - reg_id=num_processed, debug_fps=debug_fps, - const_scale=const_scale) - if not skip_index: - proc_index_data.append(index_data) - if index_data[1][6]: - failed_reads_q.put(( - 'Read filtered by observation per base ' + - 'thresholds (revert with `tombo clear_filters`)', - read_info.Subgroup + th.FASTA_NAME_JOINER + fast5_fn)) - except Exception as e: - # uncomment to identify mysterious errors - #raise - try: - th.write_error_status( - fast5_fn, corrected_group, read_info.Subgroup, str(e)) - except: - pass - failed_reads_q.put(( - str(e), read_info.Subgroup + th.FASTA_NAME_JOINER + fast5_fn)) - - if not skip_index: index_q.put(proc_index_data) + fast5_fn, genome_loc, read_start_rel_to_raw, + segs, corr_grp, align_info.Subgroup, is_rna, obs_filter) return -if _PROFILE_RSQGL: - _resquiggle_wrapper = _resquiggle_worker - def _resquiggle_worker(*args): - import cProfile - cProfile.runctx('_resquiggle_wrapper(*args)', globals(), locals(), - filename='resquiggle_eventless_main.prof') - return - - -############################################ -########## Genomic Alignment Code ########## -############################################ - -def clip_m5_alignment(alignVals, start, strand, chrm): - """ - Clip hard and soft clipped bases from an m5 format alignment - """ - # clip read to first matching bases - 
start_clipped_read_bases = 0 - start_clipped_genome_bases = 0 - start_clipped_align_bases = 0 - r_base, g_base = alignVals[0] - while r_base == '-' or g_base == '-': - start_clipped_read_bases += int(r_base != '-') - start_clipped_genome_bases += int(g_base != '-') - start_clipped_align_bases += 1 - r_base, g_base = alignVals[start_clipped_align_bases] - - end_clipped_read_bases = 0 - end_clipped_genome_bases = 0 - end_clipped_align_bases = 0 - r_base, g_base = alignVals[-1] - while r_base == '-' or g_base == '-': - end_clipped_read_bases += int(r_base != '-') - end_clipped_genome_bases += int(g_base != '-') - end_clipped_align_bases += 1 - r_base, g_base = alignVals[-1 * (end_clipped_align_bases + 1)] - - alignVals = alignVals[start_clipped_align_bases:] - if end_clipped_align_bases > 0: - alignVals = alignVals[:-1*end_clipped_align_bases] - - if strand == '+' and start_clipped_genome_bases > 0: - genome_loc = th.genomeLoc( - start + start_clipped_genome_bases, '+', chrm) - elif strand == '-' and end_clipped_genome_bases > 0: - genome_loc = th.genomeLoc( - start + end_clipped_genome_bases, '-', chrm) - else: - genome_loc = th.genomeLoc(start, strand, chrm) - - return alignVals, start_clipped_read_bases, \ - end_clipped_read_bases, genome_loc - -def parse_m5_record(r_m5_record, read_id, bc_subgroup): - """ - Parse a single m5 formatted alignment - """ - if r_m5_record['tStrand'] != '+': - raise NotImplementedError, ( - 'Mapping indicates negative strand reference mapping') - - if r_m5_record['qStrand'] == "+": - alignVals = zip(r_m5_record['qAlignedSeq'], - r_m5_record['tAlignedSeq']) - else: - alignVals = zip(th.rev_comp(r_m5_record['qAlignedSeq']), - th.rev_comp(r_m5_record['tAlignedSeq'])) - - alignVals, start_clipped_bases, end_clipped_bases, genome_loc \ - = clip_m5_alignment( - alignVals, int(r_m5_record['tStart']), - r_m5_record['qStrand'], r_m5_record['tName']) - tSeq = ''.join(zip(*alignVals)[1]) - - # TOOD compute indel/match/mismatch counts - read_info = readInfo( - read_id, bc_subgroup, start_clipped_bases, end_clipped_bases, - 0, 0, 0, 0) - - return tSeq, genome_loc, read_info - -def parse_m5_output(align_output, batch_reads_data): - """ - Parse a batch of m5 formatted alignments - """ - alignments = dict( - (read_fn_sg, None) for read_fn_sg in batch_reads_data.keys()) - for line in align_output: - r_m5_record = dict(zip(M5_FIELDS, line.strip().split())) - if len(r_m5_record) != len(M5_FIELDS): - continue - # store the alignment if none is stored for this read or - # if this read has the highest map quality thus far - qName = r_m5_record['qName'].replace(th.FN_SPACE_FILLER, ' ') - if alignments[qName] is None or ( - int(alignments[qName]['score']) > int(r_m5_record['score']) - and int(r_m5_record['score']) < 255): - alignments[qName] = r_m5_record - - batch_align_failed_reads = [] - batch_align_data = [] - for read_fn_sg, r_m5_record in alignments.iteritems(): - bc_subgroup, read_fn = read_fn_sg.split(th.FASTA_NAME_JOINER) - read_id = batch_reads_data[read_fn_sg][1] - if r_m5_record is None: - batch_align_failed_reads.append( - ('Alignment not produced.', read_fn_sg)) - else: - try: - batch_align_data.append((read_fn, parse_m5_record( - r_m5_record, read_id, bc_subgroup))) - except Exception as e: - batch_align_failed_reads.append((str(e), read_fn_sg)) - - return batch_align_failed_reads, batch_align_data - -def parse_sam_record( - r_sam_record, genome_index, read_id, bc_subgroup, - skip_align_stats=False): - """ - Parse a single of sam formatted alignment - """ - def 
parse_cigar(strand): - # parse cigar string - cigar = [ - (int(reg_len), reg_type) for reg_len, reg_type in - CIGAR_PAT.findall(r_sam_record['cigar'])] - if len(cigar) < 1: - raise RuntimeError, 'Invalid cigar string produced' - - if strand == '-': - cigar = cigar[::-1] - - return cigar - - def get_just_tseq(cigar, strand): - start_clipped_bases = 0 - end_clipped_bases = 0 - # handle clipping elements (H and S) - if cigar[0][1] == 'H': - start_clipped_bases += cigar[0][0] - cigar = cigar[1:] - if cigar[-1][1] == 'H': - end_clipped_bases += cigar[-1][0] - cigar = cigar[:-1] - if cigar[0][1] == 'S': - start_clipped_bases += cigar[0][0] - cigar = cigar[1:] - if cigar[-1][1] == 'S': - end_clipped_bases += cigar[-1][0] - cigar = cigar[:-1] - - tLen = sum([reg_len for reg_len, reg_type in cigar - if reg_type in 'MDN=X']) - tSeq = genome_index[r_sam_record['rName']][ - int(r_sam_record['pos']) - 1: - int(r_sam_record['pos']) + tLen - 1] - if strand == '-': tSeq = th.rev_comp(tSeq) - - # check that cigar starts and ends with matched bases - while cigar[0][1] not in 'M=X': - if cigar[0][1] in 'ND': - tSeq = tSeq[cigar[0][0]:] - else: - start_clipped_bases += cigar[0][0] - cigar = cigar[1:] - while cigar[-1][1] not in 'M=X': - if cigar[-1][1] in 'ND': - tSeq = tSeq[:-cigar[-1][0]] - else: - end_clipped_bases += cigar[0][0] - cigar = cigar[:-1] - - return tSeq, start_clipped_bases, end_clipped_bases - - def get_qseq(cigar, strand): - # record clipped bases and remove from query seq as well as cigar - qSeq = r_sam_record['seq'] if strand == '+' else th.rev_comp( - r_sam_record['seq']) - start_clipped_bases = 0 - end_clipped_bases = 0 - # handle clipping elements (H and S) - if cigar[0][1] == 'H': - start_clipped_bases += cigar[0][0] - cigar = cigar[1:] - if cigar[-1][1] == 'H': - end_clipped_bases += cigar[-1][0] - cigar = cigar[:-1] - if cigar[0][1] == 'S': - start_clipped_bases += cigar[0][0] - qSeq = qSeq[cigar[0][0]:] - cigar = cigar[1:] - if cigar[-1][1] == 'S': - end_clipped_bases += cigar[-1][0] - qSeq = qSeq[:-cigar[-1][0]] - cigar = cigar[:-1] - - return qSeq, start_clipped_bases, end_clipped_bases, cigar - - def get_tseq(qSeq, start_clipped_bases, end_clipped_bases, cigar, strand): - tLen = sum([reg_len for reg_len, reg_type in cigar - if reg_type in 'MDN=X']) - tSeq = genome_index[r_sam_record['rName']][ - int(r_sam_record['pos']) - 1: - int(r_sam_record['pos']) + tLen - 1] - if strand == '-': tSeq = th.rev_comp(tSeq) - - # check that cigar starts and ends with matched bases - while cigar[0][1] not in 'M=X': - if cigar[0][1] in 'ND': - tSeq = tSeq[cigar[0][0]:] - else: - qSeq = qSeq[cigar[0][0]:] - start_clipped_bases += cigar[0][0] - cigar = cigar[1:] - while cigar[-1][1] not in 'M=X': - if cigar[-1][1] in 'ND': - tSeq = tSeq[:-cigar[-1][0]] - else: - qSeq = qSeq[:-cigar[-1][0]] - end_clipped_bases += cigar[-1][0] - cigar = cigar[:-1] - - qLen = sum([reg_len for reg_len, reg_type in cigar - if reg_type in 'MI=X']) - assert len(qSeq) == qLen, 'Read sequence from SAM and ' + \ - 'cooresponding cigar string do not agree.' 
- - return tSeq, qSeq, start_clipped_bases, end_clipped_bases, cigar - - def get_align_stats(tSeq, qSeq, cigar, strand): - num_ins, num_del, num_match, num_mismatch = 0, 0, 0, 0 - tPos, qPos = 0, 0 - for reg_len, reg_type in cigar: - if reg_type in 'M=X': - num_reg_match = sum( - qBase == tBase for qBase, tBase in - zip(qSeq[qPos:qPos+reg_len], - tSeq[tPos:tPos+reg_len])) - num_match += num_reg_match - num_mismatch += reg_len - num_reg_match - tPos += reg_len - qPos += reg_len - elif reg_type in 'IP': - num_ins += reg_len - qPos += reg_len - else: - num_del += reg_len - tPos += reg_len - - return num_ins, num_del, num_match, num_mismatch - - strand = '-' if int(r_sam_record['flag']) & 0x10 else '+' - cigar = parse_cigar(r_sam_record['cigar']) - if skip_align_stats: - # if alignment statistics are not requested, then only the template - # (genome) sequence is required and can be parsed slightly more quickly - # not command line option is available for this at the moment, but - # could be easily added with this code present. The resquiggle command - # has a lot of command line options and this seems a better default - (tSeq, start_clipped_bases, - end_clipped_bases) = get_just_tseq(cigar, strand) - num_ins, num_del, num_match, num_mismatch = 0, 0, 0, 0 - else: - qSeq, start_clipped_bases, end_clipped_bases, cigar = get_qseq( - cigar, strand) - tSeq, qSeq, start_clipped_bases, end_clipped_bases, cigar = get_tseq( - qSeq, start_clipped_bases, end_clipped_bases, cigar, strand) - num_ins, num_del, num_match, num_mismatch = get_align_stats( - tSeq, qSeq, cigar, strand) - - read_info = readInfo( - read_id, bc_subgroup, start_clipped_bases, end_clipped_bases, - num_ins, num_del, num_match, num_mismatch) - genome_loc = th.genomeLoc( - int(r_sam_record['pos']) - 1, strand, r_sam_record['rName']) - - return tSeq, genome_loc, read_info - -def parse_sam_output(align_output, batch_reads_data, genome_index): - """ - Parse a batch of sam formatted alignment - """ - # create dictionary with empty slot to each read - alignments = dict( - (read_fn_sg, None) for read_fn_sg in batch_reads_data.keys()) - for line in align_output: - if line.startswith('@'): continue - r_sam_record = dict(zip(SAM_FIELDS, line.strip().split())) - if len(r_sam_record) < len(SAM_FIELDS): continue - if r_sam_record['rName'] == '*': continue - # store the alignment if none is stored for this read or - # if this read has the highest map quality thus far - qName = r_sam_record['qName'].replace(th.FN_SPACE_FILLER, ' ') - if alignments[qName] is None or ( - int(alignments[qName]['mapq']) > int(r_sam_record['mapq']) - and int(r_sam_record['mapq']) < 255): - alignments[qName] = r_sam_record - - batch_align_failed_reads = [] - batch_align_data = [] - for read_fn_sg, r_sam_record in alignments.iteritems(): - bc_subgroup, read_fn = read_fn_sg.split(th.FASTA_NAME_JOINER) - read_id = batch_reads_data[read_fn_sg][1] - if r_sam_record is None: - batch_align_failed_reads.append( - ('Alignment not produced (if all reads failed ' + - 'check for index files)', read_fn_sg)) - else: - try: - batch_align_data.append((read_fn, parse_sam_record( - r_sam_record, genome_index, read_id, bc_subgroup))) - except Exception as e: - #raise - batch_align_failed_reads.append((str(e), read_fn_sg)) - - return batch_align_failed_reads, batch_align_data - -def _prep_graphmap_options( - genome_fn, read_fn, out_fn, output_format, num_align_ps): - return ['align', '-r', genome_fn, '-d', read_fn, '-o', out_fn, - '-L', output_format, '-t', str(num_align_ps)] - -def 
_prep_bwa_mem_options(genome_fn, read_fn, num_align_ps):
-    return ['mem', '-x', 'ont2d', '-v', '1', '-t', str(num_align_ps),
-            genome_fn, read_fn]

-def _prep_minimap2_options(genome_fn, read_fn, num_align_ps, index_fn):
-    mapper_genome = genome_fn if index_fn is None else index_fn
-    return ['-ax', 'map-ont', '-t', str(num_align_ps), mapper_genome, read_fn]

-def align_to_genome(batch_reads_data, genome_fn, mapper_data, genome_index,
-                    num_align_ps, output_format='sam'):
-    """
-    Align a batch of reads to the provided genome
-    """
-    # prepare fasta text with batch reads
-    batch_reads_fasta = ''
-    for read_fn_sg, (read_seq, read_id) in batch_reads_data.iteritems():
-        # note spaces aren't allowed in read names so replace with
-        # vertical bars and undo to retain file names
-        batch_reads_fasta += ">" + read_fn_sg.replace(' ', th.FN_SPACE_FILLER) + \
-                             '\n' + read_seq + '\n'
-
-    read_fp = NamedTemporaryFile(suffix='.fasta')
-    read_fp.write(batch_reads_fasta)
-    read_fp.flush()
-    out_fp = NamedTemporaryFile()
-
-    # optionally suppress output from mapper with devnull sink
-    with open(os.devnull, 'w') as FNULL:
-        if mapper_data.type == 'graphmap':
-            mapper_options = _prep_graphmap_options(
-                genome_fn, read_fp.name, out_fp.name,
-                output_format, num_align_ps)
-            stdout_sink = FNULL
-        elif mapper_data.type == 'bwa_mem':
-            mapper_options = _prep_bwa_mem_options(
-                genome_fn, read_fp.name, num_align_ps)
-            stdout_sink = out_fp
-        elif mapper_data.type == 'minimap2':
-            mapper_options = _prep_minimap2_options(
-                genome_fn, read_fp.name, num_align_ps, mapper_data.index)
-            stdout_sink = out_fp
-        else:
-            raise RuntimeError, 'Mapper not supported'
-
-        try:
-            exitStatus = call([mapper_data.exe,] + mapper_options,
-                              stdout=stdout_sink, stderr=FNULL)
-            out_fp.seek(0)
-            align_output = out_fp.readlines()
-            # close files here so that they persist until
-            # after basecalling is finished
-            read_fp.close()
-            out_fp.close()
-        except:
-            # whole mapping call failed so all reads failed
-            return ([(
-                'Problem running/parsing genome mapper. ' +
-                'Ensure you have a compatible version installed.' +
-                'Potentially failed to locate BWA index files.',
-                read_fn_sg) for read_fn_sg
-                     in batch_reads_data.keys()], [])
-
-    if output_format == 'sam':
-        batch_parse_failed_reads, batch_align_data = parse_sam_output(
-            align_output, batch_reads_data, genome_index)
-    elif output_format == 'm5':
-        batch_parse_failed_reads, batch_align_data = parse_m5_output(
-            align_output, batch_reads_data)
-    else:
-        raise RuntimeError, 'Mapper output type not supported'

-    return batch_parse_failed_reads, batch_align_data
+#######################################
+########## Genomic Alignment ##########
+#######################################

-def get_read_seq(fast5_fn, basecall_group, basecall_subgroup, bio_samp_type):
+def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type):
     """
     Extract the read sequence from the Fastq slot providing useful error
     messages
     """
     try:
-        fast5_data = h5py.File(fast5_fn, 'r')
+        fastq_raw_value = fast5_data[
+            '/Analyses/' + bc_grp + '/' + bc_subgrp + '/Fastq'].value
     except:
-        raise NotImplementedError, (
-            'Error opening file for alignment. This should have ' +
-            'been caught during the FAST5 prep phase. Check that there ' +
-            'are no other tombo processes or processes accessing ' +
-            'these FAST5 files running simultaneously')
+        raise NotImplementedError('Fastq slot not present in --basecall-group')

+    # depending on how the fastq data was stored it may already be decoded
+    # to unicode, in which case the decode call below would fail. 
try: - read_seq = fast5_data[ - '/Analyses/' + basecall_group + '/' + basecall_subgroup + - '/Fastq'].value.split('\n')[1] - except: - raise RuntimeError, ('Fastq slot not present in --basecall-group') + fastq_raw_value = fastq_raw_value.decode() + except (TypeError, AttributeError): + pass - try: - read_data = fast5_data['/Raw/Reads/'].values()[0] - except: - raise RuntimeError, ( - 'Raw data is not found in Raw/Reads/Read_[read#]') + read_seq = fastq_raw_value.split('\n')[1] + + read_data = th.get_raw_read_slot(fast5_data) # looks like read_id attribute has been removed in some files and attribute # is not really necessary for tombo @@ -1468,238 +858,223 @@ def get_read_seq(fast5_fn, basecall_group, basecall_subgroup, bio_samp_type): read_id = read_data.attrs['read_id'] except: try: - read_id = str(read_data.attrs['read_num']) + read_id = unicode(read_data.attrs['read_num']) except: - read_id = str(np.random.randint(1000000000)) + read_id = unicode(np.random.randint(1000000000)) try: - if ((bio_samp_type is not None and bio_samp_type == 'RNA') or - th.is_read_rna(fast5_data)): - read_seq = th.rev_transcribe(read_seq) + if bio_samp_type is None: + bio_samp_type = 'RNA' if th.is_read_rna(fast5_data) else 'DNA' except: - raise RuntimeError, 'Error determining whether read is DNA or RNA' + raise NotImplementedError('Cannot determine whether read is DNA or RNA') + if bio_samp_type == 'RNA': + read_seq = th.rev_transcribe(read_seq) + return read_seq, read_id, bio_samp_type + +def map_read(fast5_data, bc_grp, bc_subgrp, corr_grp, + aligner, genome_index, bio_samp_type): + read_seq, read_id, bio_samp_type = get_read_seq( + fast5_data, bc_grp, bc_subgrp, bio_samp_type) try: - fast5_data.close() - except: - raise RuntimeError, ( - 'Could not close FAST5 file. 
Possibly corrupted file')
+        alignment = next(aligner.map(str(read_seq)))
+    except StopIteration:
+        raise NotImplementedError('Alignment not produced')
+
+    chrm = alignment.ctg
+    # mappy alignment positions are already 0-based
+    ref_start = alignment.r_st
+    ref_end = alignment.r_en
+    strand = '+' if alignment.strand == 1 else '-'
+    num_match = alignment.mlen
+    num_ins, num_del, num_aligned = 0, 0, 0
+    for op_len, op in alignment.cigar:
+        if op == 1: num_ins += op_len
+        elif op in (2,3): num_del += op_len
+        elif op in (0,7,8): num_aligned += op_len
+        elif op == 6: pass
+        else:
+            # soft and hard clipping are not reported in the
+            # mappy cigar
+            raise NotImplementedError('Invalid cigar operation')
+    if strand == '+':
+        start_clipped_bases = alignment.q_st
+        end_clipped_bases = len(read_seq) - alignment.q_en
+    else:
+        start_clipped_bases = len(read_seq) - alignment.q_en
+        end_clipped_bases = alignment.q_st
+
+    genome_seq = genome_index.get_seq(chrm, ref_start, ref_end)
+    if strand == '-':
+        genome_seq = th.rev_comp(genome_seq)
+    assert len(genome_seq) == ref_end - ref_start, (
+        'Discordant mapped position and sequence')
+    align_info = th.alignInfo(
+        read_id, bc_subgrp, start_clipped_bases, end_clipped_bases,
+        num_ins, num_del, num_match, num_aligned - num_match)
+    genome_loc = th.genomeLoc(ref_start, strand, chrm)
+
+    return genome_seq, genome_loc, align_info, bio_samp_type
+
+def load_minimap_index(genome_fn, mm_index):
+    if mm_index:
+        aligner = mappy.Aligner(str(mm_index), preset=str('map-ont'))
+    else:
+        aligner = mappy.Aligner(str(genome_fn), preset=str('map-ont'))

-    return read_seq, read_id
+    return aligner

-def align_and_parse(
-        fast5s_to_process, genome_fn, mapper_data,
-        genome_index, basecall_group, basecall_subgroups, num_align_ps,
-        bio_samp_type):
-    """
-    Align and parse a batch of reads
-    """
-    batch_reads_data = {}
-    batch_get_data_failed_reads = []
-    for fast5_fn in fast5s_to_process:
-        for bc_subgroup in basecall_subgroups:
+
+########################################
+########## Re-squiggle Worker ##########
+########################################
+
+def _resquiggle_worker(
+        fast5_q, progress_q, failed_reads_q, index_q, bc_grp, bc_subgrps,
+        corr_grp, genome_fn, mm_index, tb_model_fn,
+        outlier_thresh, compute_sd, sig_aln_params, obs_filter,
+        const_scale, bio_samp_type, seg_params, overwrite, in_place=True):
+    num_processed = 0
+    debug_fps = None
+    if _DEBUG_MIDDLE or _DEBUG_FULL:
+        debug_fps = _open_debug_fps()
+
+    # create minimap2 aligner instance
+    aligner = load_minimap_index(genome_fn, mm_index)
+    genome_index = th.Fasta(genome_fn)
+
+    # parse tombo model (ignore alt_base and model_name)
+    std_ref = ts.TomboModel(tb_model_fn)
+
+    while True:
+        try:
+            fast5_fn = fast5_q.get(block=False)
+        except queue.Empty:
+            break
+
+        num_processed += 1
+        if num_processed % int(PROGRESS_INTERVAL / 10) == 0:
+            progress_q.put(int(PROGRESS_INTERVAL / 10))
+
+        if _DRY_RUN:
+            prep_result = h5py.File(fast5_fn, 'r+')
+        else:
+            # prep the fast5 file for writing
+            prep_result = th.prep_fast5(
+                fast5_fn, corr_grp, overwrite, in_place, bc_grp,
+                return_fp=True)
+        if isinstance(prep_result, h5py.File):
+            fast5_data = prep_result
+        else:
+            failed_reads_q.put(prep_result)
+            continue
+
+        for bc_subgrp in bc_subgrps:
             try:
-                read_seq, read_id = get_read_seq(
-                    fast5_fn, basecall_group, bc_subgroup, bio_samp_type)
-                batch_reads_data[bc_subgroup + th.FASTA_NAME_JOINER +
-                                 fast5_fn] = (read_seq, read_id)
+                (genome_seq, genome_loc, align_info,
+                 bio_samp_type) = map_read(
+                     fast5_data, bc_grp, bc_subgrp, 
corr_grp, aligner, + genome_index, bio_samp_type) + index_data = resquiggle_read( + fast5_data, fast5_fn, genome_seq, genome_loc, align_info, + std_ref, outlier_thresh, bc_grp, corr_grp, bio_samp_type, + compute_sd, seg_params, sig_aln_params, obs_filter, + skip_index=index_q is None, reg_id=num_processed, + debug_fps=debug_fps, const_scale=const_scale) + if index_q is not None: + index_q.put(index_data) + if index_data[1][6]: + failed_reads_q.put(( + 'Read filtered by observation per base ' + + 'thresholds (revert with `tombo clear_filters`)', + bc_subgrp + ':::' + fast5_fn)) except Exception as e: # uncomment to identify mysterious errors #raise - batch_get_data_failed_reads.append(( - str(e), bc_subgroup + th.FASTA_NAME_JOINER + fast5_fn)) - - batch_align_failed_reads, batch_align_data = align_to_genome( - batch_reads_data, genome_fn, mapper_data, - genome_index, num_align_ps) - # regroup reads by filename (for 2D reads to be processed together - # and avoid the same HDF5 file being opened simultaneuously) - fn_batch_align_data = defaultdict(list) - for fast5_fn, sg_align_data in batch_align_data: - fn_batch_align_data[fast5_fn].append(sg_align_data) - # uncomment to identify mysterious errors - #print "Get data errors: " + str(batch_get_data_failed_reads) - #print "Align read errors: " + str(batch_align_failed_reads) - - return (batch_get_data_failed_reads + batch_align_failed_reads, - fn_batch_align_data) - -def align_reads( - fast5_batch, genome_fn, mapper_data, genome_index, - basecall_group, basecall_subgroups, corrected_group, - basecalls_q, overwrite, num_align_ps, bio_samp_type, in_place=True): - """ - Prepare FAST5s and then align the extracted sequences - """ - batch_prep_failed_reads = [] - fast5s_to_process = [] - for fast5_fn in fast5_batch: - prep_result = th.prep_fast5( - fast5_fn, corrected_group, overwrite, in_place, basecall_group) - if prep_result is None: - fast5s_to_process.append(fast5_fn) - else: - batch_prep_failed_reads.append(prep_result) - - batch_align_failed_reads, batch_align_data = align_and_parse( - fast5s_to_process, genome_fn, mapper_data, - genome_index, basecall_group, basecall_subgroups, num_align_ps, - bio_samp_type) - for fast5_fn, sgs_align_data in batch_align_data.iteritems(): - basecalls_q.put((fast5_fn, sgs_align_data)) - # uncomment to identify mysterious errors - #print "Prep reads fail: " + str(batch_prep_failed_reads) - #print "Align reads fail: " + str(batch_align_failed_reads) - - return batch_prep_failed_reads + batch_align_failed_reads - -def _alignment_worker( - fast5_q, basecalls_q, progress_q, failed_reads_q, genome_fn, - mapper_data, basecall_group, basecall_subgroups, - corrected_group, overwrite, num_align_ps, bio_samp_type): - # this is only needed for sam output format (not m5) - genome_index = th.parse_fasta(genome_fn) - while not fast5_q.empty(): - try: - fast5_batch = fast5_q.get(block=False) - except Queue.Empty: - break + try: + th.write_error_status( + fast5_fn, corr_grp, bc_subgrp, unicode(e)) + except: + pass + failed_reads_q.put(( + unicode(e), bc_subgrp + ':::' + fast5_fn)) - batch_failed_reads = align_reads( - fast5_batch, genome_fn, mapper_data, - genome_index, basecall_group, basecall_subgroups, - corrected_group, basecalls_q, overwrite, num_align_ps, bio_samp_type) - # if a read didn't fail here it will be counted in the resquiggle worker - progress_q.put(len(batch_failed_reads)) - for failed_read in batch_failed_reads: try: - sg_fn = failed_read[1].split(th.FASTA_NAME_JOINER) - if len(sg_fn) == 2: - subgroup, 
fast5_fn = sg_fn - else: - subgroup, fast5_fn = None, sg_fn - th.write_error_status( - fast5_fn, corrected_group, subgroup, failed_read[0]) + fast5_data.close() except: pass - failed_reads_q.put(failed_read) return -if _PROFILE_ALIGN: - _alignment_wrapper = _alignment_worker - def _alignment_worker(*args): + +if _PROFILE_RSQGL: + _resquiggle_wrapper = _resquiggle_worker + def _resquiggle_worker(*args): import cProfile - cProfile.runctx('_alignment_wrapper(*args)', globals(), locals(), - filename='resquiggle_align.prof') + cProfile.runctx('_resquiggle_wrapper(*args)', globals(), locals(), + filename='resquiggle_eventless_main.prof') return + +########################################### +########## Re-squiggle All Reads ########## +########################################### + def resquiggle_all_reads( - fast5_fns, genome_fn, mapper_data, - basecall_group, basecall_subgroups, corrected_group, tb_model_fn, - bio_samp_type, outlier_thresh, overwrite, align_batch_size, num_align_ps, - align_threads_per_proc, num_resquiggle_ps, compute_sd, skip_index, - skip_pen, match_evalue, bandwidth, obs_filter, const_scale): + fast5_fns, genome_fn, bc_grp, bc_subgrps, corr_grp, tb_model_fn, + bio_samp_type, outlier_thresh, overwrite, num_ps, compute_sd, skip_index, + sig_aln_params, obs_filter, const_scale, seg_params, mm_index): """ - Perform genomic alignment and event-less re-squiggle algorithm batched across reads + Perform genomic alignment and event-less re-squiggle algorithm + batched across reads """ manager = mp.Manager() fast5_q = manager.Queue() - # set maximum number of parsed basecalls to sit in the middle queue - basecalls_q = manager.Queue(align_batch_size * ALIGN_BATCH_MULTIPLIER) failed_reads_q = manager.Queue() - progress_q = manager.Queue() index_q = manager.Queue() if not skip_index else None - num_reads = 0 - fast5_batch = [] + progress_q = manager.Queue() for fast5_fn in fast5_fns: - num_reads += 1 - fast5_batch.append(fast5_fn) - # put batches of reads in queue - if num_reads % align_batch_size == 0: - fast5_q.put(fast5_batch) - fast5_batch = [] - if len(fast5_batch) > 0: - fast5_q.put(fast5_batch) - - if tb_model_fn is None: - tb_model_fn, bio_samp_type = ts.get_default_standard_ref_from_files( - fast5_fns, bio_samp_type) - - align_args = ( - fast5_q, basecalls_q, progress_q, failed_reads_q, genome_fn, - mapper_data, basecall_group, basecall_subgroups, - corrected_group, overwrite, align_threads_per_proc, bio_samp_type) - align_ps = [] - for p_id in xrange(num_align_ps): - p = mp.Process(target=_alignment_worker, args=align_args) - p.start() - align_ps.append(p) + fast5_q.put(fast5_fn) - rsqgl_args = (basecalls_q, progress_q, failed_reads_q, index_q, - basecall_group, corrected_group, tb_model_fn, outlier_thresh, - compute_sd, skip_pen, match_evalue, bandwidth, obs_filter, - const_scale, bio_samp_type) + rsqgl_args = (fast5_q, progress_q, failed_reads_q, index_q, + bc_grp, bc_subgrps, corr_grp, genome_fn, + mm_index, tb_model_fn, outlier_thresh, + compute_sd, sig_aln_params, obs_filter, const_scale, + bio_samp_type, seg_params, overwrite) resquiggle_ps = [] - for p_id in xrange(num_resquiggle_ps): + for p_id in range(num_ps): p = mp.Process(target=_resquiggle_worker, args=rsqgl_args) p.start() resquiggle_ps.append(p) if VERBOSE: sys.stderr.write( - 'Correcting ' + str(num_reads) + ' files with ' + - str(len(basecall_subgroups)) + ' subgroup(s)/read(s) ' + - 'each (Will print a dot for each ' + str(PROGRESS_INTERVAL) + + 'Correcting ' + unicode(len(fast5_fns)) + ' files with ' + + 
unicode(len(bc_subgrps)) + ' subgroup(s)/read(s) ' + + 'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) + ' reads completed).\n') tot_num_rec_proc = 0 failed_reads = defaultdict(list) all_index_data = [] - while any(p.is_alive() for p in align_ps): - try: - errorType, fn = failed_reads_q.get(block=False) - failed_reads[errorType].append(fn) - except Queue.Empty: - try: - num_rec_proc = progress_q.get(block=False) - num_int_proc = ( - ((tot_num_rec_proc + num_rec_proc) / PROGRESS_INTERVAL) - - (tot_num_rec_proc / PROGRESS_INTERVAL)) - if num_int_proc > 0: - sys.stderr.write('.' * num_int_proc) - sys.stderr.flush() - tot_num_rec_proc += num_rec_proc - except Queue.Empty: - # don't need to check index queue since this is only - # filled once the resquiggle proceses are completed which is - # only after the alignment workers have finished - sleep(1) - continue - - # add None entried to basecalls_q to indicate that all reads have - # been basecalled and processed - for _ in xrange(num_resquiggle_ps): - basecalls_q.put((None, None)) - while any(p.is_alive() for p in resquiggle_ps): try: errorType, fn = failed_reads_q.get(block=False) failed_reads[errorType].append(fn) - except Queue.Empty: + except queue.Empty: try: num_rec_proc = progress_q.get(block=False) num_int_proc = ( - ((tot_num_rec_proc + num_rec_proc) / PROGRESS_INTERVAL) - - (tot_num_rec_proc / PROGRESS_INTERVAL)) + ((tot_num_rec_proc + num_rec_proc) // PROGRESS_INTERVAL) - + (tot_num_rec_proc // PROGRESS_INTERVAL)) if num_int_proc > 0: sys.stderr.write('.' * num_int_proc) sys.stderr.flush() tot_num_rec_proc += num_rec_proc - except Queue.Empty: + except queue.Empty: if index_q is not None: try: - proc_index_data = index_q.get(block=False) - all_index_data.extend(proc_index_data) - except Queue.Empty: + r_index_data = index_q.get(block=False) + all_index_data.append(r_index_data) + except queue.Empty: sleep(1) continue @@ -1709,8 +1084,8 @@ def resquiggle_all_reads( failed_reads[errorType].append(fn) if index_q is not None: while not index_q.empty(): - proc_index_data = index_q.get(block=False) - all_index_data.extend(proc_index_data) + r_index_data = index_q.get(block=False) + all_index_data.append(r_index_data) # print newline after read progress dots if VERBOSE: sys.stderr.write('\n') @@ -1721,10 +1096,8 @@ def parse_files(args): if VERBOSE: sys.stderr.write('Getting file list.\n') try: if not os.path.isdir(args.fast5_basedir): - sys.stderr.write( - '*' * 60 + '\nERROR: Provided [fast5-basedir] is ' + - 'not a directory.\n' + '*' * 60 + '\n') - sys.exit() + th._error_message_and_exit( + 'Provided [fast5-basedir] is not a directory.') fast5_basedir = ( args.fast5_basedir if args.fast5_basedir.endswith('/') else args.fast5_basedir + '/') @@ -1735,36 +1108,29 @@ def parse_files(args): index_fn = th.get_index_fn(fast5_basedir, args.corrected_group) if os.path.exists(index_fn): os.remove(index_fn) except OSError: - sys.stderr.write( - '*' * 60 + '\nERROR: Reads base directory, a sub-directory ' + + th._error_message_and_exit( + 'Reads base directory, a sub-directory ' + 'or an old (hidden) index file does not appear to be ' + - 'accessible. Check directory permissions.\n' + '*' * 60 + '\n') - sys.exit() + 'accessible. 
Check directory permissions.')
     if len(files) < 1:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: No files identified in the specified ' +
-            'directory or within immediate subdirectories.\n' + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'No files identified in the specified ' +
+            'directory or within immediate subdirectories.')
+
+    if not th.reads_contain_basecalls(
+            files, args.basecall_group, num_reads=1000):
+        th._error_message_and_exit(
+            'Reads do not appear to contain basecalls. Check --basecall-group ' +
+            'option if basecalls are stored in a non-standard location or use ' +
+            '`tombo annotate_raw_with_fastqs` to add basecalls from FASTQ ' +
+            'files to raw FAST5 files.')

     return files, fast5_basedir, index_fn

-def get_mapper_data(args):
-    if all(map_exe is None for map_exe in (
-            args.minimap2_executable, args.bwa_mem_executable,
-            args.graphmap_executable)):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must provide either a ' + \
-            'minimap2, graphmap or bwa-mem executable.\n' + '*' * 60 + '\n')
-        sys.exit()
-    if args.minimap2_executable is not None:
-        mapper_data = mapperData(
-            args.minimap2_executable, 'minimap2', args.minimap2_index)
-    elif args.bwa_mem_executable is not None:
-        mapper_data = mapperData(args.bwa_mem_executable, 'bwa_mem')
-    else:
-        mapper_data = mapperData(args.graphmap_executable, 'graphmap')
-    return mapper_data
+
+###################################
+########## Main Function ##########
+###################################

 def eventless_resquiggle_main(args):
     """
@@ -1776,57 +1142,63 @@ def eventless_resquiggle_main(args):
     ts.VERBOSE = VERBOSE

     if args.basecall_group == args.corrected_group:
-        sys.stderr.write(
-            '********** ERROR *********\n\t--basecall-group and ' +
-            '--corrected-group must be different.\n')
-        sys.exit()
+        th._error_message_and_exit(
+            '--basecall-group and --corrected-group must ' +
+            'be different.')
+
+    # check simple arguments for validity first
+    outlier_thresh = args.outlier_threshold if (
+        args.outlier_threshold > 0) else None
+
+    obs_filter = th.parse_obs_filter(args.obs_per_base_filter) \
+        if 'obs_per_base_filter' in args else None

+    aligner = load_minimap_index(args.genome_fasta, args.minimap2_index)
+    if not aligner:
+        th._error_message_and_exit(
+            'Failed to load --genome-fasta or --minimap2-index for mapping.')
+    del aligner

-    mapper_data = get_mapper_data(args)
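Mapping now goes through the ``mappy`` (minimap2) Python bindings instead of an
external mapper executable; ``mappy.Aligner`` evaluates as falsy when the index
cannot be loaded or built, which is what the check above relies on. A minimal
usage sketch (the file path and query sequence are hypothetical)::

    import mappy

    aligner = mappy.Aligner('genome.fasta', preset='map-ont')
    if not aligner:
        raise RuntimeError('failed to load or build minimap2 index')
    for hit in aligner.map('ACGTACGTACGT' * 20):
        # hit.r_st/hit.r_en are 0-based reference coordinates
        print(hit.ctg, hit.r_st, hit.r_en, hit.strand, hit.mlen)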
+    # load genome once here to index it if using pyfaidx so it isn't built in
+    # each process separately.
+    genome_index = th.Fasta(args.genome_fasta, dry_run=True)
+    del genome_index

     files, fast5_basedir, index_fn = parse_files(args)

+    tb_model_fn = args.tombo_model_filename
+    bio_samp_type = args.bio_sample_type
+    if tb_model_fn is None:
+        tb_model_fn, bio_samp_type = ts.get_default_standard_ref_from_files(
+            files, bio_samp_type)
+    if not os.path.exists(tb_model_fn):
+        th._error_message_and_exit('Invalid tombo model file provided.')
+
     const_scale = None
     if args.fixed_scale is not None:
         const_scale = args.fixed_scale
     elif not args.fit_scale_per_read:
         const_scale = th.estimate_global_scale(files)

-    outlier_thresh = args.outlier_threshold if (
-        args.outlier_threshold > 0) else None
-
-    # resolve processor and thread arguments
-    num_proc = 2 if args.processes < 2 else args.processes
-    align_threads_per_proc = int(num_proc / 2) \
-        if args.align_threads_per_process is None else \
-        args.align_threads_per_process
-    num_resquiggle_ps = int(num_proc / 2) \
-        if args.resquiggle_processes is None \
-        else args.resquiggle_processes
-
-    obs_filter = th.parse_obs_filter(args.obs_per_base_filter) \
-        if 'obs_per_base_filter' in args else None
-
     failed_reads, all_index_data = resquiggle_all_reads(
-        files, args.genome_fasta, mapper_data,
+        files, args.genome_fasta,
         args.basecall_group, args.basecall_subgroups, args.corrected_group,
-        args.tombo_model_filename, args.bio_sample_type, outlier_thresh,
-        args.overwrite, args.alignment_batch_size, args.align_processes,
-        align_threads_per_proc, num_resquiggle_ps, args.include_event_stdev,
-        args.skip_index, args.skip_penalty, args.match_expected_value,
-        args.bandwidth, obs_filter, const_scale)
+        tb_model_fn, bio_samp_type, outlier_thresh,
+        args.overwrite, args.processes, args.include_event_stdev,
+        args.skip_index, args.signal_align_parameters, obs_filter,
+        const_scale, args.segmentation_parameters, args.minimap2_index)
     if not args.skip_index:
         th.write_index_file(all_index_data, index_fn, fast5_basedir)
     fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()]
     if len(fail_summary) > 0:
-        total_num_failed = sum(zip(*fail_summary)[1])
-        sys.stderr.write('Failed reads summary (' + str(total_num_failed) +
+        total_num_failed = sum(map(itemgetter(1), fail_summary))
+        sys.stderr.write('Failed reads summary (' + unicode(total_num_failed) +
                          ' total failed):\n' + '\n'.join(
-                             "\t" + err + " :\t" + str(n_fns)
+                             "\t" + err + " :\t" + unicode(n_fns)
                              for err, n_fns in sorted(fail_summary)) + '\n')
     else:
         sys.stderr.write('All reads successfully re-squiggled!\n')
     if args.failed_reads_filename is not None:
-        with open(args.failed_reads_filename, 'w') as fp:
+        with io.open(args.failed_reads_filename, 'wt') as fp:
             fp.write('\n'.join((
                 err + '\t' + ', '.join(fns)
                 for err, fns in failed_reads.items())) + '\n')
diff --git a/tombo/tests/shell_tests.sh b/tombo/tests/shell_tests.sh
index e10a119..afe594f 100755
--- a/tombo/tests/shell_tests.sh
+++ b/tombo/tests/shell_tests.sh
@@ -1,13 +1,15 @@
 natDir='test_data/native_reads/'
 ampDir='test_data/amplified_reads/'
 rcsvDir='test_data/recursive_test/'
+natFsq="test_data/fastqs.native.fasta"
+natFqDir='test_data/native_reads.for_fastq_ann/'
 nrModFn='tombo_standard.DNA.model'
 altModFn='tombo_alt.5mC.model'
 poreModel="r9_250bps.nucleotide.5mer.template.model"
 genomeFn="e_coli.K12.NEB5alpha.fasta"
 mmiFn="e_coli.K12.NEB5alpha.mmi"
 genomeLocs='"CP017100.1:1505285" "CP017100.1:1504705"'
-strandGenomeLocs='"CP017100.1:1505285:-" "CP017100.1:1504705:+"'
+strandGenomeLocs='"CP017100.1:1505285:+" "CP017100.1:1504705:+"'

 runHelps=false
runResquiggle=true @@ -35,6 +37,7 @@ tombo plot_per_read -h tombo plot_correction -h tombo plot_multi_correction -h +tombo plot_roc -h tombo plot_kmer -h tombo cluster_most_significant -h @@ -53,24 +56,33 @@ if [ $runResquiggle == true ] then printf "\n\n********* Testing re-squiggle command **********\n" tombo resquiggle \ - $natDir $genomeFn --minimap2-executable ./minimap2 \ + $natDir $genomeFn \ --failed-reads-filename testing.native.failed_read.txt \ --processes 4 --overwrite tombo resquiggle \ - $ampDir $genomeFn --minimap2-executable ./minimap2 \ + $ampDir $genomeFn \ --failed-reads-filename testing.amplified.failed_read.txt \ --processes 4 --overwrite +printf "\n\n********* Testing FASTQ annotation and re-squiggle **********\n" +tombo annotate_raw_with_fastqs --fast5-basedir $natFqDir \ + --fastq-filenames $natFsq --overwrite +tombo resquiggle \ + $natFqDir $genomeFn \ + --corrected-group FastqAnnotation \ + --failed-reads-filename testing.native.fastq_ann.failed_read.txt \ + --processes 4 --overwrite + printf "\n\n********* Testing re-squiggle command with filename **********\n" tombo resquiggle \ $natDir $genomeFn --tombo-model-filename $nrModFn \ --corrected-group RawWFilenameCorrected \ - --minimap2-executable ./minimap2 --processes 4 --overwrite \ + --processes 4 --overwrite \ --failed-reads-filename testing.native.fn_model.failed_read.txt tombo resquiggle \ $ampDir $genomeFn --tombo-model-filename $nrModFn \ --corrected-group RawWFilenameCorrected \ - --minimap2-executable ./minimap2 --processes 4 --overwrite \ + --processes 4 --overwrite \ --failed-reads-filename testing.amplified.fn_model.failed_read.txt printf "\n\n********* Testing event-based resquiggle **********\n" @@ -87,26 +99,14 @@ tombo model_resquiggle \ printf "\n\n********* Testing minimap2 index **********\n" tombo resquiggle \ - $natDir $genomeFn --minimap2-executable ./minimap2 \ + $natDir $genomeFn \ --corrected-group RawMinimapIndexCorrected \ --minimap2-index $mmiFn --processes 4 --overwrite \ --failed-reads-filename testing.native.failed_read.txt -printf "\n\n********* Testing BWA MEM and Graphmap Mappers **********\n" -tombo resquiggle \ - $natDir $genomeFn --bwa-mem-executable ./bwa \ - --corrected-group RawGenomeCorrected_bwamem --overwrite \ - --failed-reads-filename testing.native.failed_read.txt \ - --processes 4 --overwrite -tombo resquiggle \ - $natDir $genomeFn --graphmap-executable ./graphmap \ - --corrected-group RawGenomeCorrected_graphmap --overwrite \ - --failed-reads-filename testing.group1.graphmap.failed_read.txt \ - --processes 4 --overwrite - printf "\n\n********* Testing pA normalization **********\n" -tombo event_resquiggle \ - $natDir $genomeFn --minimap2-executable ./minimap2 \ +tombo event_resquiggle --minimap2-executable ./minimap2 \ + $natDir $genomeFn \ --normalization-type pA_raw --processes 4 \ --corrected-group RawGenomeCorrected_pA_raw_000 --overwrite \ --failed-reads-filename testing.native.pA_raw.failed_read.txt @@ -119,7 +119,7 @@ tombo event_resquiggle \ printf "\n\n********* Testing recursive resquiggle **********\n" tombo resquiggle \ - $rcsvDir $genomeFn --minimap2-executable ./minimap2 \ + $rcsvDir $genomeFn \ --failed-reads-filename testing.recursive.failed_read.txt \ --processes 4 --overwrite fi @@ -130,6 +130,7 @@ tombo filter_stuck --fast5-basedirs $natDir \ --obs-per-base-filter 99:200 100:5000 tombo filter_coverage --fast5-basedirs $natDir \ --percent-to-filter 10 +tombo clear_filters --fast5-basedirs $natDir printf "\n\n********* Testing single sample 
 
 printf "\n\n********* Testing single sample genome-anchored plotting functions **********\n"
@@ -144,6 +145,10 @@ tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.motif_centered.1_samp.pdf
+tombo plot_motif_centered --fast5-basedirs $natDir --motif TWA \
+    --genome-fasta $genomeFn \
+    --num-bases 21 --overplot-threshold 1000 \
+    --pdf-filename testing.motif_centered.palindrome.1_samp.pdf
 tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --num-bases 21 --overplot-threshold 1000 --deepest-coverage \
@@ -153,7 +158,7 @@ tombo plot_max_coverage --fast5-basedirs $rcsvDir \
     --pdf-filename testing.max_cov.1_samp.recursive.pdf
 
-printf "\n\n********* Testing mutliple sample genome-anchored plotting functions **********\n"
+printf "\n\n********* Testing multiple sample genome-anchored plotting functions **********\n"
 tombo plot_max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
@@ -168,6 +173,11 @@ tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.motif_centered.2_samp.pdf
+tombo plot_motif_centered --fast5-basedirs $natDir --motif TWA \
+    --genome-fasta $genomeFn \
+    --control-fast5-basedirs $ampDir \
+    --num-bases 21 --overplot-threshold 1000 \
+    --pdf-filename testing.motif_centered.palindrome.2_samp.pdf
 tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --control-fast5-basedirs $ampDir \
@@ -176,30 +186,53 @@ tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
 
 printf "\n\n********* Testing statistical testing. **********\n"
 rm test_stats.2samp.tombo.stats test_stats.model.tombo.stats \
-    test_stats.alt_model.5mC.tombo.stats test_stats.alt_default_model.5mC.tombo.stats test_standard.model
+    test_stats.alt_model.5mC.tombo.stats \
+    test_stats.alt_default_model.5mC.tombo.stats \
+    test_stats.alt_default_model.6mA.tombo.stats \
+    test_standard.model
 tombo test_significance --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --statistics-file-basename test_stats.2samp
+    --statistics-file-basename test_stats.2samp \
+    --per-read-statistics-basename test_stats.2samp
 tombo test_significance --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
-    --statistics-file-basename test_stats.model
+    --statistics-file-basename test_stats.model \
+    --per-read-statistics-basename test_stats.model
 tombo test_significance --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --alternate-model-filenames $altModFn \
-    --statistics-file-basename test_stats.alt_model
+    --statistics-file-basename test_stats.alt_model \
+    --per-read-statistics-basename test_stats.alt_model
 tombo test_significance --fast5-basedirs $natDir \
-    --alternate-bases 5mC \
+    --alternate-bases 5mC 6mA \
     --statistics-file-basename test_stats.alt_default_model
 tombo estimate_reference --fast5-basedirs $natDir \
     --tombo-model-filename test_standard.model \
-    --upstream-bases 1 --downstream-bases 2 --minimum-kmer-observations 1
+    --upstream-bases 1 --downstream-bases 1 --minimum-kmer-observations 1
 tombo estimate_alt_reference --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --tombo-model-filename $nrModFn \
+    --tombo-model-filename test_standard.model \
+    --alternate-model-filename test_alt.model \
+    --alternate-model-name 5mC --alternate-model-base C \
+    --minimum-kmer-observations 1 --save-density-basename test_save_dens
+tombo estimate_alt_reference \
+    --alternate-density-filename test_save_dens.alternate_density.txt \
+    --control-density-filename test_save_dens.control_density.txt \
+    --tombo-model-filename test_standard.model \
     --alternate-model-filename test_alt.model \
     --alternate-model-name 5mC --alternate-model-base C \
     --minimum-kmer-observations 1
 
+printf "\n\n********* Testing ROC and Precision-Recall plotting **********\n"
+tombo plot_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \
+    --statistics-filenames test_stats.2samp.tombo.stats \
+    test_stats.alt_default_model.5mC.tombo.stats \
+    test_stats.alt_default_model.6mA.tombo.stats \
+    test_stats.model.tombo.stats --motif-descriptions \
+    CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \
+    CCWGG:2:"dcm 5mC Alt Test" GATC:2:"dam 6mA Alt Test" \
+    CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo"
+
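# === editor's sketch (illustrative addition, not part of the original diff) ===
# Each --motif-descriptions argument above pairs one statistics file with one
# or more motif:mod_pos:"label" triples joined by '::' (format inferred only
# from the calls in this test script; the parser below is hypothetical):
def parse_motif_descs(desc_str):
    # split on '::' to separate motifs, then on the first two ':' only,
    # so labels may contain spaces (quotes are stripped by the shell)
    return [tuple(one.split(':', 2)) for one in desc_str.split('::')]

assert parse_motif_descs('CCWGG:2:dcm 5mC Samp Comp::GATC:2:dam 6mA Samp Comp') == [
    ('CCWGG', '2', 'dcm 5mC Samp Comp'), ('GATC', '2', 'dam 6mA Samp Comp')]
# === end editor's sketch ===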
 printf "\n\n********* Testing multiple sample statistical testing genome-anchored plotting functions **********\n"
 tombo plot_max_difference --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
@@ -208,8 +241,8 @@ tombo plot_max_difference --fast5-basedirs $natDir \
 tombo plot_most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
-    --statistics-filename test_stats.model.tombo.stats \
-    --pdf-filename testing.most_signif.pdf
+    --statistics-filename test_stats.2samp.tombo.stats \
+    --pdf-filename testing.most_signif.2samp.pdf
 tombo plot_most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
@@ -219,51 +252,60 @@ tombo plot_most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --statistics-filename test_stats.alt_model.5mC.tombo.stats \
-    --pdf-filename testing.most_signif.alt_model.pdf
+    --pdf-filename testing.most_signif.alt_model_5mC.pdf
 tombo plot_motif_with_stats --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir --motif ATC \
-    --overplot-threshold 1000 \
+    --genome-fasta $genomeFn --overplot-threshold 1000 \
     --statistics-filename test_stats.model.tombo.stats \
     --pdf-filename testing.motif_w_stats.pdf
 tombo plot_motif_with_stats --fast5-basedirs $natDir \
-    --tombo-model-filename $nrModFn --motif ATC \
-    --overplot-threshold 1000 \
+    --tombo-model-filename $nrModFn --motif CCWGG \
+    --genome-fasta $genomeFn --overplot-threshold 1000 \
     --statistics-filename test_stats.model.tombo.stats \
     --pdf-filename testing.motif_w_stats.model.pdf
 tombo plot_motif_with_stats --fast5-basedirs $natDir \
-    --control-fast5-basedirs $ampDir --motif ATC \
-    --overplot-threshold 1000 \
-    --statistics-filename test_stats.model.tombo.stats \
-    --statistic-order --pdf-filename testing.motif_w_stats.statistic.pdf
+    --control-fast5-basedirs $ampDir --motif CCWGG \
+    --genome-fasta $genomeFn --overplot-threshold 1000 \
+    --statistics-filename test_stats.2samp.tombo.stats \
+    --pdf-filename testing.motif_w_stats.2samp.pdf
+tombo plot_motif_with_stats --fast5-basedirs $natDir \
+    --motif CCWGG --genome-fasta $genomeFn --overplot-threshold 1000 \
+    --statistics-filename test_stats.alt_model.5mC.tombo.stats \
+    --pdf-filename testing.motif_w_stats.alt_model_5mC.pdf
+tombo plot_motif_with_stats --fast5-basedirs $natDir \
+    --plot-alternate-model 6mA \
+    --motif CCWGG --genome-fasta $genomeFn --overplot-threshold 1000 \
+    --statistics-filename test_stats.alt_default_model.6mA.tombo.stats \
+    --pdf-filename testing.motif_w_stats.alt_model_6mA.alt_dist.pdf
 
 printf "\n\n********* Testing overplotting options **********\n"
 tombo plot_max_coverage --fast5-basedirs $natDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Downsample \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Downsample \
     --pdf-filename testing.max_coverage.Downsample.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Boxplot \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Boxplot \
     --pdf-filename testing.max_coverage.Boxplot.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Quantile \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Quantile \
     --pdf-filename testing.max_coverage.Quantile.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Density \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Density \
     --pdf-filename testing.max_coverage.Density.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Downsample \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Downsample \
     --pdf-filename testing.max_coverage.2samp.Downsample.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Boxplot \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Boxplot \
     --pdf-filename testing.max_coverage.2samp.Boxplot.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Quantile \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Quantile \
     --pdf-filename testing.max_coverage.2samp.Quantile.pdf
 tombo plot_max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --num-bases 21 --overplot-threshold 5 --overplot-type Density \
+    --num-bases 21 --overplot-threshold 1 --overplot-type Density \
     --pdf-filename testing.max_coverage.2samp.Density.pdf
 
 printf "\n\n********* Testing model-based plotting **********\n"
@@ -277,22 +319,22 @@ tombo plot_most_significant --fast5-basedirs $natDir \
 tombo plot_most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --statistics-filename test_stats.model.tombo.stats \
-    --overplot-threshold 15 --overplot-type Downsample \
+    --overplot-threshold 1 --overplot-type Downsample \
     --pdf-filename testing.model_plotting.downsample.pdf
 tombo plot_most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --statistics-filename test_stats.model.tombo.stats \
-    --overplot-threshold 15 --overplot-type Boxplot \
+    --overplot-threshold 1 --overplot-type Boxplot \
     --pdf-filename testing.model_plotting.boxplot.pdf
 tombo plot_most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --statistics-filename test_stats.model.tombo.stats \
-    --overplot-threshold 15 --overplot-type Quantile \
-    --pdf-filename testing.model_plotting.quantile.pdf
+    --overplot-threshold 1 --overplot-type Quantile \
+    --pdf-filename testing.model_plotting.quant.pdf
 tombo plot_most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --statistics-filename test_stats.model.tombo.stats \
-    --overplot-threshold 15 --overplot-type Density \
+    --overplot-threshold 1 --overplot-type Density \
     --pdf-filename testing.model_plotting.density.pdf
 tombo plot_genome_location --fast5-basedirs $ampDir \
     --tombo-model-filename $nrModFn \
@@ -322,7 +364,7 @@ tombo plot_correction --fast5-basedirs $natDir --region-type random \
     --pdf-filename testing.event_corr.pdf
 tombo plot_correction --fast5-basedirs $natDir --region-type end \
     --corrected-group RawEventCorrected \
-    --pdf-filename testing.event_corr.pdf
+    --pdf-filename testing.event_corr.end.pdf
 tombo plot_multi_correction --fast5-basedirs $natDir \
     --corrected-group RawEventCorrected \
     --pdf-filename testing.multi_event_corr.pdf
@@ -332,14 +374,19 @@ tombo plot_multi_correction --fast5-basedirs $natDir \
     --pdf-filename testing.multi_event_corr.locs.pdf
 
 printf "\n\n********* Testing per-read testing plot **********\n"
+tombo plot_per_read --genome-locations $genomeLocs --num-bases 101 \
+    --per-read-statistics-filename test_stats.2samp.tombo.per_read_stats \
+    --genome-fasta $genomeFn --pdf-filename testing.per_read.pdf
+tombo plot_per_read --genome-locations $genomeLocs --num-bases 101 \
+    --per-read-statistics-filename test_stats.model.tombo.per_read_stats \
+    --genome-fasta $genomeFn --pdf-filename testing.de_novo.per_read.pdf
 tombo plot_per_read --fast5-basedirs $natDir \
-    --genome-locations $genomeLocs --num-bases 101 \
-    --tombo-model-filename $nrModFn \
-    --num-bases 101 --pdf-filename testing.per_read.pdf
-tombo plot_per_read --fast5-basedirs $natDir \
-    --genome-locations $genomeLocs --num-bases 101 \
-    --tombo-model-filename $nrModFn --alternate-model-filename $altModFn \
-    --num-bases 101 --pdf-filename testing.per_read.w_alt.pdf
+    --genome-locations $genomeLocs --num-bases 101 \
+    --per-read-statistics-filename test_stats.alt_model.5mC.tombo.per_read_stats \
+    --pdf-filename testing.per_read.w_alt.pdf
+tombo plot_per_read --genome-locations $genomeLocs --num-bases 101 \
+    --per-read-statistics-filename test_stats.alt_model.5mC.tombo.per_read_stats \
+    --pdf-filename testing.per_read.wo_seq.pdf
 
 printf "\n\n********* Testing auxiliary commands **********\n"
 tombo write_most_significant_fasta --fast5-basedirs $natDir $ampDir \
@@ -351,10 +398,10 @@ tombo write_most_significant_fasta \
     --genome-fasta $genomeFn
 tombo write_wiggles --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --wiggle-types coverage fraction signal signal_sd length stat \
-    mt_stat difference \
+    --wiggle-types coverage fraction signal signal_sd dwell \
+    difference \
     --statistics-filename test_stats.2samp.tombo.stats
-tombo write_wiggles --wiggle-types fraction stat mt_stat \
+tombo write_wiggles --wiggle-types fraction dampened_fraction \
     --statistics-filename test_stats.2samp.tombo.stats
 
 printf "\n\n********* Testing other plotting commands **********\n"
diff --git a/tombo/text_output_commands.py b/tombo/text_output_commands.py
index 164297d..0d13646 100644
--- a/tombo/text_output_commands.py
+++ b/tombo/text_output_commands.py
@@ -1,11 +1,22 @@
-import sys, os
+from __future__ import division, unicode_literals, absolute_import
+
+from builtins import int, range, dict, map, zip
+
+import io
+import sys
 
 import numpy as np
 
 from collections import defaultdict
 
-import tombo_stats as ts
-import tombo_helper as th
+if sys.version_info[0] > 2:
+    unicode = str
+
+# import tombo functions
+from . import tombo_stats as ts
+from . import tombo_helper as th
+
+from ._default_parameters import SMALLEST_PVAL
 
 VERBOSE = False
@@ -19,20 +30,20 @@ def _write_wiggle(wig_base, group_text, data_values, type_name,
     group_w_dot = '' if group_text == '' else '.' + group_text
     group_w_us = '' if group_text == '' else '_' + group_text
     group_w_space = '' if group_text == '' else ' ' + group_text
-    plus_wig_fp = open(
-        wig_base + '.' + type_name + group_w_dot + '.plus.wig', 'w')
-    minus_wig_fp = open(
-        wig_base + '.' + type_name + group_w_dot + '.minus.wig', 'w')
+    plus_wig_fp = io.open(
+        wig_base + '.' + type_name + group_w_dot + '.plus.wig', 'wt')
+    minus_wig_fp = io.open(
+        wig_base + '.' + type_name + group_w_dot + '.minus.wig', 'wt')
     plus_wig_fp.write(WIG_HEADER.format(
         wig_base, type_name, 'fwd_strand', group_w_us, group_w_space))
     minus_wig_fp.write(WIG_HEADER.format(
         wig_base, type_name, 'rev_strand', group_w_us, group_w_space))
-    for (chrm, strand), chrm_values in data_values.iteritems():
+    for (chrm, strand), cs_values in data_values.items():
         wig_fp = plus_wig_fp if strand == '+' else minus_wig_fp
         wig_fp.write("variableStep chrom={} span=1\n".format(chrm))
         wig_fp.write('\n'.join([
-            str(int(pos) + 1) + " " + str(round(val, 4))
-            for pos, val in enumerate(chrm_values)
+            unicode(int(pos) + 1) + " " + unicode(round(val, 4))
+            for pos, val in enumerate(cs_values)
             if not (np.isnan(val) or (
                     filter_zeros and np.equal(val, 0.0)))]) + '\n')
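# === editor's sketch (illustrative addition, not part of the original diff) ===
# ``_write_wiggle`` above emits one variableStep track per (chromosome, strand):
# tombo's 0-based positions become the wiggle format's 1-based ones, and NaN
# (no coverage) entries are skipped. Chromosome name and values are invented:
import numpy as np

cs_values = np.array([0.8571, np.nan, 0.125])
lines = ['variableStep chrom=chrI span=1']
lines += ['{} {}'.format(pos + 1, round(val, 4))
          for pos, val in enumerate(cs_values) if not np.isnan(val)]
assert lines == ['variableStep chrom=chrI span=1', '1 0.8571', '3 0.125']
# === end editor's sketch ===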
@@ -41,55 +52,49 @@
     return
 
 
-def write_stat_wigs(all_stats, wig_base, write_pvals, write_qvals, write_frac,
-                    stat_type):
+def write_frac_wigs(all_stats, wig_base, do_frac, do_damp, cov_damp_counts):
     if VERBOSE: sys.stderr.write('Parsing statistics.\n')
     raw_chrm_strand_stats = defaultdict(list)
     for stat in all_stats:
-        raw_chrm_strand_stats[(stat['chrm'], stat['strand'])].append(
-            (stat['pos'], stat['stat'], stat['mt_stat'], stat['frac']))
-
-    all_stats = {}
-    all_mt_stats = {}
-    all_frac = {}
-    for chrm_strand, stats in raw_chrm_strand_stats.iteritems():
-        cs_poss, raw_cs_stats, raw_cs_mt_stat, raw_cs_frac = map(
-            np.array, zip(*stats))
+        raw_chrm_strand_stats[(
+            stat['chrm'].decode(), stat['strand'].decode())].append(
+                (stat['pos'], stat['frac'], stat['valid_cov']))
+
+    if do_frac:
+        all_frac = {}
+    if do_damp:
+        all_damp_frac = {}
+    for chrm_strand, stats in raw_chrm_strand_stats.items():
+        cs_poss, raw_cs_frac, raw_cs_cov = map(np.array, zip(*stats))
         max_pos = max(cs_poss)
-        # arrange and store p-values
-        cs_stats = np.empty(max_pos + 1)
-        cs_stats[:] = np.nan
-        np.put(cs_stats, cs_poss, raw_cs_stats)
-        if stat_type != 'model_compare':
-            # ignore errors when taking maximum over NA
-            with np.errstate(invalid='ignore'):
-                cs_stats = -np.log10(np.maximum(th.SMALLEST_PVAL, cs_stats))
-        all_stats[chrm_strand] = cs_stats
-
-        # arrange and store q-values
-        cs_mt_stat = np.empty(max_pos + 1)
-        cs_mt_stat[:] = np.nan
-        np.put(cs_mt_stat, cs_poss, raw_cs_mt_stat)
-        if stat_type != 'model_compare':
-            with np.errstate(invalid='ignore'):
-                chrm_mt_stat= -np.log10(np.maximum(th.SMALLEST_PVAL, cs_mt_stat))
-        all_mt_stats[chrm_strand] = cs_mt_stat
-
-        cs_frac = np.empty(max_pos + 1)
-        cs_frac[:] = np.nan
-        # fraction is stored as fraction of unmodified bases, but
-        # higher values show better in a wig file, so flip the fractions
-        np.put(cs_frac, cs_poss, 1 - raw_cs_frac)
-        all_frac[chrm_strand] = cs_frac
-
-    if VERBOSE: sys.stderr.write('Writing statistics wig(s).\n')
-    if write_pvals:
-        _write_wiggle(wig_base, '', all_stats, 'statistic')
-    if write_qvals:
-        _write_wiggle(wig_base, '', all_mt_stats, 'multiple_testing_statistic')
-    if write_frac:
-        _write_wiggle(wig_base, '', all_frac, 'fraction_signif_reads')
+        if do_frac:
+            cs_frac = np.empty(max_pos + 1)
+            cs_frac[:] = np.nan
+            # fraction is stored as fraction of unmodified bases, but
+            # higher values show better in a wig file, so flip the fractions
+            np.put(cs_frac, cs_poss, 1 - raw_cs_frac)
+            all_frac[chrm_strand] = cs_frac
+        if do_damp:
+            cs_damp_frac = np.empty(max_pos + 1)
+            cs_damp_frac[:] = np.nan
+            non_mod_counts = np.round(raw_cs_frac * raw_cs_cov)
+            # compute dampened fraction of modified reads by adding pseudo-counts
+            # to the modified and un-modified counts (equivalent to a beta prior
+            # on the fraction estimation as a binomial variable)
+            raw_cs_damp_frac = (non_mod_counts + cov_damp_counts[0]) / (
+                raw_cs_cov + sum(cov_damp_counts))
+            # fraction is stored as fraction of unmodified bases, but
+            # higher values show better in a wig file, so flip the fractions
+            np.put(cs_damp_frac, cs_poss, 1 - raw_cs_damp_frac)
+            all_damp_frac[chrm_strand] = cs_damp_frac
+
+    if VERBOSE: sys.stderr.write('Writing fraction wigs.\n')
+    if do_frac:
+        _write_wiggle(wig_base, '', all_frac, 'fraction_modified_reads')
+    if do_damp:
+        _write_wiggle(wig_base, '', all_damp_frac,
+                      'dampened_fraction_modified_reads')
 
     return
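# === editor's sketch (illustrative addition, not part of the original diff) ===
# The pseudo-count dampening above is a beta prior on the per-position
# modified fraction: low-coverage positions are pulled toward the prior while
# high-coverage positions are essentially unchanged. Worked example; the
# damp counts (2, 0.5) are an assumed setting, not a documented default:
import numpy as np

cov_damp_counts = (2, 0.5)
raw_cs_frac = np.array([1.0, 0.5])   # fraction of unmodified reads
raw_cs_cov = np.array([2, 100])      # valid coverage at those positions
non_mod_counts = np.round(raw_cs_frac * raw_cs_cov)
damp_frac = (non_mod_counts + cov_damp_counts[0]) / (
    raw_cs_cov + sum(cov_damp_counts))
# 2 unmodified reads out of 2    -> ~0.889 (strongly dampened)
# 50 unmodified reads out of 100 -> ~0.507 (nearly unchanged)
print(1 - damp_frac)  # flipped to fraction *modified*, as in the wig output
# === end editor's sketch ===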
@@ -99,7 +104,7 @@ def write_length_wig(
         base_lens = th.get_all_mean_lengths(raw_read_coverage, chrm_sizes)
 
     if VERBOSE: sys.stderr.write('Writing length wig.\n')
-    _write_wiggle(wig_base, group_name, base_lens, 'length')
+    _write_wiggle(wig_base, group_name, base_lens, 'dwell')
 
     return
 
@@ -125,8 +130,7 @@ def write_signal_and_diff_wigs(
         if VERBOSE: sys.stderr.write(
             'Calculating signal differences.\n')
         sig_diffs = {}
-        for chrm, strand in [(c, s) for c in chrm_sizes.keys()
-                             for s in ('+', '-')]:
+        for chrm, strand in [(c, s) for c in chrm_sizes for s in ('+', '-')]:
             # calculate difference and set no coverage
             # (nan) values to zero
             sig_diffs[(chrm, strand)] \
@@ -153,20 +157,13 @@ def write_cov_wig(raw_read_coverage, wig_base, group_text):
 
 def write_all_wiggles(
         f5_dirs1, f5_dirs2, corrected_group, basecall_subgroups,
-        stats_fn, wig_base, wig_types):
-    if any(stat_name in wig_types for stat_name in
-           ['stat', 'mt_stat', 'fraction']):
-        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
-        all_stats, stat_type = ts.parse_stats(stats_fn)
-
+        stats_fn, wig_base, wig_types, cov_damp_counts):
     if f5_dirs1 is not None:
         raw_read_coverage1 = th.parse_fast5s(
             f5_dirs1, corrected_group, basecall_subgroups)
         if len(raw_read_coverage1) == 0:
-            sys.stderr.write(
-                '*' * 60 + '\nERROR: No reads present in --fast5-basedirs.\n' +
-                '*' * 60 + '\n')
-            sys.exit()
+            th._error_message_and_exit(
+                'No reads present in --fast5-basedirs.')
 
         group1_name = '' if f5_dirs2 is None else GROUP1_NAME
     if f5_dirs2 is not None:
@@ -181,7 +178,7 @@
         if 'signal_sd' in wig_types:
             write_signal_sd_wig(
                 raw_read_coverage2, chrm_sizes, wig_base, GROUP2_NAME)
-        if 'length' in wig_types:
+        if 'dwell' in wig_types:
             write_length_wig(raw_read_coverage2, chrm_sizes,
                              wig_base, GROUP2_NAME)
 
@@ -205,25 +202,25 @@
     if 'signal_sd' in wig_types:
         write_signal_sd_wig(raw_read_coverage1, chrm_sizes,
                             wig_base, group1_name)
-    if 'length' in wig_types:
+    if 'dwell' in wig_types:
         write_length_wig(raw_read_coverage1, chrm_sizes,
                          wig_base, group1_name)
 
-    if any(stat_name in wig_types for stat_name in
-           ['stat', 'mt_stat', 'fraction']):
-        write_stat_wigs(
-            all_stats, wig_base, 'stat' in wig_types,
-            'mt_stat' in wig_types, 'fraction' in wig_types, stat_type)
+    if any(wig_type in wig_types for wig_type in (
+            'fraction', 'dampened_fraction')):
+        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
+        all_stats, stat_type = ts.parse_stats(stats_fn)
+        write_frac_wigs(all_stats, wig_base, 'fraction' in wig_types,
+                        'dampened_fraction' in wig_types, cov_damp_counts)
 
     return
 
 
 def write_most_signif(
-        f5_dirs, fasta_fn, num_regions, qval_thresh, corrected_group,
-        basecall_subgroups, seqs_fn, num_bases, stat_order, stats_fn):
+        f5_dirs, fasta_fn, num_regions, corrected_group,
+        basecall_subgroups, seqs_fn, num_bases, stats_fn):
     if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
     all_stats, stat_type = ts.parse_stats(stats_fn)
     plot_intervals = ts.get_most_signif_regions(
-        all_stats, num_bases, num_regions, qval_thresh,
-        fraction_order=not stat_order)
+        all_stats, num_bases, num_regions)
 
     # get each regions sequence either from reads or fasta index
     if fasta_fn is None:
@@ -231,22 +228,20 @@
             f5_dirs, corrected_group, basecall_subgroups)
         all_reg_data = th.get_region_sequences(plot_intervals, raw_read_coverage)
     else:
-        fasta_records = th.parse_fasta(fasta_fn)
+        genome_index = th.Fasta(fasta_fn)
         all_reg_data = [
-            th.intervalData(
-                int_i.reg_id, int_i.chrm, int_i.start, int_i.end, int_i.strand,
-                int_i.reg_text, int_i.reads,
-                fasta_records[int_i.chrm][int_i.start:int_i.end])
-            for int_i in plot_intervals if int_i.chrm in fasta_records]
+            int_i._replace(
+                seq=genome_index.get_seq(int_i.chrm, int_i.start, int_i.end))
+            for int_i in plot_intervals if int_i.chrm in genome_index.index]
 
     if VERBOSE: sys.stderr.write('Outputting region sequences.\n')
-    with open(seqs_fn, 'w') as seqs_fp:
+    with io.open(seqs_fn, 'wt') as seqs_fp:
         for int_i in all_reg_data:
             reg_seq = int_i.seq
             if int_i.strand == '-':
                 reg_seq = th.rev_comp(reg_seq)
             seqs_fp.write('>{0}:{1:d}:{2} {3}\n{4}\n'.format(
-                int_i.chrm, int(int_i.start + (num_bases / 2)),
+                int_i.chrm, int(int_i.start + (num_bases // 2)),
                 int_i.strand, int_i.reg_text, ''.join(reg_seq)))
 
     return
@@ -260,38 +255,32 @@ def wiggle_main(args):
 
     if (any(data_type in args.wiggle_types
             for data_type in ['signal', 'difference', 'coverage',
-                              'signal_sd', 'length']) and
+                              'signal_sd', 'dwell']) and
         args.fast5_basedirs is None):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must provide a fast5 basedir to output ' +
-            'signal, difference, coverage, signal_sd and/or length wiggle ' +
-            'files.\n' + '*' * 60 + '\n')
-        sys.exit()
-    if (any(data_type in args.wiggle_types
-            for data_type in ['stat', 'mt_stat', 'fraction']) and
+        th._error_message_and_exit(
+            'Must provide a fast5 basedir to output signal, difference, ' +
+            'coverage, signal_sd and/or dwell wiggle files.')
+    if (any(wig_type in args.wiggle_types for wig_type in (
+            'fraction', 'dampened_fraction')) and
         args.statistics_filename is None):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must provide a statistics filename to output ' +
-            'stat and/or mt_stat wiggle files.\n' + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'Must provide a statistics filename to output ' +
+            'fraction wiggle files.')
     if ('difference' in args.wiggle_types and
         args.control_fast5_basedirs is None):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must provide two sets of FAST5s ' + \
-            'to output difference wiggle files.\n' + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'Must provide two sets of FAST5s ' + \
+            'to output difference wiggle files.')
     if (args.control_fast5_basedirs is not None and
         args.fast5_basedirs is None):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Cannot provide a control FAST5 set of ' +
-            'directories without a sample set of FAST5 directories.\n' +
-            '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'Cannot provide a control FAST5 set of directories ' +
+            'without a sample set of FAST5 directories.')
 
     write_all_wiggles(
         args.fast5_basedirs, args.control_fast5_basedirs, args.corrected_group,
         args.basecall_subgroups, args.statistics_filename, args.wiggle_basename,
-        args.wiggle_types)
+        args.wiggle_types, args.coverage_dampen_counts)
 
     return
 
 
@@ -302,21 +291,17 @@ def write_signif_diff_main(args):
     ts.VERBOSE = VERBOSE
 
     if args.fast5_basedirs is None and args.genome_fasta is None:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Must provide either FAST5 ' +
-            'directory(ies) or a fasta file.\n' + '*' * 60 + '\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'Must provide either FAST5 directory(ies) or a fasta file.')
 
     write_most_signif(
-        args.fast5_basedirs, args.genome_fasta,
-        args.num_regions, args.q_value_threshold,
-        args.corrected_group, args.basecall_subgroups,
-        args.sequences_filename, args.num_bases,
-        args.statistic_order, args.statistics_filename)
+        args.fast5_basedirs, args.genome_fasta, args.num_regions,
+        args.corrected_group, args.basecall_subgroups, args.sequences_filename,
+        args.num_bases, args.statistics_filename)
 
     return
 
 
 if __name__ == '__main__':
-    raise NotImplementedError, (
+    raise NotImplementedError(
         'This is a module. See commands with `tombo -h`')
diff --git a/tombo/tombo_helper.py b/tombo/tombo_helper.py
index b2b6022..89ede72 100644
--- a/tombo/tombo_helper.py
+++ b/tombo/tombo_helper.py
@@ -1,25 +1,50 @@
-import sys, os
+from __future__ import division, unicode_literals, absolute_import
 
+from builtins import int, range, dict, map, zip
+
+import os
+import io
 import re
-import h5py
-import string
+import sys
+import random
 import fnmatch
 
+# Future warning from cython in h5py
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+import h5py
+
 import numpy as np
 
 from glob import glob
 from operator import itemgetter
-from itertools import izip, repeat, islice
+from itertools import repeat, islice
 from collections import defaultdict, namedtuple
 
-from _version import TOMBO_VERSION
-from c_helper import c_new_mean_stds, c_new_means, c_apply_outlier_thresh
+if sys.version_info[0] > 2:
+    unicode = str
+
+# import tombo functions
+from ._version import TOMBO_VERSION
+from .c_helper import c_new_mean_stds, c_new_means, c_apply_outlier_thresh
+from ._default_parameters import ROBUST_QUANTS, NUM_READS_FOR_SCALE
+
+VERBOSE = False
+
+
+################################
+###### Global Namedtuples ######
+################################
 
-SMALLEST_PVAL = 1e-50
+alignInfo = namedtuple(
+    'alignInfo',
+    ('ID', 'Subgroup', 'ClipStart', 'ClipEnd',
+     'Insertions', 'Deletions', 'Matches', 'Mismatches'))
 
 readData = namedtuple('readData', (
     'start', 'end', 'filtered', 'read_start_rel_to_raw',
     'strand', 'fn', 'corr_group', 'rna'))
+
 intervalData = namedtuple('intervalData', (
     'reg_id', 'chrm', 'start', 'end', 'strand', 'reg_text', 'reads', 'seq'))
 # set default values for strand, text, reads and seq
@@ -28,17 +53,16 @@ channelInfo = namedtuple(
     'channelInfo',
     ('offset', 'range', 'digitisation', 'number', 'sampling_rate'))
+
 scaleValues = namedtuple(
     'scaleValues',
     ('shift', 'scale', 'lower_lim', 'upper_lim'))
+
 genomeLoc = namedtuple(
     'genomeLoc', ('Start', 'Strand', 'Chrom'))
 
 NORM_TYPES = ('none', 'pA', 'pA_raw', 'median', 'robust_median',
               'median_const_scale')
-STANDARD_MODELS = {'DNA':'tombo.DNA.model',
-                   'RNA':'tombo.RNA.200mV.model'}
-ALTERNATE_MODELS = {'DNA_5mC':'tombo.DNA.5mC.model',}
 
 # single base conversion for motifs
 SINGLE_LETTER_CODE = {
@@ -46,21 +70,27 @@
     'D':'[AGT]', 'H':'[ACT]', 'K':'[GT]', 'M':'[AC]',
     'N':'[ACGT]', 'R':'[AG]', 'S':'[CG]', 'V':'[ACG]',
     'W':'[AT]', 'Y':'[CT]'}
-
-FN_SPACE_FILLER = '|||'
-FASTA_NAME_JOINER = ':::'
-
-VERBOSE = False
-
-# got quantiles from analysis of stability after shift-only normalization
-robust_quantiles = (46.5, 53.5)
+INVALID_BASES = re.compile('[^ACGT]')
 
 
 ######################################
 ###### Various Helper Functions ######
 ######################################
 
-COMP_BASES = string.maketrans('ACGT', 'TGCA')
+def _warning_message(message):
+    sys.stderr.write(
+        '*' * 20 + ' WARNING ' + '*' * 20 + '\n\t' +
+        message + '\n')
+    return
+
+def _error_message_and_exit(message):
+    sys.stderr.write(
+        '*' * 20 + ' ERROR ' + '*' * 20 + '\n\t' +
+        message + '\n')
+    sys.exit()
+    return
+
+COMP_BASES = dict(zip(map(ord, 'ACGT'), map(ord, 'TGCA')))
 def comp_seq(seq):
     """
     Complement DNA sequence
@@ -72,82 +102,135 @@ def rev_comp(seq):
     """
     return seq.translate(COMP_BASES)[::-1]
 
-U_TO_T = string.maketrans('U', 'T')
+U_TO_T = {ord('U'):ord('T')}
 def rev_transcribe(seq):
     """
     Convert U bases to T
     """
     return seq.translate(U_TO_T)
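# === editor's sketch (illustrative addition, not part of the original diff) ===
# python3's ``str.translate`` takes an {ord: ord} mapping instead of the
# removed ``string.maketrans`` table, which is why COMP_BASES and U_TO_T are
# now built with ``ord``. Self-contained check:
COMP_BASES = dict(zip(map(ord, 'ACGT'), map(ord, 'TGCA')))
assert 'ATCG'.translate(COMP_BASES)[::-1] == 'CGAT'      # reverse complement
assert 'UUAC'.translate({ord('U'): ord('T')}) == 'TTAC'  # RNA -> DNA bases
# === end editor's sketch ===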
 
-def parse_fasta(fasta_fn):
-    """
-    Parse a fasta file for sequence extraction (mostly for BAM processing)
-    """
-    # Tried Biopython index and that opened the fail again for each
-    # record access request and was thus far too slow
-
-    # could consider a conditional dependence on pyfaix if on-disk
-    # indexing is required for larger genomes
-    # testing shows that human genome only takes 3.2 GB with raw parser
-    # so raw parsing is probably fine
-    fasta_fp = open(fasta_fn)
-
-    fasta_records = {}
-    curr_id = None
-    curr_seq = ''
-    for line in fasta_fp:
-        if line.startswith('>'):
-            if (curr_id is not None and
-                curr_seq is not ''):
-                fasta_records[curr_id] = curr_seq
-            curr_seq = ''
-            curr_id = line.replace(">","").strip().split()[0]
-        else:
-            curr_seq += line.strip()
-
-    # add last record
-    if (curr_id is not None and
-        curr_seq is not ''):
-        fasta_records[curr_id] = curr_seq
-
-    fasta_fp.close()
-
-    return fasta_records
-
 def get_chrm_sizes(raw_read_coverage, raw_read_coverage2=None):
     """
     Get covered chromosome sizes from a set of reads
     """
     strand_chrm_sizes = defaultdict(list)
     for (chrm, strand), cs_read_cov in \
-        raw_read_coverage.iteritems():
+        raw_read_coverage.items():
         strand_chrm_sizes[chrm].append(max(
             r_data.end for r_data in cs_read_cov))
     if raw_read_coverage2 is not None:
         for (chrm, strand), cs_read_cov in \
-            raw_read_coverage2.iteritems():
+            raw_read_coverage2.items():
             strand_chrm_sizes[chrm].append(max(
                 r_data.end for r_data in cs_read_cov))
 
     return dict((chrm, max(strnd_sizes))
                 for chrm, strnd_sizes in
-                strand_chrm_sizes.iteritems())
+                strand_chrm_sizes.items())
+
+class TomboMotif(object):
+    def _parse_motif(self, rev_comp_motif=False):
+        """
+        Parse a single letter code motif into a pattern for matching
+        """
+        conv_motif = ''.join(SINGLE_LETTER_CODE[letter]
+                             for letter in self.raw_motif)
+        if rev_comp_motif:
+            # reverse complement and then flip any group brackets
+            conv_motif = rev_comp(conv_motif).translate({
+                ord('['):']', ord(']'):'['})
+        return re.compile(conv_motif)
+
+    def __init__(self, raw_motif, mod_pos=None):
+        invalid_chars = re.findall(
+            '[^' + ''.join(SINGLE_LETTER_CODE) + ']', raw_motif)
+        if len(invalid_chars) > 0:
+            _error_message_and_exit(
+                'Invalid characters in motif: ' + ', '.join(invalid_chars))
+
+        # basic motif parsing
+        self.raw_motif = raw_motif
+        self.motif_len = len(raw_motif)
+        self.motif_pat = self._parse_motif()
+        self.rev_comp_pat = self._parse_motif(True)
+
+        self.is_palindrome = self.motif_pat == self.rev_comp_pat
+
+        # parse modified position from motif if provided
+        self.mod_pos = mod_pos
+        if mod_pos is None:
+            self.mod_base = None
+        else:
+            self.mod_base = raw_motif[mod_pos - 1]
+            if INVALID_BASES.match(self.mod_base):
+                _warning_message(
+                    'Provided modified position is not a single base, which ' +
+                    'is likely an error. Specified modified base is one of: ' +
+                    ' '.join(SINGLE_LETTER_CODE[self.mod_base][1:-1]))
+
+
+def invalid_seq(seq):
+    return bool(INVALID_BASES.search(seq))
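# === editor's sketch (illustrative addition, not part of the original diff) ===
# How ``TomboMotif._parse_motif`` expands single-letter codes, and why the
# reverse-complement pattern needs its brackets flipped: complementing
# 'CC[AT]GG' gives 'GG[TA]CC', and reversing that string turns each '[...]'
# group into ']...['; swapping the brackets restores a valid pattern.
import re

SINGLE_LETTER_CODE = {'W': '[AT]'}  # subset of the table defined above
conv = ''.join(SINGLE_LETTER_CODE.get(b, b) for b in 'CCWGG')
assert conv == 'CC[AT]GG'
comp = conv.translate(dict(zip(map(ord, 'ACGT'), map(ord, 'TGCA'))))[::-1]
assert comp == 'CC]AT[GG'                   # brackets are now reversed
fixed = comp.translate({ord('['): ']', ord(']'): '['})
assert fixed == 'CC[AT]GG'                  # CCWGG is its own palindrome
assert re.search(fixed, 'AACCTGGAA').group(0) == 'CCTGG'
# === end editor's sketch ===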
 
 
+###########################
+###### FASTA Parsing ######
+###########################
+
+class Fasta(object):
+    """
+    Fasta sequence format wrapper class.
+
+    Will load faidx via pyfaidx package if installed, else the fasta will be
+    loaded into memory for sequence extraction
+    """
+    def _load_in_mem(self):
+        genome_index = {}
+        curr_id = None
+        curr_seq = []
+        with io.open(self.fasta_fn) as fasta_fp:
+            for line in fasta_fp:
+                if line.startswith('>'):
+                    if (curr_id is not None and
+                        len(curr_seq) > 0):
+                        genome_index[curr_id] = ''.join(curr_seq)
+                    curr_seq = []
+                    curr_id = line.replace(">","").split()[0]
+                else:
+                    curr_seq.append(line.strip())
+            # add last record
+            if (curr_id is not None and
+                len(curr_seq) > 0):
+                genome_index[curr_id] = ''.join(curr_seq)
 
-def parse_motif(motif):
-    """
-    Parse a single letter code motif into a pattern for matching
-    """
-    invalid_chars = re.findall(
-        '[^' + ''.join(SINGLE_LETTER_CODE.keys()) + ']',
-        motif)
-    if len(invalid_chars) > 0:
-        sys.stderr.write(
-            '********* ERROR *********\n\tInvalid characters in motif: ' +
-            ', '.join(invalid_chars) + '\n')
-        sys.exit()
+        return genome_index
 
-    return re.compile(''.join(
-        SINGLE_LETTER_CODE[letter] for letter in motif))
+    def __init__(self, fasta_fn, dry_run=False, force_in_mem=False):
+        self.fasta_fn = fasta_fn
+        try:
+            if force_in_mem: raise ImportError
+            import pyfaidx
+            self.has_pyfaidx = True
+            self.index = pyfaidx.Fasta(fasta_fn)
+        except:
+            self.has_pyfaidx = False
+            if not dry_run:
+                self.index = self._load_in_mem()
+
+    def get_seq(self, chrm, start=None, end=None):
+        if self.has_pyfaidx:
+            if not (start or end):
+                return self.index[chrm][:].seq
+            return self.index[chrm][start:end].seq
+        return self.index[chrm][start:end]
+
+    def iter_chrms(self):
+        if self.has_pyfaidx:
+            for chrm in self.index:
+                yield unicode(chrm.name)
+        else:
+            for chrm in self.index:
+                yield chrm
 
 
 #############################################
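# === editor's sketch (illustrative addition, not part of the original diff) ===
# Intended usage of the ``Fasta`` wrapper added above: identical calls whether
# or not pyfaidx is installed. Runnable when pasted next to the class; the
# file contents and record name below are invented:
import io
import os
import tempfile

fasta_fn = os.path.join(tempfile.mkdtemp(), 'toy.fasta')
with io.open(fasta_fn, 'wt') as fp:
    fp.write(u'>chrI description\nACGTACGT\nTTGG\n')
genome_index = Fasta(fasta_fn, force_in_mem=True)  # class defined above
assert genome_index.get_seq('chrI', 2, 6) == 'GTAC'
assert list(genome_index.iter_chrms()) == ['chrI']
# === end editor's sketch ===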
@@ -163,6 +246,10 @@ def is_read_rna(fast5_data):
     try:
         exp_type = fast5_data['UniqueGlobalKey/context_tags'].attrs[
             'experiment_type']
+        try:
+            exp_type = exp_type.decode()
+        except (AttributeError, TypeError):
+            pass
         # remove the word internal since it contains rna.
         exp_type = exp_type.replace('internal', '')
     except:
@@ -170,6 +257,10 @@
     try:
         exp_kit = fast5_data['UniqueGlobalKey/context_tags'].attrs[
             'experiment_kit']
+        try:
+            exp_kit = exp_kit.decode()
+        except (AttributeError, TypeError):
+            pass
         # remove the word internal since it contains rna.
         exp_kit = exp_kit.replace('internal', '')
     except:
@@ -189,7 +280,7 @@ def is_rna(raw_read_coverage, n_reads=10):
     Determine if a set of reads are RNA or DNA from a small sample
     """
     proc_reads = 0
-    for cs_reads in raw_read_coverage.itervalues():
+    for cs_reads in raw_read_coverage.values():
         for r_data in cs_reads:
             if not r_data.rna:
                 return False
@@ -266,8 +357,9 @@ def write_index_file(all_index_data, index_fn, basedir):
         index_data[chrm_strand].append((
             from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna))
 
-    with open(index_fn, 'w') as index_fp:
-        pickle.dump(dict(index_data), index_fp)
+    with io.open(index_fn, 'wb') as index_fp:
+        # note protocol 2 for py2/3 compatibility
+        pickle.dump(dict(index_data), index_fp, protocol=2)
 
     return
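# === editor's sketch (illustrative addition, not part of the original diff) ===
# Why ``protocol=2`` in ``write_index_file``: it is the highest pickle
# protocol python 2 can read, so an index written under python 3 stays
# loadable under python 2 (and vice versa). Minimal round-trip:
import io
import pickle
import tempfile

index_fn = tempfile.mktemp(suffix='.tombo.index')  # throwaway test path
with io.open(index_fn, 'wb') as fp:
    pickle.dump({('chrI', '+'): []}, fp, protocol=2)
with io.open(index_fn, 'rb') as fp:
    assert pickle.load(fp) == {('chrI', '+'): []}
# === end editor's sketch ===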
@@ -283,22 +375,22 @@ def clear_filters(fast5s_dir, corr_grp):
     except:
         import pickle
     try:
-        with open(index_fn, 'rb') as index_fp:
+        with io.open(index_fn, 'rb') as index_fp:
             index_data = pickle.load(index_fp)
     except IOError:
-        sys.stderr.write(
-            '******** ERRROR *******\n\tFilters can only be applied to runs ' +
+        _error_message_and_exit(
+            'Filters can only be applied to runs ' +
             'with a Tombo index file. Re-run resquiggle without the ' +
-            '--skip-index option to apply filters.\n')
-        sys.exit()
+            '--skip-index option to apply filters.')
 
     new_index_data = []
-    for chrm_strand, cs_raw_data in index_data.iteritems():
+    for chrm_strand, cs_raw_data in index_data.items():
         new_index_data.extend([(chrm_strand, (
             from_base_fn, start, end, rsrtr, corr_grp, s_grp, False, rna))
             for from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna
             in cs_raw_data])
 
     write_index_file(new_index_data, index_fn, fast5s_dir)
+    sys.stderr.write('All filters successfully cleared!\n')
 
     return
 
@@ -311,16 +403,13 @@ def parse_obs_filter(obs_filter):
 
     # parse obs_filter
     try:
-        obs_filter = [map(int, pctl_nobs.split(':'))
+        obs_filter = [list(map(int, pctl_nobs.split(':')))
                       for pctl_nobs in obs_filter]
     except:
-        raise RuntimeError, 'Invalid format for observation filter'
+        raise RuntimeError('Invalid format for observation filter')
 
-    if any(pctl < 0 or pctl > 100 for pctl in zip(*obs_filter)[0]):
-        sys.stderr.write(
-            '********* ERROR ********* Invalid percentile value. ' +
-            ' *********\n')
-        sys.exit()
+    if any(pctl < 0 or pctl > 100 for pctl in map(itemgetter(0), obs_filter)):
+        _error_message_and_exit('Invalid percentile value.')
 
     return obs_filter
 
@@ -331,11 +420,8 @@ def filter_reads(fast5s_dir, corr_grp, obs_filter):
     def read_is_stuck(fast5_fn, s_grp):
         try:
             fast5_data = h5py.File(fast5_fn, 'r')
-            event_data = fast5_data['/Analyses/' + corr_grp + '/' + s_grp +
-                                    '/Events'].value
-            events_end = event_data[-1]['start'] + event_data[-1]['length']
-            base_lens = np.diff(np.concatenate([
-                event_data['start'], [events_end,]]))
+            base_lens = fast5_data['/Analyses/' + corr_grp + '/' + s_grp +
+                                   '/Events']['length']
             return any(np.percentile(base_lens, pctl) > thresh
                        for pctl, thresh in obs_filter)
         except:
@@ -349,17 +435,16 @@
     except:
         import pickle
     try:
-        with open(index_fn, 'rb') as index_fp:
+        with io.open(index_fn, 'rb') as index_fp:
             index_data = pickle.load(index_fp)
     except IOError:
         sys.stderr.write(
             '******** ERROR *******\n\tFilters can only be applied to runs ' +
             'with a Tombo index file. Re-run resquiggle without the ' +
             '--skip-index option to apply filters.\n')
-        sys.exit()
 
     filt_index_data = []
     num_reads, num_filt_reads = 0, 0
-    for chrm_strand, cs_raw_data in index_data.iteritems():
+    for chrm_strand, cs_raw_data in index_data.items():
         cs_filt_reads = [
             (chrm_strand, (
                 from_base_fn, start, end, rsrtr, corr_grp, s_grp,
@@ -371,9 +456,9 @@
         filt_index_data.extend(cs_filt_reads)
 
     sys.stderr.write(
-        'Filtered ' + str(num_filt_reads) +
+        'Filtered ' + unicode(num_filt_reads) +
         ' reads due to observations per base filter from a ' +
-        'total of ' + str(num_reads) + ' reads in ' + fast5s_dir + '.\n')
+        'total of ' + unicode(num_reads) + ' reads in ' + fast5s_dir + '.\n')
 
     write_index_file(filt_index_data, index_fn, fast5s_dir)
 
@@ -388,20 +473,19 @@ def filter_reads_for_coverage(fast5s_dir, corr_grp, frac_to_filter):
     except:
         import pickle
     try:
-        with open(index_fn, 'rb') as index_fp:
+        with io.open(index_fn, 'rb') as index_fp:
             index_data = pickle.load(index_fp)
     except IOError:
         sys.stderr.write(
             '******** ERROR *******\n\tFilters can only be applied to runs ' +
             'with a Tombo index file. Re-run resquiggle without the ' +
             '--skip-index option to apply filters.\n')
-        sys.exit()
 
     unfilt_data = []
     unfilt_reads_cov = []
     prev_filt_data = []
-    for chrm_strand, cs_raw_data in index_data.iteritems():
+    for chrm_strand, cs_raw_data in index_data.items():
         max_end = max(end for (_, _, end, _, _, _, _, _) in cs_raw_data)
-        cs_coverage = np.zeros(max_end, dtype=np.int_)
+        cs_coverage = np.zeros(max_end, dtype=np.int64)
         for (from_base_fn, start, end, rsrtr, c_grp,
              s_grp, filtered, rna) in cs_raw_data:
             if filtered: continue
@@ -416,16 +500,16 @@
             continue
         # add approximate coverage from middle of read
         # faster than mean over the whole read
-        unfilt_reads_cov.append(cs_coverage[start + int((end - start)/2)])
+        unfilt_reads_cov.append(cs_coverage[start + ((end - start) // 2)])
         unfilt_data.append((chrm_strand, (
             from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna)))
 
     num_reads = len(unfilt_data)
     num_filt_reads = int(frac_to_filter * num_reads)
     sys.stderr.write(
-        'Filtered ' + str(num_filt_reads) +
-        ' reads due to observations per base filter from a ' +
-        'total of ' + str(num_reads) + ' reads in ' + fast5s_dir + '.\n')
+        'Filtered ' + unicode(num_filt_reads) +
+        ' reads due to coverage filter from a ' +
+        'total of ' + unicode(num_reads) + ' reads in ' + fast5s_dir + '.\n')
 
     # create probabilities array with coverage values normalized to sum to 1
     unfilt_reads_cov = np.array(unfilt_reads_cov, dtype=np.float)
@@ -454,7 +538,7 @@ def annotate_with_fastqs(fastq_fns, fast5s_read_ids, fastq_slot):
     for fastq_fn in fastq_fns:
         n_recs = 0
         been_warned_ids = False
-        with open(fastq_fn) as fastq_fp:
+        with io.open(fastq_fn) as fastq_fp:
             while True:
                 fastq_rec = list(islice(fastq_fp, 4))
                 # if record contains fewer than 4 lines this indicates the
@@ -465,11 +549,11 @@
                 # corrupted, so don't process any more records
                 if (re.match('@', fastq_rec[0]) is None or
                     re.match('\+', fastq_rec[2]) is None):
-                    sys.stderr.write(
-                        '********* WARNING ********\n\tSuccessfully parsed ' +
-                        str(n_recs) + 'FASTQ records from ' + fastq_fn +
-                        ' before encountering an invalid record. The rest of ' +
-                        'this file will not be processed.\n')
+                    _warning_message(
+                        'Successfully parsed ' + unicode(n_recs) +
+                        ' FASTQ records from ' + fastq_fn + ' before ' +
+                        'encountering an invalid record. The rest of ' +
+                        'this file will not be processed.')
                     break
 
                 # extract read_id from fastq (which should be the first text
@@ -479,18 +563,34 @@
                 if read_id not in fast5s_read_ids:
                     if not been_warned_ids:
                         been_warned_ids = True
-                        sys.stderr.write(
-                            '********* WARNING ********\n\tSome records from ' +
-                            fastq_fn + ' contain read identifiers not found ' +
-                            'in any FAST5 files.\n')
+                        _warning_message(
+                            'Some records from ' + fastq_fn + ' contain read ' +
+                            'identifiers not found in any FAST5 files.')
                     continue
 
                 with h5py.File(fast5s_read_ids[read_id]) as fast5_data:
                     bc_slot = fast5_data[fastq_slot]
-                    bc_slot.create_dataset('Fastq', data=''.join(fastq_rec))
+                    bc_slot.create_dataset(
+                        'Fastq', data=''.join(fastq_rec),
+                        dtype=h5py.special_dtype(vlen=unicode))
 
     return
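# === editor's sketch (illustrative addition, not part of the original diff) ===
# ``annotate_with_fastqs`` reads FASTQ files four lines at a time with
# ``islice``; a short record signals end of file, and malformed '@'/'+'
# sentinels abort the file. The same loop on an in-memory record (the read-id
# extraction shown is an assumption mirroring the truncated comment above):
import io
from itertools import islice

fastq_fp = io.StringIO(u'@read_1 some description\nACGT\n+\n!!!!\n')
records = []
while True:
    fastq_rec = list(islice(fastq_fp, 4))
    if len(fastq_rec) != 4:
        break  # fewer than 4 lines -> end of file
    # read id: first whitespace-separated token, minus the leading '@'
    records.append(fastq_rec[0].split()[0][1:])
assert records == ['read_1']
# === end editor's sketch ===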
 
+def reads_contain_basecalls(fast5_fns, bc_grp, num_reads):
+    test_fns = random.sample(
+        fast5_fns, num_reads) if len(fast5_fns) > num_reads else fast5_fns
+    for fast5_fn in test_fns:
+        try:
+            with h5py.File(fast5_fn, 'r') as fast5_data:
+                fast5_data['/Analyses/' + bc_grp]
+        except:
+            continue
+        # if the basecall group is accessible for a single file return true
+        return True
+
+    # else if all tested reads did not contain the basecall group return False
+    return False
+
 def get_files_list(fast5s_dir):
     """
     Get all fast5 files recursively listed below the directory
@@ -503,11 +603,23 @@
 
     return all_fast5s
 
+def get_raw_read_slot(fast5_data):
+    try:
+        raw_read_slot = list(fast5_data['/Raw/Reads'].values())[0]
+    except:
+        raise NotImplementedError(
+            'Raw data is not found in /Raw/Reads/Read_[read#]')
+
+    return raw_read_slot
+
 def prep_fast5_for_fastq(
         fast5_data, basecall_group, basecall_subgroup, overwrite):
     try:
-        read_id = fast5_data[
-            '/Raw/Reads/'].values()[0].attrs['read_id']
+        read_id = get_raw_read_slot(fast5_data).attrs['read_id']
+        try:
+            read_id = read_id.decode()
+        except (AttributeError, TypeError):
+            pass
     except:
         return None
 
@@ -529,7 +641,7 @@
             bc_grp = analyses_grp.create_group(basecall_group)
             bc_subgrp = bc_grp.create_group(basecall_subgroup)
         else:
-            raise NotImplementedError, (
+            raise NotImplementedError(
                 basecall_group + ' exists and --overwrite is not set.')
 
     return read_id
@@ -556,10 +668,10 @@ def get_read_ids_and_prep_fastq_slot(
             except NotImplementedError:
                 if VERBOSE and not been_warned_overwrite:
                     been_warned_overwrite = True
-                    sys.stderr.write(
-                        '********* WARNING ********\n\tBasecalls exsit in ' +
-                        basecall_group + ' slot. Set --overwrite option ' +
-                        'to overwrite these basecalls in this slot.\n')
+                    _warning_message(
+                        'Basecalls exist in ' + basecall_group + ' slot. ' +
+                        'Set --overwrite option to overwrite these ' +
+                        'basecalls in this slot.')
                 continue
             if read_id is None:
                 continue
             if read_id in fast5s_read_ids:
                 # Warn non-unique read_ids in directory
                 if VERBOSE and not been_warned_unique:
                     been_warned_unique = True
-                    sys.stderr.write(
-                        '******** WARNING *********\n\tMultiple FAST5 files ' +
-                        'contain the same read identifiers. Ensure that ' +
-                        'FAST5 files are from a single run.\n')
+                    _warning_message(
+                        'Multiple FAST5 files contain the same read ' +
+                        'identifiers. Ensure that FAST5 files are from ' +
+                        'a single run.')
                 continue
 
             fast5s_read_ids[read_id] = fast5_fn
@@ -606,14 +718,18 @@ def parse_fast5s_wo_index(
             # don't warn here since errored out reads will have get here, but
             # not have alignment and events stored, so just skip these reads
             continue
-        raw_read_coverage[(
-            align_data['mapped_chrom'],
-            align_data['mapped_strand'])].append(
-                readData(
-                    align_data['mapped_start'], align_data['mapped_end'],
-                    False, read_start_rel_to_raw,
-                    align_data['mapped_strand'], read_fn,
-                    corrected_group + '/' + basecall_subgroup, rna))
+        chrm = align_data['mapped_chrom']
+        strand = align_data['mapped_strand']
+        try:
+            chrm = chrm.decode()
+            strand = strand.decode()
+        except:
+            pass
+        raw_read_coverage[(chrm, strand)].append(
+            readData(
+                align_data['mapped_start'], align_data['mapped_end'],
+                False, read_start_rel_to_raw, strand, read_fn,
+                corrected_group + '/' + basecall_subgroup, rna))
 
         read_data.close()
 
@@ -625,7 +741,7 @@ def convert_index(index_data, fast5s_dir, corr_grp, new_corr_grp):
     model_resquiggle
     """
     new_index_data = []
-    for (chrm, strand), cs_raw_data in index_data.iteritems():
+    for (chrm, strand), cs_raw_data in index_data.items():
         for (from_base_fn, start, end, rsrtr, c_grp,
              s_grp, filtered, rna) in cs_raw_data:
             if c_grp != corr_grp: continue
@@ -651,20 +767,17 @@ def parse_fast5s_w_index(fast5s_dir, corr_grp, subgroups, new_corr_grp):
         import cPickle as pickle
     except:
         import pickle
-    with open(index_fn, 'rb') as index_fp:
+    with io.open(index_fn, 'rb') as index_fp:
         index_data = pickle.load(index_fp)
 
     raw_read_coverage = {}
-    for (chrm, strand), cs_raw_data in index_data.iteritems():
-        # TODO temporary check that filtered is a boolean value so that old
-        # len_percentiles slots will be handled correctly should be removed
+    for (chrm, strand), cs_raw_data in index_data.items():
         cs_data = [
             readData(start, end, filtered, rsrtr, strand,
                      os.path.join(fast5s_dir, from_base_fn),
                     corr_grp + '/' + s_grp, rna)
             for from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna
             in cs_raw_data
-            if c_grp == corr_grp and s_grp in subgroups and
-            not (isinstance(filtered, bool) and filtered)]
+            if c_grp == corr_grp and s_grp in subgroups and not filtered]
         raw_read_coverage[(chrm, strand)] = cs_data
     if new_corr_grp is not None:
         # convert corrected group to new corrected group for
@@ -679,8 +792,7 @@ def merge_cov(w_index_covs, wo_index_cov):
     """
     all_covs = w_index_covs + [wo_index_cov,]
     raw_read_coverage = defaultdict(list)
-    for chrm_strand in set([cs for d_cov in all_covs
-                            for cs in d_cov.keys()]):
+    for chrm_strand in set([cs for d_cov in all_covs for cs in d_cov]):
         for dir_cov in all_covs:
             if chrm_strand not in dir_cov: continue
             raw_read_coverage[chrm_strand].extend(dir_cov[chrm_strand])
@@ -706,16 +818,18 @@ def parse_fast5s(fast5_basedirs, corrected_group, basecall_subgroups,
                     fast5s_dir, corrected_group, basecall_subgroups,
                     new_corr_grp))
             except:
-                sys.stderr.write('WARNING: Failed to parse tombo index ' +
-                                 'file for ' + fast5s_dir + ' directory.\n')
+                _warning_message(
+                    'Failed to parse tombo index file for ' +
+                    fast5s_dir + ' directory.')
                 wo_index_dirs.append(fast5s_dir)
         else:
             if not warn_index:
-                sys.stderr.write(
-                    'WARNING: Index file does not exist for one or more ' +
-                    'directories. For optimal performance, either re-run ' +
-                    're-squiggle without --skip-index flag or point to ' +
-                    'top level fast5 directory of recursive directories.\n')
+                _warning_message(
+                    'Tombo index file does not exist for one or more ' +
+                    'directories. If --skip-index was not set for the ' +
+                    're-squiggle command, ensure that the specified ' +
+                    'directory is the same as for the re-squiggle command.')
                 warn_index = True
             wo_index_dirs.append(fast5s_dir)
     wo_index_cov = parse_fast5s_wo_index(
@@ -736,7 +850,7 @@ def parse_pore_model(pore_model_fn):
     Parse pore model for pA normalization (Deprecated)
     """
     pore_model = {'mean':{}, 'inv_var':{}}
-    with open(pore_model_fn) as fp:
+    with io.open(pore_model_fn) as fp:
         for line in fp:
             if line.startswith('#'): continue
             try:
@@ -774,37 +888,38 @@ def calc_kmer_fitted_shift_scale(pore_model, events_means, events_kmers):
     return shift, scale
 
 
-def get_valid_cpts(norm_signal, min_base_obs, num_events):
+def get_valid_cpts(norm_signal, running_stat_width, num_events):
     """
-    Get valid changepoints given largest differences in neighboring moving windows
+    Get valid changepoints given largest differences in neighboring
+    moving windows
 
     Note that this method is completely vectorized, but allows segments
     as small as 2 observations. This should be okay R9+, but is
     problematic for <=R7 and RNA
     """
     raw_cumsum = np.cumsum(np.concatenate([[0], norm_signal[:-1]]))
-    # get difference between all neighboring min_base_obs regions
+    # get difference between all neighboring running_stat_width regions
     running_diffs = np.abs(
-        (2 * raw_cumsum[min_base_obs:-min_base_obs]) -
-        raw_cumsum[:-2*min_base_obs] -
-        raw_cumsum[2*min_base_obs:])
+        (2 * raw_cumsum[running_stat_width:-running_stat_width]) -
+        raw_cumsum[:-2*running_stat_width] -
+        raw_cumsum[2*running_stat_width:])
     not_peaks = np.logical_not(np.logical_and(
         running_diffs > np.concatenate([[0], running_diffs[:-1]]),
         running_diffs > np.concatenate([running_diffs[1:], [0]])))
     running_diffs[not_peaks] = 0
     valid_cpts = np.argsort(
-        running_diffs)[::-1][:num_events].astype(np.int32) + min_base_obs
+        running_diffs)[::-1][:num_events].astype(np.int64) + running_stat_width
 
     return valid_cpts
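# === editor's sketch (illustrative addition, not part of the original diff) ===
# The vectorized trick in ``get_valid_cpts``: with prefix sums S, the value
# 2*S[i+w] - S[i] - S[i+2w] equals sum(sig[i:i+w]) - sum(sig[i+w:i+2w]), the
# difference between two adjacent windows of width w, so its absolute value
# peaks at changepoints. Check on a toy step signal:
import numpy as np

sig = np.array([0., 0., 0., 5., 5., 5.])
w = 2
S = np.cumsum(np.concatenate([[0], sig[:-1]]))  # S[k] == sig[:k].sum()
running_diffs = np.abs(2 * S[w:-w] - S[:-2 * w] - S[2 * w:])
# windows [0,0] vs [0,5] -> 5 ; [0,0] vs [5,5] -> 10 (the step at index 3)
assert list(running_diffs) == [5.0, 10.0]
# === end editor's sketch ===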
 
-def estimate_global_scale(fast5_fns, num_reads=500):
+def estimate_global_scale(fast5_fns, num_reads=NUM_READS_FOR_SCALE):
     sys.stderr.write('Estimating global scale parameter\n')
     np.random.shuffle(fast5_fns)
     read_mads = []
     for fast5_fn in fast5_fns:
         try:
             with h5py.File(fast5_fn, 'r') as fast5_data:
-                all_sig = fast5_data['/Raw/Reads'].values()[0]['Signal'].value
+                all_sig = get_raw_read_slot(fast5_data)['Signal'].value
                 shift = np.median(all_sig)
                 read_mads.append(np.median(np.abs(all_sig - shift)))
         except:
@@ -813,14 +928,13 @@
             break
 
     if len(read_mads) == 0:
-        sys.stderr.write(
-            '******** ERROR *********\n\tNo reads contain raw signal for ' +
-            'global scale parameter estimation.\n')
-        sys.exit()
+        _error_message_and_exit(
+            'No reads contain raw signal for ' +
+            'global scale parameter estimation.')
     if len(read_mads) < num_reads:
-        sys.stderr.write(
-            '******** WARNING *********\n\tFew reads contain raw signal for ' +
-            'global scale parameter estimation. Results may not be optimal.\n')
+        _warning_message(
+            'Few reads contain raw signal for global scale parameter ' +
+            'estimation. Results may not be optimal.')
 
     return np.mean(read_mads)
 
@@ -834,7 +948,7 @@ def normalize_raw_signal(
     Apply scaling and winsorizing parameters to normalize raw signal
     """
     if norm_type not in NORM_TYPES and (shift is None or scale is None):
-        raise NotImplementedError, (
+        raise NotImplementedError(
            'Normalization type ' + norm_type + ' is not a valid ' +
             'option and shift or scale parameters were not provided.')
 
@@ -860,9 +974,6 @@
             # conditional model after raw DAC scaling
             shift = shift + (fit_shift * scale)
             scale = scale * fit_scale
-            # print fitted shift and scale for comparisons
-            #print 'shift: ' + str(fit_shift) + \
-            #    '\tscale: ' + str(fit_scale)
     elif norm_type == 'median':
         shift = np.median(raw_signal)
         scale = np.median(np.abs(raw_signal - shift))
@@ -871,8 +982,7 @@
         shift = np.median(raw_signal)
         scale = const_scale
     elif norm_type == 'robust_median':
-        shift = np.mean(np.percentile(
-            raw_signal, robust_quantiles))
+        shift = np.mean(np.percentile(raw_signal, ROBUST_QUANTS))
         scale = np.median(np.abs(raw_signal - read_robust_med))
 
     raw_signal = (raw_signal - shift) / scale
@@ -893,33 +1003,43 @@
 ###### Events Table Access Functions ######
 ###########################################
 
-def get_multiple_slots_read_centric(r_data, slot_names):
+def get_multiple_slots_read_centric(r_data, slot_names, corr_grp=None):
     """
     Extract read-centric slot_names from this read's Events table
     """
     try:
-        with h5py.File(r_data.fn, 'r') as read_data:
-            # note that it's more efficient to try to access the slot
-            # and except the error that check if the slot exists first
-            r_events = read_data['/'.join((
-                '/Analyses', r_data.corr_group, 'Events'))].value
+        do_close = False
+        if not isinstance(r_data, h5py.File):
+            do_close = True
+            corr_grp = r_data.corr_group
+            r_data = h5py.File(r_data.fn, 'r')
+        event_slot_name = '/'.join(('/Analyses', corr_grp, 'Events'))
+        # note that it's more efficient to try to access the slot
+        # and except the error that check if the slot exists first
+        r_event_data = r_data[event_slot_name].value
+        if do_close: r_data.close()
     except:
         # probably truncated file or events don't exist
         return [None,] * len(slot_names)
 
-    return [r_events[slot_name] for slot_name in slot_names]
+    return [r_event_data[slot_name] for slot_name in slot_names]
 
-def get_single_slot_read_centric(r_data, slot_name):
+def get_single_slot_read_centric(r_data, slot_name, corr_grp=None):
     """
     Extract read-centric slot_name from this read's Events table
     """
     try:
-        with h5py.File(r_data.fn, 'r') as read_data:
-            # note that it's more efficient to try to access the slot
-            # and except the error that check if the slot exists first
-            r_events = read_data['/'.join((
-                '/Analyses', r_data.corr_group, 'Events'))].value
-            r_slot_values = r_events[slot_name]
+        # if r_data is an open h5py object then don't open the filename
+        do_close = False
+        if not isinstance(r_data, h5py.File):
+            do_close = True
+            corr_grp = r_data.corr_group
+            r_data = h5py.File(r_data.fn, 'r')
+        # note that it's more efficient to try to access the slot
+        # and except the error that check if the slot exists first
+        r_slot_values = r_data['/'.join(('/Analyses', corr_grp, 'Events'))][
+            slot_name]
+        if do_close: r_data.close()
     except:
         # probably truncated file or events don't exist
         return None
@@ -934,18 +1054,18 @@
     if r_slot_values is None:
         return None
 
-    if ((r_data.strand == '-' and not r_data.rna) or
-        (r_data.strand == '+' and r_data.rna)):
+    if r_data.strand == '-':
         r_slot_values = r_slot_values[::-1]
 
     return r_slot_values
 
 
 def get_mean_slot_genome_centric(cs_reads, chrm_len, slot_name):
     """
-    Get the mean over all reads at each covered genomic location for this slots value
+    Get the mean over all reads at each covered genomic location for this
+    slot's value
     """
     base_sums = np.zeros(chrm_len)
-    base_cov = np.zeros(chrm_len, dtype=np.int_)
+    base_cov = np.zeros(chrm_len, dtype=np.int64)
     for r_data in cs_reads:
         # extract read means data so data across all chrms is not
         # in RAM at one time
@@ -960,7 +1080,8 @@
 
 def get_all_mean_slot_values(raw_read_coverage, chrm_sizes, slot_name):
     """
-    Get the mean over all reads at each covered genomic location for this slots value over all covered chromosomes and strands
+    Get the mean over all reads at each covered genomic location for this
+    slot's value over all covered chromosomes and strands
     """
     # ignore divide by zero errors that occur where there is no
     # coverage. Need to correct nan values after subtracting two sets of
@@ -968,8 +1089,7 @@
     old_err_settings = np.seterr(all='ignore')
     # take the mean over all signal overlapping each base
     all_mean_values = {}
-    for chrm, strand in [(c, s) for c in chrm_sizes.keys()
-                         for s in ('+', '-')]:
+    for chrm, strand in [(c, s) for c in chrm_sizes for s in ('+', '-')]:
         if (chrm, strand) in raw_read_coverage:
             cs_mean_values = get_mean_slot_genome_centric(
                 raw_read_coverage[(chrm, strand)], chrm_sizes[chrm], slot_name)
@@ -1011,13 +1131,13 @@
     try:
         fast5_info = read_data['UniqueGlobalKey/channel_id'].attrs
     except:
-        raise RuntimeError, ("No channel_id group in HDF5 file. " +
-                             "Probably mux scan HDF5 file.")
+        raise NotImplementedError("No channel_id group in HDF5 file. " +
+                                  "Probably mux scan HDF5 file.")
 
     channel_info = channelInfo(
         fast5_info['offset'], fast5_info['range'],
         fast5_info['digitisation'], fast5_info['channel_number'],
-        fast5_info['sampling_rate'].astype('int_'))
+        fast5_info['sampling_rate'].astype(np.int64))
 
     return channel_info
 
@@ -1028,15 +1148,15 @@ def get_raw_signal(r_data, int_start, int_end):
     with h5py.File(r_data.fn, 'r') as fast5_data:
         # retrieve shift and scale computed in correction script
         corr_subgrp = fast5_data['/Analyses/' + r_data.corr_group]
-        event_data = corr_subgrp['Events'].value
-        events_end = event_data[-1]['start'] + event_data[-1]['length']
-        segs = np.concatenate([event_data['start'], [events_end,]])
+        event_starts = corr_subgrp['Events']['start']
+        events_end = event_starts[-1] + corr_subgrp['Events']['length'][-1]
+        segs = np.concatenate([event_starts, [events_end,]])
 
         shift = corr_subgrp.attrs['shift']
         scale = corr_subgrp.attrs['scale']
         lower_lim = corr_subgrp.attrs['lower_lim']
        upper_lim = corr_subgrp.attrs['upper_lim']
-        all_sig = fast5_data['/Raw/Reads'].values()[0]['Signal'].value
+        all_sig = get_raw_read_slot(fast5_data)['Signal'].value
 
     rsrtr = r_data.read_start_rel_to_raw
     if r_data.rna:
@@ -1077,12 +1197,18 @@ def parse_read_correction_data(r_data):
     """
     try:
         with h5py.File(r_data.fn, 'r') as fast5_data:
-            raw_grp = fast5_data['/Raw/Reads'].values()[0]
             corr_grp = fast5_data['/Analyses/' + r_data.corr_group]
             events_grp = corr_grp['Events']
-            event_data = events_grp.value
+            event_starts = events_grp['start']
+            events_end = event_starts[-1] + events_grp['length'][-1]
+            new_segs = np.concatenate([event_starts, [events_end,]])
 
+            raw_grp = get_raw_read_slot(fast5_data)
             read_id = raw_grp.attrs['read_id']
+            try:
+                read_id = read_id.decode()
+            except (AttributeError, TypeError):
+                pass
             signal_data = raw_grp['Signal'].value
 
             raw_offset = events_grp.attrs['read_start_rel_to_raw']
@@ -1091,14 +1217,15 @@
                 'shift', 'scale', 'lower_lim', 'upper_lim')]
 
             old_segs = corr_grp['Alignment/read_segments'].value
-            old_align_vals = corr_grp['Alignment/read_alignment'].value
-            new_align_vals = corr_grp['Alignment/genome_alignment'].value
+            old_align_vals = list(map(
+                lambda x: x.decode(),
+                corr_grp['Alignment/read_alignment'].value))
+            new_align_vals = list(map(
+                lambda x: x.decode(),
+                corr_grp['Alignment/genome_alignment'].value))
     except:
         return None
 
-    events_end = event_data['start'][-1] + event_data['length'][-1]
-    new_segs = np.concatenate([event_data['start'], [events_end,]])
-
     if r_data.rna:
         signal_data = signal_data[::-1]
 
@@ -1111,24 +1238,25 @@ def get_all_read_data(r_data):
     Extract most relevant read data from this read
     """
     try:
-        with h5py.File(r_data.fn, 'r') as read_data:
+        with h5py.File(r_data.fn, 'r') as fast5_data:
             # note that it's more efficient to try to access the slot
             # and except the error that check if the slot exists first
-            corr_subgrp = read_data['/Analyses/' + r_data.corr_group]
+            corr_subgrp = fast5_data['/Analyses/' + r_data.corr_group]
             algn_subgrp = dict(corr_subgrp['Alignment'].attrs.items())
             event_data = corr_subgrp['Events'].value
             r_attrs = dict(corr_subgrp.attrs.items())
-            all_sig = read_data['/Raw/Reads'].values()[0]['Signal'].value
+            all_sig = get_raw_read_slot(fast5_data)['Signal'].value
     except:
         # probably truncated file or Events slot doesn't exist
         return None
 
     if r_data.rna:
         all_sig = all_sig[::-1]
-    r_means, r_seq = event_data['norm_mean'], event_data['base']
+    r_means = event_data['norm_mean']
+    r_seq = b''.join(event_data['base']).decode()
 
     events_end
= event_data[-1]['start'] + event_data[-1]['length'] - segs = np.concatenate([event_data['start'], [events_end,]]).astype(np.int32) + segs = np.concatenate([event_data['start'], [events_end,]]).astype(np.int64) r_sig, scale_values = normalize_raw_signal( all_sig, r_data.read_start_rel_to_raw, segs[-1] - segs[0], shift=r_attrs['shift'], scale=r_attrs['scale'], @@ -1146,15 +1274,16 @@ def get_coverage(raw_read_coverage): if VERBOSE: sys.stderr.write('Calculating read coverage.\n') read_coverage = {} for (chrm, strand), reads_data in raw_read_coverage.items(): + if len(reads_data) == 0: continue max_end = max(r_data.end for r_data in reads_data) - chrm_coverage = np.zeros(max_end, dtype=np.int_) + chrm_coverage = np.zeros(max_end, dtype=np.int64) for r_data in reads_data: chrm_coverage[r_data.start:r_data.end] += 1 read_coverage[(chrm, strand)] = chrm_coverage return read_coverage -def get_reads_events(cs_reads, rev_strand): +def get_reads_events(cs_reads): """ Extract read base levels split by genomic position """ @@ -1195,7 +1324,7 @@ def update_seq(r_data, reg_base_data, int_start, int_end): """ Update the sequence for the region based on this read """ - r_seq = ''.join(get_single_slot_read_centric(r_data, 'base')) + r_seq = b''.join(get_single_slot_read_centric(r_data, 'base')).decode() if r_seq is None: # probably a corrupt file so return that the region is only # up to the start of this read so the next valid read will be added @@ -1229,7 +1358,8 @@ def update_seq(r_data, reg_base_data, int_start, int_end): def get_seq_from_reads(int_start, int_end, reg_reads): """ - Extract the forward strand genomic sequence for an interval from a set of reads + Extract the forward strand genomic sequence for an interval from + a set of reads """ # handle case where no read overlaps whole region # let each read contibute its sequence and fill the rest @@ -1272,15 +1402,14 @@ def get_seq_from_reads(int_start, int_end, reg_reads): def add_reg_seq(all_reg_data): """ - Add the region sequence to the region data by extraction from a minimal set of reads + Add the region sequence to the region data by extraction from a minimal + set of reads """ all_reg_base_data = [] for reg_data in all_reg_data: # add text to each regions data - all_reg_base_data.append(intervalData( - reg_data.reg_id, reg_data.chrm, reg_data.start, reg_data.end, - reg_data.strand, reg_data.reg_text, reg_data.reads, - get_seq_from_reads(reg_data.start, reg_data.end, reg_data.reads))) + all_reg_base_data.append(reg_data._replace(seq=get_seq_from_reads( + reg_data.start, reg_data.end, reg_data.reads))) return all_reg_base_data @@ -1306,16 +1435,13 @@ def get_c_s_data(chrm, strand, start, end): # full coverage as previous versions of code did if int_i.strand is None: # if strand is None, get data from both strands - all_reg_data.append(intervalData( - int_i.reg_id, int_i.chrm, int_i.start, int_i.end, int_i.strand, - int_i.reg_text, - get_c_s_data(int_i.chrm, '+', int_i.start, int_i.end) + + all_reg_data.append(int_i._replace( + reads=get_c_s_data(int_i.chrm, '+', int_i.start, int_i.end) + get_c_s_data(int_i.chrm, '-', int_i.start, int_i.end))) else: - all_reg_data.append(intervalData( - int_i.reg_id, int_i.chrm, int_i.start, int_i.end, int_i.strand, - int_i.reg_text, - get_c_s_data(int_i.chrm, int_i.strand, int_i.start, int_i.end))) + all_reg_data.append(int_i._replace( + reads=get_c_s_data(int_i.chrm, int_i.strand, + int_i.start, int_i.end))) if add_seq: all_reg_data = add_reg_seq(all_reg_data) @@ -1327,13 +1453,13 @@ def get_c_s_data(chrm, 
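
Several of these hunks replace ``''.join(...)`` with ``b''.join(...).decode()`` because h5py returns fixed-width byte strings (dtype ``|S1``) under Python 3, where joining bytes yields ``bytes`` rather than ``str``. A quick self-contained illustration of why the explicit decode is needed::

    import numpy as np

    bases = np.array([b'A', b'C', b'G'], dtype='S1')  # as returned by h5py
    seq = b''.join(bases).decode()                    # native str on py2 and py3
    assert seq == 'ACG'
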
@@ -1327,13 +1453,13 @@ def get_c_s_data(chrm, strand, start, end):
         reg_data for reg_data in all_reg_data if len(reg_data.reads) > 0]

     no_cov_regions = [
-        (len(reg_data.reads) == 0, str(reg_data.chrm) + ':' + str(reg_data.start))
+        (len(reg_data.reads) == 0, unicode(reg_data.chrm) + ':' +
+         unicode(reg_data.start))
         for reg_data in all_reg_data]
     if any(no_cov[0] for no_cov in no_cov_regions):
-        sys.stderr.write(
-            '**** WARNING **** No coverage in regions: ' +
-            '; '.join([reg for no_cov, reg in no_cov_regions
-                       if no_cov]) + '\n')
+        _warning_message(
+            'No coverage in regions: ' + '; '.join([
+                reg for no_cov, reg in no_cov_regions if no_cov]))

     return all_reg_data

@@ -1348,10 +1474,8 @@ def get_region_sequences(
     all_reg_data2 = get_region_reads(
         plot_intervals, raw_read_coverage2, filter_no_cov=False,
         add_seq=False)
-    all_reg_data = [
-        intervalData(r1.reg_id, r1.chrm, r1.start, r1.end, r1.strand,
-                     r1.reg_text, r1.reads + r2.reads)
-        for r1, r2 in zip(all_reg_data, all_reg_data2)]
+    all_reg_data = [r1._replace(reads=r1.reads + r2.reads)
+                    for r1, r2 in zip(all_reg_data, all_reg_data2)]
     all_reg_data = add_reg_seq(all_reg_data)

     return all_reg_data

@@ -1361,10 +1485,19 @@ def get_region_sequences(
 ###################################
 ###### FAST5 Write Functions ######
 ###################################

-def prep_fast5(fast5_fn, corr_grp, overwrite, in_place, bc_grp=None):
+def prep_fast5(fast5_fn, corr_grp, overwrite, in_place,
+               bc_grp=None, return_fp=False):
     """
-    Prepare a read for re-squiggle processing (This deletes old info for this reads
+    Prepare a read for re-squiggle processing (This deletes old re-squiggle
+    info for this read)
     """
+    def try_close_prep_err(fast5_data, err_str):
+        try:
+            fast5_data.close()
+        except:
+            pass
+        return (err_str, fast5_fn)
+
     # several checks to prepare the FAST5 file for correction before
     # processing to save compute
     if not in_place:
@@ -1376,37 +1509,47 @@ def prep_fast5(fast5_fn, corr_grp, overwrite, in_place, bc_grp=None):
     try:
         # create group to store data
-        with h5py.File(fast5_fn, 'r+') as fast5_data:
-            try:
-                analyses_grp = fast5_data['/Analyses']
-            except:
-                return 'Analysis group not found at root of FAST5', fast5_fn
-            try:
-                # check that the requested basecalls group exists
-                if bc_grp is not None:
-                    _ = analyses_grp[bc_grp]
-            except:
-                return 'Basecall group not found at [--basecall-group]', fast5_fn
+        fast5_data = h5py.File(fast5_fn, 'r+')
+        try:
+            analyses_grp = fast5_data['/Analyses']
+        except:
+            return try_close_prep_err(
+                fast5_data, 'Analysis group not found at root of FAST5')
+        try:
+            # check that the requested basecalls group exists
+            if bc_grp is not None:
+                analyses_grp[bc_grp]
+        except:
+            return try_close_prep_err(
+                fast5_data, 'Basecall group not found at [--basecall-group]')

-            try:
-                corr_grp_ptr = analyses_grp[corr_grp]
-                if not overwrite:
-                    return (
-                        "Tombo data exists in [--corrected-group] and " +
-                        "[--overwrite] is not set", fast5_fn)
-                del analyses_grp[corr_grp]
-            except:
-                # if the corr_grp isn't there we will write it now, but
-                # it's more efficient to try than to check if the slot is there
-                pass
+        try:
+            corr_grp_ptr = analyses_grp[corr_grp]
+            if not overwrite:
+                return try_close_prep_err(
+                    fast5_data, "Tombo data exists in [--corrected-group] " +
+                    "and [--overwrite] is not set")
+            del analyses_grp[corr_grp]
+        except:
+            # if the corr_grp isn't there we will write it now, but
+            # it's more efficient to try than to check if the slot is there
+            pass

-            corr_grp = analyses_grp.create_group(corr_grp)
-            corr_grp.attrs['tombo_version'] = TOMBO_VERSION
-            corr_grp.attrs['basecall_group'] = bc_grp
+        corr_grp = analyses_grp.create_group(corr_grp)
+        corr_grp.attrs['tombo_version'] = TOMBO_VERSION
+        corr_grp.attrs['basecall_group'] = bc_grp
     except:
         return (
             'Error opening or writing to fast5 file', fast5_fn)

+    if return_fp:
+        return fast5_data
+
+    try:
+        fast5_data.close()
+    except:
+        return 'Error closing fast5 file', fast5_fn
+
     return

 def write_error_status(
@@ -1427,7 +1570,7 @@ def write_error_status(
     return

 def write_new_fast5_group(
-        filename, genome_location, read_start_rel_to_raw,
+        fast5_data, genome_location, read_start_rel_to_raw,
         new_segs, align_seq, norm_signal, scale_values, corrected_group,
         basecall_subgroup, norm_type, outlier_thresh, compute_sd,
         alignVals=None, align_info=None, old_segs=None, rna=False):
@@ -1442,29 +1585,34 @@ def write_new_fast5_group(
         norm_means = c_new_means(norm_signal, new_segs)
     norm_stds = repeat(np.NAN)

+    # had to shift to the named-format numpy array specification due to
+    # python2 numpy unicode issues. See discussion here:
+    # https://github.com/numpy/numpy/issues/2407
     event_data = np.array(
-        zip(norm_means, norm_stds,
-            new_segs[:-1], np.diff(new_segs), list(align_seq)),
-        dtype=[('norm_mean', '

+if sys.version_info[0] > 2:
+    unicode = str
+
+# import tombo functions
+from . import tombo_helper as th

-from c_helper import c_mean_std, c_calc_llh_ratio
+from .c_helper import c_mean_std, c_calc_llh_ratio
+from ._default_parameters import SMALLEST_PVAL, MIN_POSITION_SD, \
+    STANDARD_MODELS, ALTERNATE_MODELS, MIN_KMER_OBS_TO_EST, ALT_EST_BATCH, \
+    MAX_KMER_OBS, NUM_DENS_POINTS, LLR_THRESH, HYPO_THRESH, KERNEL_DENSITY_RANGE

 VERBOSE = False

-PROFILE_SIGNIF = False
-PROFILE_EST_KMER = False
+_PROFILE_SIGNIF = False
+_PROFILE_EST_REF = False
+_PROFILE_ALT_EST = False

-DEBUG_EST_STD = False
-DEBUG_EST_BW = 0.05
-DEBUG_EST_NUM_KMER_SAVE = 500
+_DEBUG_EST_STD = False
+_DEBUG_EST_BW = 0.05
+_DEBUG_EST_NUM_KMER_SAVE = 500

 DNA_BASES = ['A','C','G','T']

-MIN_POSITION_SD = 0.1

-#######################################################
-##### Pair-wise distance functions for clustering #####
-#######################################################
+HALF_NORM_EXPECTED_VAL = stats.halfnorm.expect()
+
+STANDARD_MODEL_NAME = 'standard'
+
+SAMP_COMP_TXT = 'sample_compare'
+DE_NOVO_TXT = 'de_novo'
+ALT_MODEL_TXT = 'model_compare'
+
+
+#############################################
+##### Pair-wise Distance and Clustering #####
+#############################################
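
Two py2/py3 idioms recur throughout the rest of this diff: aliasing ``unicode = str`` on Python 3 so one name stringifies values on both interpreters, and wrapping structured-array field names in ``str()`` so they are native strings on both (see the linked numpy issue). A compact, runnable sketch of both::

    import sys
    import numpy as np

    if sys.version_info[0] > 2:
        unicode = str  # on py3, unicode() now simply stringifies

    # field names must be native str on both py2 and py3
    kmer_ref = np.array(
        [('AAAA', 0.1, 0.05)],
        dtype=[(str('kmer'), 'S4'), (str('mean'), 'f8'), (str('sd'), 'f8')])
    print(unicode(kmer_ref['mean'][0]))
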
 def order_reads(log_r_pvals):
     """
@@ -76,7 +101,7 @@ def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None):
     while not index_q.empty():
         try:
             index = index_q.get(block=False)
-        except Queue.Empty:
+        except queue.Empty:
             break

         if slide_span > 0:
@@ -96,137 +121,165 @@ def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None):
     return

-#################################################
-##### Significant region selection function #####
-#################################################
+########################################
+##### Significant Region Selection #####
+########################################

-def get_most_signif_regions(
-    all_stats, num_bases, num_regions, qval_thresh=None, unique_pos=True,
-    fraction_order=False):
+def get_most_signif_regions(all_stats, num_bases, num_regions, unique_pos=True):
     """
     Select the most significant genome locations based on some criteria
     """
-    # applied threshold for scores on each chromosome, so now
-    # we include all here
-    if qval_thresh is not None:
-        num_regions = np.argmax(np.logical_and(
-            np.logical_not(np.isnan(all_stats['mt_stat'])),
-            all_stats['mt_stat'] > qval_thresh))
-        if num_regions == 0:
-            sys.stderr.write(
-                '*' * 60 + '\nERROR: No regions identified q-value ' +
-                'below thresh. Minimum q-value: {:.2g}\n'.format(
-                    all_stats[0]['mt_stat']) + '*' * 60 + '\n')
-            sys.exit()
-
-    if fraction_order:
-        # TODO consider adding valid fraction as in ROC curve script analysis
-        # first reverse sort by alternative fraction so largest alt fractions
-        # show up first as opposed to all 0 fractions randomly sorted
-        all_stats[::-1].sort(order='alt_frac')
-        all_stats.sort(order='frac')
-    else:
-        all_stats.sort(order='stat')
+    all_stats.sort(order=str('frac'))

     plot_intervals = []
     used_intervals = defaultdict(set)
     for i, stat in enumerate(all_stats):
         int_start = max(0, stat['pos'] - int(num_bases / 2.0))
-        interval_poss = range(int_start, int_start + num_bases)
-        if not unique_pos or \
-           stat['pos'] not in used_intervals[(stat['chrm'], stat['strand'])]:
-            used_intervals[(stat['chrm'], stat['strand'])].update(interval_poss)
+        chrm = stat['chrm'].decode()
+        strand = stat['strand'].decode()
+        if not unique_pos or stat['pos'] not in used_intervals[(chrm, strand)]:
+            used_intervals[(chrm, strand)].update(
+                range(int_start, int_start + num_bases))
             plot_intervals.append(th.intervalData(
-                '{:03d}'.format(i), stat['chrm'], int_start,
-                int_start + num_bases, stat['strand'],
-                'Frac Standard:{1:.2g} Frac. Alternate:{2:.2g}'.format(
-                    stat['stat'], stat['frac'], stat['alt_frac'],
-                    stat['cov'])))
+                '{:03d}'.format(i), chrm, int_start,
+                int_start + num_bases, strand,
+                'Frac. Alternate: {0:.2g} Coverage: {1}'.format(
+                    1 - stat[str('frac')], stat[str('cov')])))
             if len(plot_intervals) >= num_regions:
                 break

+    if len(plot_intervals) == 0:
+        th._error_message_and_exit(
+            'No locations identified. Most likely an empty statistics file.')
+    if len(plot_intervals) < num_regions:
+        th._warning_message(
+            'Fewer unique significant locations more than [--num-bases]/2 ' +
+            'apart were identified. Continuing with ' +
+            str(len(plot_intervals)) + ' unique locations.')
+
     return plot_intervals
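
The selection logic above sorts the statistics records on the estimated canonical-base fraction and then de-duplicates overlapping windows. The core pattern reduced to a runnable sketch (field names, window size and positions are illustrative)::

    import numpy as np
    from collections import defaultdict

    stats = np.array([(120, 0.2), (118, 0.4), (500, 0.1)],
                     dtype=[(str('pos'), 'i8'), (str('frac'), 'f8')])
    stats.sort(order=str('frac'))  # smallest canonical fraction first

    num_bases, used, picked = 10, defaultdict(set), []
    for stat in stats:
        if stat['pos'] in used['chr1']:
            continue  # overlaps an already-selected window
        start = max(0, stat['pos'] - num_bases // 2)
        used['chr1'].update(range(start, start + num_bases))
        picked.append((start, start + num_bases))
    print(picked)  # [(495, 505), (115, 125)]
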
-#######################################
-##### K-mer model estimation code #####
-#######################################
+#############################
+##### Tombo Model Class #####
+#############################
+
+class TomboModel(object):
+    """
+    Load, store and access Tombo model attributes and sequence-based expected
+    mean and standard deviation levels (median normalization only)
+    """
+    def _parse_tombo_model(self):
+        """
+        Parse a tombo model file
+        """
+        try:
+            with h5py.File(self.ref_fn, 'r') as ref_fp:
+                ref_raw = ref_fp['model'].value
+                central_pos = ref_fp.attrs['central_pos']
+                model_name = ref_fp.attrs['model_name']
+
+                try:
+                    model_name = model_name.decode()
+                except (AttributeError, TypeError):
+                    pass
+
+                try:
+                    alt_base = ref_fp.attrs['alt_base']
+                except:
+                    alt_base = None
+                try:
+                    alt_base = alt_base.decode()
+                except (AttributeError, TypeError):
+                    pass
+
+        except:
+            th._error_message_and_exit(
+                'Invalid tombo kmer model file provided: ' +
+                unicode(self.ref_fn))
+
+        mean_ref = {}
+        sd_ref = {}
+        for kmer, kmer_mean, kmer_std in ref_raw:
+            kmer = kmer.decode()
+            mean_ref[kmer] = kmer_mean
+            sd_ref[kmer] = kmer_std
+
+        self.means = mean_ref
+        self.sds = sd_ref
+        self.central_pos = central_pos
+        self.alt_base = alt_base
+        self.name = model_name
+
+        self.kmer_width = len(next(k for k in mean_ref))
+
+        return
+
+    def __init__(self, ref_fn):
+        self.ref_fn = ref_fn
+        self._parse_tombo_model()
+        self.is_std_model = (self.name == STANDARD_MODEL_NAME and
+                             self.alt_base is None)
+        self.is_alt_model = not self.is_std_model

-def write_tombo_model(kmer_ref, kmer_ref_fn, central_pos,
+
+############################
+##### Model Estimation #####
+############################
+
+def write_tombo_model(kmer_ref, ref_fn, central_pos,
                       alt_base=None, alt_name=None):
     """
     Write a tombo model file
     """
-    with h5py.File(kmer_ref_fn, 'w') as kmer_ref_fp:
-        kmer_ref_fp.create_dataset('model', data=kmer_ref, compression="gzip")
-        kmer_ref_fp.attrs['central_pos'] = central_pos
+    with h5py.File(ref_fn, 'w') as ref_fp:
+        ref_fp.create_dataset('model', data=kmer_ref, compression="gzip")
+        ref_fp.attrs['central_pos'] = central_pos
         if alt_base is None:
-            kmer_ref_fp.attrs['model_name'] = 'standard'
+            ref_fp.attrs['model_name'] = STANDARD_MODEL_NAME
         else:
-            kmer_ref_fp.attrs['model_name'] = alt_name
-            kmer_ref_fp.attrs['alt_base'] = alt_base
+            ref_fp.attrs['model_name'] = alt_name
+            ref_fp.attrs['alt_base'] = alt_base

     return

-def parse_tombo_model(kmer_ref_fn):
-    """
-    Parse a tombo model file
-    """
-    try:
-        with h5py.File(kmer_ref_fn, 'r') as kmer_ref_fp:
-            kmer_ref_raw = kmer_ref_fp['model'].value
-            central_pos = kmer_ref_fp.attrs['central_pos']
-            model_name = kmer_ref_fp.attrs['model_name']
-            try:
-                alt_base = kmer_ref_fp.attrs['alt_base']
-            except:
-                alt_base = None
-    except:
-        sys.stderr.write(
-            '********* ERROR *********\n\tInvalid tombo kmer model ' +
-            'file provided: ' + str(kmer_ref_fn) + '\n')
-        sys.exit()
-
-    kmer_ref = dict((kmer, (kmer_mean, kmer_std))
-                    for kmer, kmer_mean, kmer_std in kmer_ref_raw)
-
-    return kmer_ref, central_pos, alt_base, model_name
-
-def parse_tombo_models(alt_fns, ref_upstrm_bases, ref_kmer_width):
+def parse_tombo_models(alt_fns, std_ref):
     """
     Parse several alternative tombo model files
     """
-    alt_refs = []
+    alt_refs = {}
     for alt_model_fn in alt_fns:
-        alt_ref, alt_upstrm_bases, alt_base, alt_name = parse_tombo_model(
-            alt_model_fn)
-        if (ref_upstrm_bases != alt_upstrm_bases or
-            ref_kmer_width != len(next(alt_ref.iterkeys()))):
-            sys.stderr.write(
-                '********* WARNING *********\n\tStandard and ' + alt_model_fn +
-                ' alternative base models must be estimated using the same ' +
-                'k-mer positions.\n')
+        alt_ref = TomboModel(alt_model_fn)
+        if (std_ref.central_pos != alt_ref.central_pos or
+            std_ref.kmer_width != alt_ref.kmer_width):
+            th._warning_message(
+                'Standard and ' + alt_model_fn + ' alternative base ' +
+                'models must be estimated using the same k-mer positions.')
             continue
-        if alt_base is None:
-            sys.stderr.write(
-                '********* WARNING *********\n\tAlternative model ' +
-                alt_model_fn +
-                ' appears to be a standard model and will not be processed.\n')
+        if not alt_ref.is_alt_model:
+            th._warning_message(
+                'Alternative model ' + alt_model_fn + ' appears to be a ' +
+                'standard model and will not be processed.')
             continue
-        alt_refs.append((alt_name, alt_ref, alt_base))
+        if alt_ref.name in alt_refs:
+            th._warning_message(
+                alt_ref.name + ' alternative model found in more than one ' +
+                'model file. Ignoring: ' + alt_model_fn)
+            continue
+        alt_refs[alt_ref.name] = alt_ref

     return alt_refs

 def get_default_standard_ref(raw_read_coverage, bio_samp_type=None):
     if bio_samp_type is not None:
-        standard_ref_fn = th.STANDARD_MODELS[bio_samp_type]
+        standard_ref_fn = STANDARD_MODELS[bio_samp_type]
     elif th.is_rna(raw_read_coverage):
         if VERBOSE: sys.stderr.write(
                 'Using default canonical ***** RNA ***** model.\n')
-        standard_ref_fn = th.STANDARD_MODELS['RNA']
+        standard_ref_fn = STANDARD_MODELS['RNA']
     else:
         if VERBOSE: sys.stderr.write(
                 'Using default canonical ***** DNA ***** model.\n')
-        standard_ref_fn = th.STANDARD_MODELS['DNA']
+        standard_ref_fn = STANDARD_MODELS['DNA']
     # get full filename path with setuptools
     standard_ref_fn = pkg_resources.resource_filename(
         'tombo', 'tombo_models/' + standard_ref_fn)
@@ -235,15 +288,15 @@ def get_default_standard_ref_from_files(fast5_fns, bio_samp_type=None):
     if bio_samp_type is not None:
-        standard_ref_fn = th.STANDARD_MODELS[bio_samp_type]
+        standard_ref_fn = STANDARD_MODELS[bio_samp_type]
     elif th.is_rna_from_files(fast5_fns):
         if VERBOSE: sys.stderr.write(
                 'Using default canonical ***** RNA ***** model.\n')
-        standard_ref_fn = th.STANDARD_MODELS['RNA']
+        standard_ref_fn = STANDARD_MODELS['RNA']
     else:
         if VERBOSE: sys.stderr.write(
                 'Using default canonical ***** DNA ***** model.\n')
-        standard_ref_fn = th.STANDARD_MODELS['DNA']
+        standard_ref_fn = STANDARD_MODELS['DNA']
     # get full filename path with setuptools
     standard_ref_fn = pkg_resources.resource_filename(
         'tombo', 'tombo_models/' + standard_ref_fn)
@@ -253,19 +306,19 @@ def get_default_alt_ref(alt_name, raw_read_coverage, bio_samp_type=None):
     if bio_samp_type is not None:
         try:
-            alt_model_fn = th.ALTERNATE_MODELS[bio_samp_type + '_' + alt_name]
+            alt_model_fn = ALTERNATE_MODELS[bio_samp_type + '_' + alt_name]
         except KeyError:
             alt_model_fn = None
     elif th.is_rna(raw_read_coverage):
         bio_samp_type = 'RNA'
         try:
-            alt_model_fn = th.ALTERNATE_MODELS['RNA_' + alt_name]
+            alt_model_fn = ALTERNATE_MODELS['RNA_' + alt_name]
         except KeyError:
             alt_model_fn = None
     else:
         bio_samp_type = 'DNA'
         try:
-            alt_model_fn = th.ALTERNATE_MODELS['DNA_' + alt_name]
+            alt_model_fn = ALTERNATE_MODELS['DNA_' + alt_name]
         except KeyError:
             alt_model_fn = None
     if alt_model_fn is not None:
@@ -273,15 +326,14 @@ def get_default_alt_ref(alt_name, raw_read_coverage, bio_samp_type=None):
         # get full filename path with setuptools
         alt_model_fn = pkg_resources.resource_filename(
             'tombo', 'tombo_models/' + alt_model_fn)
     if alt_model_fn is None or not os.path.isfile(alt_model_fn):
-        sys.stderr.write(
-            '******** WARNING *********\n\tTombo default model for ' +
-            alt_name + ' in ' + bio_samp_type + ' does not exist.\n')
+        th._warning_message(
+            'Tombo default model for ' + alt_name + ' in ' +
+            bio_samp_type + ' does not exist.')
         return None, None

     return alt_model_fn, bio_samp_type

-def load_alt_refs(alt_names, raw_read_coverage,
-                  ref_upstrm_bases, ref_kmer_width, bio_samp_type=None):
+def load_alt_refs(alt_names, raw_read_coverage, std_ref, bio_samp_type=None):
     """
     Load several default alternative tombo models
     """
@@ -293,7 +345,24 @@ def load_alt_refs(alt_names, raw_read_coverage, std_ref, bio_samp_type=None):
             continue
         alt_fns.append(alt_model_fn)

-    return parse_tombo_models(alt_fns, ref_upstrm_bases, ref_kmer_width)
+    return parse_tombo_models(alt_fns, std_ref)
+
+def get_ref_from_seq(seq, std_ref, rev_strand=False, alt_ref=None):
+    seq_kmers = [seq[i:i + std_ref.kmer_width]
+                 for i in range(len(seq) - std_ref.kmer_width + 1)]
+    # get stat lookups from seq on native strand then flip if rev_strand
+    if rev_strand:
+        seq_kmers = seq_kmers[::-1]
+
+    ref_means = np.array([std_ref.means[kmer] for kmer in seq_kmers])
+    ref_sds = np.array([std_ref.sds[kmer] for kmer in seq_kmers])
+    if alt_ref is None:
+        alt_means, alt_sds = None, None
+    else:
+        alt_means = np.array([alt_ref.means[kmer] for kmer in seq_kmers])
+        alt_sds = np.array([alt_ref.sds[kmer] for kmer in seq_kmers])
+
+    return ref_means, ref_sds, alt_means, alt_sds

 def calc_med_sd(vals):
     """
@@ -320,13 +389,14 @@ def get_region_kmer_levels(
     except StopIteration:
         # if threshold is not met use all reads from region
         pass
-    base_events = th.get_reads_events(reg_reads, strand == '-')
+    base_events = th.get_reads_events(reg_reads)
     if len(base_events) == 0:
         return

-    # get intervals where coverage is high enough for model estimation
+    # get intervals within the region where coverage is high enough
+    # for model estimation
     reg_cov = np.array([len(base_events[pos]) if pos in base_events else 0
-                        for pos in xrange(reg_start, reg_start + region_size)])
+                        for pos in range(reg_start, reg_start + region_size)])
     cov_intervals = np.where(np.diff(np.concatenate(
         [[False], reg_cov > cov_thresh])))[0]
     if reg_cov[-1] > cov_thresh:
@@ -347,21 +417,21 @@ def get_region_kmer_levels(
         (dnstrm_bases, upstrm_bases)
     for int_start, int_end in cov_intervals:
         int_seq = th.get_seq_from_reads(
-            reg_start + int_start, reg_start + int_end, reg_reads)
+            reg_start + int_start - bb, reg_start + int_end + ab, reg_reads)
         if strand == '-':
-            int_seq = th.rev_comp(int_seq)
+            int_seq = th.comp_seq(int_seq)
         int_len = int_end - int_start
-        for pos in range(int_len - kmer_width + 1):
-            pos_kmer = int_seq[pos:pos+kmer_width] \
-                       if strand == '+' else \
-                       int_seq[int_len-kmer_width-pos:int_len-pos]
+        for pos in range(int_len):
+            pos_kmer = int_seq[pos:pos+kmer_width]
+            if strand == '-':
+                pos_kmer = pos_kmer[::-1]
             try:
                 if est_mean:
                     reg_kmer_levels[pos_kmer].append(c_mean_std(
-                        base_events[reg_start+pos+int_start+bb]))
+                        base_events[reg_start+pos+int_start]))
                 else:
                     reg_kmer_levels[pos_kmer].append(calc_med_sd(
-                        base_events[reg_start+pos+int_start+bb]))
+                        base_events[reg_start+pos+int_start]))
             except KeyError:
                 continue
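
``get_ref_from_seq`` turns a genomic sequence into per-position expected signal levels by sliding a k-mer window over the model's lookup tables. The same idea in isolation (a toy 2-mer model with made-up values)::

    import numpy as np

    means = {'AC': -0.5, 'CG': 0.3, 'GT': 1.1}  # toy 2-mer model
    seq, k = 'ACGT', 2
    seq_kmers = [seq[i:i + k] for i in range(len(seq) - k + 1)]
    ref_means = np.array([means[kmer] for kmer in seq_kmers])
    print(ref_means)  # [-0.5  0.3  1.1]
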
@@ -373,7 +443,7 @@ def _est_kmer_model_worker(
     while not region_q.empty():
         try:
             chrm, strand, reg_start = region_q.get(block=False)
-        except Queue.Empty:
+        except queue.Empty:
             break

         reg_reads = [r_data for r_data in raw_read_coverage[(chrm, strand)]
@@ -396,7 +466,7 @@ def _est_kmer_model_worker(
     return

-if PROFILE_EST_KMER:
+if _PROFILE_EST_REF:
     _est_kmer_model_wrapper = _est_kmer_model_worker
     def _est_kmer_model_worker(*args):
         import cProfile
@@ -431,14 +501,13 @@ def estimate_kmer_model(
         num_regions += 1

     if VERBOSE: sys.stderr.write(
-        'Extracting average kmer levels across ' + str(num_regions) +
+        'Extracting average kmer levels across ' + unicode(num_regions) +
         ' regions. (Will print a dot for each batch completed)\n')
     est_args = (
         region_q, kmer_level_q, raw_read_coverage, cov_thresh,
-        upstrm_bases, dnstrm_bases,
-        cs_cov_thresh, est_mean, region_size)
+        upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size)
     est_ps = []
-    for p_id in xrange(num_processes):
+    for p_id in range(num_processes):
        p = mp.Process(target=_est_kmer_model_worker, args=est_args)
        p.start()
        est_ps.append(p)
@@ -448,7 +517,7 @@ def estimate_kmer_model(
         try:
             reg_kmer_levels = kmer_level_q.get(block=False)
             all_reg_kmer_levels.append(reg_kmer_levels)
-        except Queue.Empty:
+        except queue.Empty:
             sleep(1)
             continue
     while not kmer_level_q.empty():
@@ -457,17 +526,17 @@ def estimate_kmer_model(
     if VERBOSE: sys.stderr.write('\n')

     if len(all_reg_kmer_levels) == 0:
-        sys.stderr.write('********** ERROR *********\n\tNo genomic positions ' +
-                         'contain --minimum-test-reads.\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'No genomic positions contain --minimum-test-reads. Consider ' +
+            'setting this option to a lower value.')

     if VERBOSE: sys.stderr.write('Tabulating k-mer model statistics.\n')
     all_kmer_mean_sds = []
     kmer_width = upstrm_bases + dnstrm_bases + 1
-    if DEBUG_EST_STD:
-        from sklearn.neighbors import KernelDensity
+    if _DEBUG_EST_STD:
         kmer_dens = []
-        save_x = np.linspace(-5, 5, DEBUG_EST_NUM_KMER_SAVE)[:, np.newaxis]
+        save_x = np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1],
+                             _DEBUG_EST_NUM_KMER_SAVE)
     for kmer in product(DNA_BASES, repeat=kmer_width):
         kmer = ''.join(kmer)
         kmer_levels = np.concatenate([
@@ -478,120 +547,179 @@ def estimate_kmer_model(
                 sum(len(reg_levs[''.join(kmer)])
                     for reg_levs in all_reg_kmer_levels)
                 for kmer in product(DNA_BASES, repeat=kmer_width))
-            sys.stderr.write(
-                'ERROR: K-mers represented in fewer observations than ' +
+            th._error_message_and_exit(
+                'K-mers represented in fewer observations than ' +
                 'requested in the provided reads. Consider a shorter ' +
-                'k-mer or providing more reads.\n\t' + str(min_obs) +
-                ' observations found in least common kmer.\n')
-            sys.exit()
+                'k-mer or providing more reads.\n\t' + unicode(min_obs) +
+                ' observations found in least common kmer.')
         all_kmer_mean_sds.append((kmer, np.median(kmer_levels[:,0]),
                                   np.mean(kmer_levels[:,1])))
-        if DEBUG_EST_STD:
-            kmer_kde = KernelDensity(
-                kernel='gaussian', bandwidth=DEBUG_EST_BW).fit(
-                    kmer_levels[:,0][:,np.newaxis])
+        if _DEBUG_EST_STD:
+            kmer_kde = stats.gaussian_kde(
+                kmer_levels[:,0],
+                bw_method=_DEBUG_EST_BW / kmer_levels[:,0].std(ddof=1))
             with np.errstate(under='ignore'):
-                kmer_dens.append((
-                    kmer, np.exp(kmer_kde.score_samples(save_x))))
+                kmer_dens.append((kmer, kmer_kde.evaluate(save_x)))

-    if DEBUG_EST_STD:
-        with open('debug_est_standard_ref.density.txt', 'w') as fp:
+    if _DEBUG_EST_STD:
+        with io.open('debug_est_standard_ref.density.txt', 'wt') as fp:
             fp.write('Kmer\tSignal\tDensity\n')
             fp.write('\n'.join('\t'.join(map(str, (kmer, x, y)))
                                for kmer, dens_i in kmer_dens
-                               for x, y in zip(save_x[:,0], dens_i)) + '\n')
+                               for x, y in zip(save_x, dens_i)) + '\n')

+    # Explicitly use byte string type names for py3 compatibility as well as
+    # pickle-ability of numpy arrays for consistency. See discussion here:
+    # https://github.com/numpy/numpy/issues/2407
     kmer_ref = np.array(
-        all_kmer_mean_sds,
-        dtype=[('kmer', 'S' + str(kmer_width)),
-               ('mean', 'f8'), ('sd', 'f8')])
+        all_kmer_mean_sds, dtype=[(str('kmer'), 'S' + unicode(kmer_width)),
                                  (str('mean'), 'f8'), (str('sd'), 'f8')])
     if not kmer_specific_sd:
         kmer_ref['sd'] = np.median(kmer_ref['sd'])
     write_tombo_model(kmer_ref, kmer_ref_fn, upstrm_bases)

     return

+
+########################################
+##### Alternative Model Estimation #####
+########################################
+
+def _parse_base_levels_worker(
+        reads_q, kmer_level_q, kmer_width, central_pos, completed_kmers):
+    dnstrm_bases = kmer_width - central_pos - 1
+    proc_kmer_levels = dict(
+        (''.join(kmer), [])
+        for kmer in product(DNA_BASES, repeat=kmer_width))
+    while not reads_q.empty():
+        try:
+            r_fn, corr_slot = reads_q.get(block=False)
+        except queue.Empty:
+            break
+
+        with h5py.File(r_fn) as fast5_data:
+            r_means, r_seq = th.get_multiple_slots_read_centric(
+                fast5_data, ['norm_mean', 'base'], corr_slot)
+        if r_means is None: continue
+        r_seq = b''.join(r_seq).decode()
+        for kmer, level in zip(
+                (r_seq[i:i + kmer_width]
+                 for i in range(len(r_seq) - kmer_width + 1)),
+                r_means[central_pos:-dnstrm_bases]):
+            if kmer in completed_kmers: continue
+            proc_kmer_levels[kmer].append(level)
+
+    kmer_level_q.put(proc_kmer_levels)
+
+    return
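
The workers above follow the same queue-draining convention used throughout the module: non-blocking ``get`` calls with ``queue.Empty`` caught (the py3 home of py2's ``Queue.Empty``). The pattern in miniature, as a standalone script::

    import multiprocessing as mp
    try:
        import queue  # py3
    except ImportError:
        import Queue as queue  # py2

    def worker(in_q, out_q):
        while not in_q.empty():
            try:
                item = in_q.get(block=False)
            except queue.Empty:
                break
            out_q.put(item * item)

    if __name__ == '__main__':
        man = mp.Manager()
        in_q, out_q = man.Queue(), man.Queue()
        for i in range(4):
            in_q.put(i)
        p = mp.Process(target=worker, args=(in_q, out_q))
        p.start()
        p.join()
        print(sorted(out_q.get() for _ in range(4)))  # [0, 1, 4, 9]
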
+def get_batch_kmer_levels(
+        reads_q, kmer_level_q, all_reads, parse_levels_batch_size,
+        std_ref, completed_kmers, num_processes):
+    no_more_reads = False
+    try:
+        for _ in range(parse_levels_batch_size):
+            r_data = next(all_reads)
+            reads_q.put((r_data.fn, r_data.corr_group))
+    except StopIteration:
+        no_more_reads = True
+
+    base_lev_args = (reads_q, kmer_level_q, std_ref.kmer_width,
+                     std_ref.central_pos, completed_kmers)
+    base_lev_ps = []
+    for p_id in range(num_processes):
+        p = mp.Process(target=_parse_base_levels_worker, args=base_lev_args)
+        p.start()
+        base_lev_ps.append(p)
+
+    batch_kmer_levels = []
+    while any(p.is_alive() for p in base_lev_ps):
+        try:
+            proc_kmer_levels = kmer_level_q.get(block=False)
+            batch_kmer_levels.append(proc_kmer_levels)
+        except queue.Empty:
+            sleep(1)
+            continue
+    while not kmer_level_q.empty():
+        proc_kmer_levels = kmer_level_q.get(block=False)
+        batch_kmer_levels.append(proc_kmer_levels)
+
+    return batch_kmer_levels, no_more_reads
+
 def parse_base_levels(
-        all_reads, kmer_width, upstrm_bases, check_min_kmer_batch,
-        kmer_obs_thresh, max_kmer_obs, min_kmer_obs_to_est):
+        all_reads, std_ref, parse_levels_batch_size, kmer_obs_thresh,
+        max_kmer_obs, min_kmer_obs_to_est, num_processes):
     """
     Parse base levels and store grouped by k-mer
     """
-    dnstrm_bases = kmer_width - upstrm_bases - 1
-    mixed_base_levels = dict(
+    manager = mp.Manager()
+    reads_q = manager.Queue()
+    kmer_level_q = manager.Queue()
+
+    all_kmer_levels = dict(
         (''.join(kmer), [])
-        for kmer in product(DNA_BASES, repeat=kmer_width))
+        for kmer in product(DNA_BASES, repeat=std_ref.kmer_width))
     # store set of k-mers with enough observations to save on memory footprint
     # while filling more rare k-mers
     completed_kmers = set()
-    n_reads = 0
-    for r_data in all_reads:
-        r_means, r_seq = th.get_multiple_slots_read_centric(
-            r_data, ['norm_mean', 'base'])
-        if r_means is None: continue
-        r_seq = ''.join(r_seq)
-        n_reads += 1
-        r_kmers = [r_seq[i:i+kmer_width]
-                   for i in range(len(r_seq)-kmer_width+1)]
-        r_base_levels = defaultdict(list)
-        for kmer, level in zip(r_kmers, r_means[upstrm_bases:-dnstrm_bases]):
-            r_base_levels[kmer].append(level)
+    all_reads = iter(all_reads)
+    n_batches = 0
+    while True:
+        batch_kmer_levels, no_more_reads = get_batch_kmer_levels(
+            reads_q, kmer_level_q, all_reads, parse_levels_batch_size,
+            std_ref, completed_kmers, num_processes)
+
         # only add observations for k-mers that have not been seen enough times
         # save memory for abundant k-mers
-        for kmer in set(r_base_levels.keys()).difference(completed_kmers):
-            mixed_base_levels[kmer].extend(r_base_levels[kmer])
-
-        # every check_min_kmer_batch check to see if there are enough
-        # observations of each kmer to continue to estimation
-        if n_reads % check_min_kmer_batch == 0:
-            kmer_levels_counts = sorted([
-                (len(kmer_levels), kmer)
-                for kmer, kmer_levels in mixed_base_levels.iteritems()])
-            if kmer_levels_counts[0][0] > kmer_obs_thresh:
-                break
-            if VERBOSE: sys.stderr.write(
-                '\t' + str(n_reads) + ' reads processed. Current ' +
-                'minimum k-mer observations: ' +
-                str(kmer_levels_counts[0][0]) + ' towards goal of ' +
-                str(kmer_obs_thresh) + '\n')
-            # re-compute completed kmers after each batch
-            completed_kmers = set()
-            for kmer_count, kmer in kmer_levels_counts[::-1]:
-                if kmer_count < max_kmer_obs:
-                    break
+        batch_total_kmers = []
+        for kmer in set(all_kmer_levels).difference(completed_kmers):
+            all_kmer_levels[kmer].extend((
+                kmer_level for proc_kmer_levels in batch_kmer_levels
+                for kmer_level in proc_kmer_levels[kmer]))
+            batch_total_kmers.append(len(all_kmer_levels[kmer]))
+            if batch_total_kmers[-1] > max_kmer_obs:
                 completed_kmers.add(kmer)

-    if min(len(base_levels) for base_levels in
-           mixed_base_levels.itervalues()) < kmer_obs_thresh:
-        fewest_kmer_obs = min(len(kmer_levels) for kmer_levels in
-                              mixed_base_levels.itervalues())
+        curr_min_kmer_count = min(batch_total_kmers)
+        if curr_min_kmer_count > kmer_obs_thresh or no_more_reads:
+            break
+
+        n_batches += 1
+        if VERBOSE: sys.stderr.write(
+            '\t' + unicode(n_batches * parse_levels_batch_size) +
+            ' reads processed. Current minimum k-mer observations: ' +
+            unicode(curr_min_kmer_count) + ' towards goal of ' +
+            unicode(kmer_obs_thresh) + '\n')
+
+    fewest_kmer_obs = min(len(kmer_levels) for kmer_levels in
+                          all_kmer_levels.values())
+    if fewest_kmer_obs < kmer_obs_thresh:
         if fewest_kmer_obs < min_kmer_obs_to_est:
-            sys.stderr.write(
-                '********* ERROR ********\n\tToo few minimal k-mer ' +
-                'observations to continue to alternative estimation. ' +
-                'Minimal k-mer has ' + str(fewest_kmer_obs) + ' total ' +
-                'observations and ' + str(min_kmer_obs_to_est) +
-                ' observations per k-mer are required.\n')
-            sys.exit()
-        sys.stderr.write(
-            '********* WARNING ********\n\tToo few minimal k-mer ' +
-            'observations. ' +
+            th._error_message_and_exit(
+                'Too few minimal k-mer observations to continue to ' +
+                'alternative estimation. Minimal k-mer has ' +
+                unicode(fewest_kmer_obs) + ' total observations and ' +
+                unicode(min_kmer_obs_to_est) +
+                ' observations per k-mer are required.')
+        th._warning_message(
+            'Requested minimal k-mer observations not found in all reads. ' +
            'Continuing to estimation using a k-mer with ' +
-            str(fewest_kmer_obs) + ' total observations\n')
+            unicode(fewest_kmer_obs) + ' total observations')

-    return mixed_base_levels
+    return all_kmer_levels

-def write_kmer_densities_file(dens_fn, kmer_dens, dens_i):
-    with open(dens_fn, 'w') as fp:
+def write_kmer_densities_file(dens_fn, kmer_dens, save_x):
+    with io.open(dens_fn, 'wt') as fp:
         fp.write('Kmer\tSignal\tDensity\n')
         fp.write('\n'.join('\t'.join(map(str, (kmer, x, y)))
                            for kmer, dens_i in kmer_dens.items()
-                           for x, y in zip(save_x[:,0], dens_i)) + '\n')
+                           for x, y in zip(save_x, dens_i)) + '\n')

     return

 def parse_kmer_densities_file(dens_fn):
     kmer_dens_raw = defaultdict(list)
-    with open(dens_fn) as dens_fp:
+    with io.open(dens_fn) as dens_fp:
         # read in header
         dens_fp.readline()
         for line in dens_fp:
@@ -603,34 +731,31 @@ def parse_kmer_densities_file(dens_fn):
     for kmer, dens_i in kmer_dens_raw.items():
         if first_len is None: first_len = len(dens_i)
         if len(dens_i) != first_len:
-            sys.stderr.write(
-                '******** ERROR *********\n\tDensity file is not valid.\n')
-            sys.exit()
+            th._error_message_and_exit('Density file is not valid.')
         kmer_dens[kmer] = np.array(dens_i)

     return kmer_dens
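
The move from sklearn's ``KernelDensity`` to scipy's ``gaussian_kde`` (next hunk) preserves the absolute kernel width: ``gaussian_kde`` multiplies a scalar ``bw_method`` factor by the sample standard deviation to get the kernel width, so dividing the desired bandwidth by ``std(ddof=1)`` recovers a fixed-width Gaussian kernel. A minimal sketch with illustrative values::

    import numpy as np
    from scipy import stats

    levels = np.random.randn(1000)
    bw = 0.05  # desired absolute kernel standard deviation
    kde = stats.gaussian_kde(levels, bw_method=bw / levels.std(ddof=1))
    grid = np.linspace(-5, 5, 500)
    dens = kde.evaluate(grid)
    # kde.factor * levels.std(ddof=1) is approximately bw
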
kmer_kde = stats.gaussian_kde( + norm_levels, bw_method=kernel_dens_bw / norm_levels.std(ddof=1)) with np.errstate(under='ignore'): - kmer_dens[kmer] = np.exp(kmer_kde.score_samples(save_x)) + kmer_dens[kmer] = kmer_kde.evaluate(save_x) if density_basename is not None: write_kmer_densities_file( @@ -642,7 +767,7 @@ def est_kernel_density( def estimate_kmer_densities( f5_dirs, control_dirs, corrected_group, basecall_subgroups, standard_ref_fn, bio_samp_type, kmer_obs_thresh, density_basename, - kernel_dens_bw, save_x): + kernel_dens_bw, save_x, num_processes): raw_read_coverage = th.parse_fast5s( f5_dirs, corrected_group, basecall_subgroups) cntrl_read_coverage = th.parse_fast5s( @@ -652,61 +777,82 @@ def estimate_kmer_densities( if standard_ref_fn is None: standard_ref_fn, bio_samp_type = get_default_standard_ref( raw_read_coverage, bio_samp_type) - standard_ref, upstrm_bases, _, _ = parse_tombo_model(standard_ref_fn) - kmer_width = len(next(standard_ref.iterkeys())) + std_ref = TomboModel(standard_ref_fn) if VERBOSE: sys.stderr.write('Parsing base levels from alternative reads\n') alt_dens = est_kernel_density( - raw_read_coverage, kmer_width, upstrm_bases, kmer_obs_thresh, - density_basename, save_x, kernel_dens_bw, 'alternate') + raw_read_coverage, std_ref, kmer_obs_thresh, density_basename, + save_x, kernel_dens_bw, num_processes, 'alternate') if VERBOSE: sys.stderr.write('Parsing base levels from standard reads\n') - standard_dens = est_kernel_density( - cntrl_read_coverage, kmer_width, upstrm_bases, kmer_obs_thresh, - density_basename, save_x, kernel_dens_bw, 'control') + std_dens = est_kernel_density( + cntrl_read_coverage, std_ref, kmer_obs_thresh, density_basename, + save_x, kernel_dens_bw, num_processes, 'control') - return alt_dens, standard_dens, standard_ref, upstrm_bases + return alt_dens, std_dens, std_ref def load_kmer_densities( - alt_dens_fn, standard_dens_fn, f5_dirs, corrected_group, - basecall_subgroups, standard_ref_fn, bio_samp_type): + alt_dens_fn, std_dens_fn, f5_dirs, corrected_group, + basecall_subgroups, std_ref_fn, bio_samp_type): if VERBOSE: sys.stderr.write('Parsing standard model file\n') - if standard_ref_fn is None: + if std_ref_fn is None: if f5_dirs is None and bio_samp_type is None: - sys.stderr.write( - '******** ERROR ********\n\tMust provide a FAST5s ' + - 'directory, a canonical model file or spcify the ' + - 'biological sample type.\n') - sys.exit() + th._error_message_and_exit( + 'Must provide a FAST5s directory, a canonical model ' + + 'file or spcify the biological sample type.') raw_read_coverage = None if f5_dirs is not None: raw_read_coverage = th.parse_fast5s( f5_dirs, corrected_group, basecall_subgroups) - standard_ref_fn, bio_samp_type = get_default_standard_ref( + std_ref_fn, bio_samp_type = get_default_standard_ref( raw_read_coverage, bio_samp_type) - standard_ref, upstrm_bases, _, _ = parse_tombo_model(standard_ref_fn) + std_ref = TomboModel(std_ref_fn) if VERBOSE: sys.stderr.write('Parsing density files\n') alt_dens = parse_kmer_densities_file(alt_dens_fn) - standard_dens = parse_kmer_densities_file(standard_dens_fn) - num_dens_points = alt_dens.values()[0].shape[0] - if num_dens_points != standard_dens.values()[0].shape[0]: - sys.stderr.write( - '******** ERROR *********\n\tAlternative and standard density ' + - 'estimates do not correspond.\n') - sys.exit() + std_dens = parse_kmer_densities_file(std_dens_fn) + num_dens_points = next(v for v in alt_dens.values()).shape[0] + if num_dens_points != next(v for v in 
std_dens.values()).shape[0]: + th._error_message_and_exit( + 'Alternative and standard density ' + + 'estimates do not correspond.') - save_x = np.linspace(-5, 5, num_dens_points)[:, np.newaxis] + save_x = np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1], + num_dens_points) - return alt_dens, standard_dens, standard_ref, upstrm_bases, save_x + return alt_dens, std_dens, std_ref, save_x def isolate_alt_density( - alt_dens, standard_dens, alt_base, alt_frac_pctl, standard_ref, save_x): - def get_alt_shift(kmer): - kmer_standard_dens = standard_dens[kmer] - kmer_alt_dens = alt_dens[kmer] + alt_dens, std_dens, alt_base, alt_frac_pctl, std_ref, save_x): + def calc_mean(dens): + return np.average(save_x[dens>1e-10], weights=dens[dens>1e-10]) + + + # estimate density shift from k-mers without the alternate base + no_alt_std_means, no_alt_mean_diffs = [], [] + for kmer in std_dens: + if alt_base in kmer: continue + no_alt_std_means.append(calc_mean(std_dens[kmer])) + kmer_alt_mean = calc_mean(alt_dens[kmer]) + no_alt_mean_diffs.append(kmer_alt_mean - no_alt_std_means[-1]) + calc_offset = np.poly1d(np.polyfit(no_alt_std_means, no_alt_mean_diffs, 2)) + save_x_unit = save_x[1] - save_x[0] + + shifted_alt_dens = {} + for kmer, kmer_alt_dens in alt_dens.items(): + est_offset = int(calc_offset(calc_mean(std_dens[kmer])) / save_x_unit) + # if std density mean is estimated to be greater + if est_offset < 0: + # shift alt dens to the right + shifted_alt_dens[kmer] = np.concatenate([ + [0.0,] * -est_offset, kmer_alt_dens[:est_offset]]) + else: + # else shift alt dens to the left + shifted_alt_dens[kmer] = np.concatenate([ + kmer_alt_dens[est_offset:], [0.0,] * est_offset]) + def get_peak_frac(kmer_std_dens, kmer_alt_dens): # find highest peak in standard density - standard_peak = np.argmax(kmer_standard_dens) + std_peak = np.argmax(kmer_std_dens) # find closest peak in alternative density alt_local_peaks = np.where(np.concatenate([ @@ -714,102 +860,89 @@ def get_alt_shift(kmer): kmer_alt_dens[1:-1] > kmer_alt_dens[:-2], kmer_alt_dens[1:-1] > kmer_alt_dens[2:]) + [False,]]))[0] matched_alt_peak = alt_local_peaks[ - np.argmin(abs(alt_local_peaks - standard_peak))] - - # shift alternative density so these peaks match - peak_offset = matched_alt_peak - standard_peak - if peak_offset < 0: - shifted_kmer_alt_dens = np.concatenate([ - [0.0,] * -peak_offset, - kmer_alt_dens[:peak_offset]]) - else: - shifted_kmer_alt_dens = np.concatenate([ - kmer_alt_dens[peak_offset:], - [0.0,] * peak_offset]) + np.argmin(abs(alt_local_peaks - std_peak))] + return kmer_alt_dens[matched_alt_peak] / kmer_std_dens[std_peak] - return ( - kmer_alt_dens[matched_alt_peak] / kmer_standard_dens[standard_peak], - shifted_kmer_alt_dens) - - # estimate alternative base incorporation rate based on single alt base - all_kmers = standard_dens.keys() - alt_peak_frac, shifted_alt_dens = zip(*[ - get_alt_shift(kmer) for kmer in all_kmers]) - shifted_alt_dens = dict(zip(all_kmers, shifted_alt_dens)) # estimate the alternative base incorporation rate - standard_frac = np.percentile([ - peak_frac for peak_frac, kmer in zip(alt_peak_frac, all_kmers) - if kmer.count(alt_base) == 1], alt_frac_pctl) + std_frac = np.percentile([ + get_peak_frac(std_dens[kmer], shifted_alt_dens[kmer]) + for kmer in std_dens if kmer.count(alt_base) == 1], alt_frac_pctl) if VERBOSE: sys.stderr.write( 'Alternative base incorporation rate estimate: ' + - str(1 - standard_frac) + '\n') - if standard_frac >= 1: - sys.stderr.write( - '******** WARNING 
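
``isolate_alt_density`` (next hunk) corrects for a global signal shift between the two samples by fitting a low-order polynomial to the density-mean differences at k-mers that lack the alternate base, then evaluating that fit for every k-mer. The numpy fit-and-evaluate pattern it relies on, with made-up numbers::

    import numpy as np

    std_means = np.array([-1.0, -0.2, 0.5, 1.3])
    mean_diffs = np.array([0.05, 0.02, -0.01, -0.04])
    calc_offset = np.poly1d(np.polyfit(std_means, mean_diffs, 2))
    print(calc_offset(0.0))  # estimated shift at a standard mean of 0
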
 def isolate_alt_density(
-        alt_dens, standard_dens, alt_base, alt_frac_pctl, standard_ref, save_x):
-    def get_alt_shift(kmer):
-        kmer_standard_dens = standard_dens[kmer]
-        kmer_alt_dens = alt_dens[kmer]
+        alt_dens, std_dens, alt_base, alt_frac_pctl, std_ref, save_x):
+    def calc_mean(dens):
+        return np.average(save_x[dens>1e-10], weights=dens[dens>1e-10])
+
+
+    # estimate density shift from k-mers without the alternate base
+    no_alt_std_means, no_alt_mean_diffs = [], []
+    for kmer in std_dens:
+        if alt_base in kmer: continue
+        no_alt_std_means.append(calc_mean(std_dens[kmer]))
+        kmer_alt_mean = calc_mean(alt_dens[kmer])
+        no_alt_mean_diffs.append(kmer_alt_mean - no_alt_std_means[-1])
+    calc_offset = np.poly1d(np.polyfit(no_alt_std_means, no_alt_mean_diffs, 2))
+    save_x_unit = save_x[1] - save_x[0]
+
+    shifted_alt_dens = {}
+    for kmer, kmer_alt_dens in alt_dens.items():
+        est_offset = int(calc_offset(calc_mean(std_dens[kmer])) / save_x_unit)
+        # if std density mean is estimated to be greater
+        if est_offset < 0:
+            # shift alt dens to the right
+            shifted_alt_dens[kmer] = np.concatenate([
+                [0.0,] * -est_offset, kmer_alt_dens[:est_offset]])
+        else:
+            # else shift alt dens to the left
+            shifted_alt_dens[kmer] = np.concatenate([
+                kmer_alt_dens[est_offset:], [0.0,] * est_offset])

+    def get_peak_frac(kmer_std_dens, kmer_alt_dens):
         # find highest peak in standard density
-        standard_peak = np.argmax(kmer_standard_dens)
+        std_peak = np.argmax(kmer_std_dens)
         # find closest peak in alternative density
         alt_local_peaks = np.where(np.concatenate([
@@ -714,102 +860,89 @@ def get_alt_shift(kmer):
             kmer_alt_dens[1:-1] > kmer_alt_dens[:-2],
             kmer_alt_dens[1:-1] > kmer_alt_dens[2:]) + [False,]]))[0]
         matched_alt_peak = alt_local_peaks[
-            np.argmin(abs(alt_local_peaks - standard_peak))]
-
-        # shift alternative density so these peaks match
-        peak_offset = matched_alt_peak - standard_peak
-        if peak_offset < 0:
-            shifted_kmer_alt_dens = np.concatenate([
-                [0.0,] * -peak_offset,
-                kmer_alt_dens[:peak_offset]])
-        else:
-            shifted_kmer_alt_dens = np.concatenate([
-                kmer_alt_dens[peak_offset:],
-                [0.0,] * peak_offset])
+            np.argmin(abs(alt_local_peaks - std_peak))]
+        return kmer_alt_dens[matched_alt_peak] / kmer_std_dens[std_peak]

-        return (
-            kmer_alt_dens[matched_alt_peak] / kmer_standard_dens[standard_peak],
-            shifted_kmer_alt_dens)
-
-    # estimate alternative base incorporation rate based on single alt base
-    all_kmers = standard_dens.keys()
-    alt_peak_frac, shifted_alt_dens = zip(*[
-        get_alt_shift(kmer) for kmer in all_kmers])
-    shifted_alt_dens = dict(zip(all_kmers, shifted_alt_dens))
     # estimate the alternative base incorporation rate
-    standard_frac = np.percentile([
-        peak_frac for peak_frac, kmer in zip(alt_peak_frac, all_kmers)
-        if kmer.count(alt_base) == 1], alt_frac_pctl)
+    std_frac = np.percentile([
+        get_peak_frac(std_dens[kmer], shifted_alt_dens[kmer])
+        for kmer in std_dens if kmer.count(alt_base) == 1], alt_frac_pctl)
     if VERBOSE: sys.stderr.write(
         'Alternative base incorporation rate estimate: ' +
-        str(1 - standard_frac) + '\n')
-    if standard_frac >= 1:
-        sys.stderr.write(
-            '******** WARNING *******\n\tAlternative base incorporation rate ' +
+        unicode(1 - std_frac) + '\n')
+    if std_frac >= 1:
+        th._warning_message(
+            'Alternative base incorporation rate ' +
             'estimate is approximately 0. Consider lowering ' +
-            '--alt-fraction-percentile.\n')
+            '--alt-fraction-percentile.')

     # get mean model SD. most models will be constant, but use mean in case
-    model_sd = np.mean(zip(*standard_ref.values())[1])
+    model_sd = np.mean(list(std_ref.sds.values()))
     # subtract off fraction of standard density from alt density
     # to estimate mean of isolated alternative distribution
     alt_ref = []
-    for kmer, (standard_level, _) in standard_ref.iteritems():
+    for kmer, std_level in std_ref.means.items():
         if kmer.count(alt_base) == 0:
-            alt_ref.append((kmer, standard_level, model_sd))
+            alt_ref.append((kmer, std_level, model_sd))
             continue
         # assuming random incorporation the proportion of standard base
         # observations at this k-mer is the standard fraction raised
-        # to the number of alt_base occurrences in the sample
-        kmer_standard_frac = standard_frac**kmer.count(alt_base)
+        # to the number of alt_base occurrences in the k-mer
+        kmer_std_frac = std_frac**kmer.count(alt_base)
         with np.errstate(under='ignore'):
             diff_dens = shifted_alt_dens[kmer] - (
-                standard_dens[kmer] * kmer_standard_frac)
+                std_dens[kmer] * kmer_std_frac)
             diff_dens[diff_dens < 0] = 0
-            alt_level = np.average(save_x[:,0], weights=diff_dens)
+            alt_level = np.average(save_x, weights=diff_dens)
         alt_ref.append((kmer, alt_level, model_sd))

-    kmer_width = len(next(standard_ref.iterkeys()))
-    alt_ref = np.array(alt_ref, dtype=[('kmer', 'S' + str(kmer_width)),
-                                       ('mean', 'f8'), ('sd', 'f8')])
+    alt_ref = np.array(alt_ref, dtype=[
        (str('kmer'), 'S' + unicode(std_ref.kmer_width)),
        (str('mean'), 'f8'), (str('sd'), 'f8')])

     return alt_ref

 def estimate_alt_model(
         f5_dirs, control_dirs, corrected_group, basecall_subgroups,
-        standard_ref_fn, bio_samp_type, alt_base, alt_frac_pctl, kmer_obs_thresh,
-        density_basename, kernel_dens_bw, alt_dens_fn, standard_dens_fn,
-        num_dens_points=500):
+        std_ref_fn, bio_samp_type, alt_base, alt_frac_pctl,
+        kmer_obs_thresh, density_basename, kernel_dens_bw, alt_dens_fn,
+        std_dens_fn, num_processes, num_dens_points=NUM_DENS_POINTS):
     """
     Estimate an alternative model from a sample with a single,
     known, randomly-incorporated alternative base
     """
-    if alt_dens_fn is not None and standard_dens_fn is not None:
-        (alt_dens, standard_dens, standard_ref,
-         upstrm_bases, save_x) = load_kmer_densities(
-             alt_dens_fn, standard_dens_fn, f5_dirs, corrected_group,
-             basecall_subgroups, standard_ref_fn, bio_samp_type)
+    if alt_dens_fn is None or std_dens_fn is None:
+        save_x = np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1],
+                             num_dens_points)
+        alt_dens, std_dens, std_ref = estimate_kmer_densities(
+            f5_dirs, control_dirs, corrected_group, basecall_subgroups,
+            std_ref_fn, bio_samp_type, kmer_obs_thresh, density_basename,
+            kernel_dens_bw, save_x, num_processes)
     else:
-        save_x = np.linspace(-5, 5, num_dens_points)[:, np.newaxis]
-        (alt_dens, standard_dens, standard_ref,
-         upstrm_bases) = estimate_kmer_densities(
-             f5_dirs, control_dirs, corrected_group, basecall_subgroups,
-             standard_ref_fn, bio_samp_type, kmer_obs_thresh, density_basename,
-             kernel_dens_bw, save_x)
+        alt_dens, std_dens, std_ref, save_x = load_kmer_densities(
+            alt_dens_fn, std_dens_fn, f5_dirs, corrected_group,
+            basecall_subgroups, std_ref_fn, bio_samp_type)

     if VERBOSE: sys.stderr.write('Isolating alternative base distributions\n')
     # perform alternative density isolation algorithm
     alt_ref = isolate_alt_density(
-        alt_dens, standard_dens, alt_base, alt_frac_pctl, standard_ref,
-        save_x)
+        alt_dens, std_dens, alt_base, alt_frac_pctl, std_ref, save_x)

-    return alt_ref, upstrm_bases
+    return alt_ref, std_ref.central_pos

+if _PROFILE_ALT_EST:
+    _est_alt_wrapper = estimate_alt_model
+    def estimate_alt_model(*args):
+        import cProfile
+        cProfile.runctx('_est_alt_wrapper(*args)', globals(), locals(),
                        filename='est_alt_model.prof')
+        return None, None

-#########################################
-##### Base statistical testing code #####
-#########################################
+
+####################################
+##### Core Statistical Testing #####
+####################################

 def p_value_to_z_score(pvalue):
     """
@@ -834,7 +967,7 @@ def correct_multiple_testing(pvals):
     sortrevind = pvals_sortind.argsort()

     nobs = len(pvals)
-    ecdffcator = np.arange(1,nobs+1)/float(nobs)
+    ecdffcator = np.arange(1, nobs + 1) / nobs
     # ignore underflow values
     with np.errstate(under='ignore'):
         pvals_corrected_raw = pvals_sorted / ecdffcator
@@ -870,11 +1003,11 @@ def calc_window_fishers_method(pvals, lag):
     assert lag > 0, 'Invalid p-value window provided.'
     width = (lag * 2) + 1
     if pvals.shape[-1] < width:
-        raise NotImplementedError, (
+        raise NotImplementedError(
            "P-values vector too short for Fisher's Method " +
            "window computation.")
     with np.errstate(invalid='ignore'):
-        pvals = np.maximum(pvals, th.SMALLEST_PVAL)
+        pvals = np.maximum(pvals, SMALLEST_PVAL)
     log_sums = np.lib.stride_tricks.as_strided(
         np.log(pvals),
         shape=pvals.shape[:-1] + (pvals.shape[-1] - width + 1, width),
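
``calc_window_fishers_method`` combines neighboring p-values with Fisher's method: under the null, -2 * sum(ln p_i) over a window of k tests follows a chi-squared distribution with 2k degrees of freedom. A direct sketch using a convolution in place of the stride trick::

    import numpy as np
    from scipy import stats

    pvals = np.array([0.9, 0.02, 0.01, 0.03, 0.8])
    width = 3  # a lag of 1 on each side
    fisher_stat = -2 * np.convolve(np.log(pvals), np.ones(width), 'valid')
    comb_pvals = stats.chi2.sf(fisher_stat, df=2 * width)
    print(comb_pvals)  # small where neighboring p-values are jointly small
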
@@ -926,32 +1059,17 @@ def calc_mann_whitney_z_score(samp1, samp2):
     return z

-def add_multiple_testing(all_stats):
-    """
-    Add multiple testing to a set of statistics
-    """
-    if len(all_stats) == 0:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: No regions contain minimum ' +
-            'number of reads.\n' + '*' * 60 + '\n')
-        sys.exit()
-
-    # get FDR corrected q-values
-    all_stats.sort(order='stat')
-    all_stats['mt_stat'] = correct_multiple_testing(all_stats['stat'])
-
-    return all_stats
-
-###################################
-##### Model-based re-squiggle #####
-###################################
+#########################################
+##### Local Model-based Re-squiggle #####
+#########################################

 def get_dynamic_prog_params(match_evalue):
     """
-    Compute dynamic programming shift parameters from an expected match expected value
+    Compute dynamic programming shift parameters from an expected match
+    expected value
     """
-    z_shift = stats.halfnorm.expect() + match_evalue
+    z_shift = HALF_NORM_EXPECTED_VAL + match_evalue
     stay_pen = match_evalue
     return z_shift, stay_pen
@@ -968,7 +1086,8 @@ def get_begin_nan(arr):

 def get_read_signif_shift_regions(
         z_scores, z_thresh, context_bases, signif_shift_len_thresh=None):
     """
-    Identify regions along a read that do not match well with the genomic reference tombo model
+    Identify regions along a read that do not match well with the genomic
+    reference tombo model
     """
     # extend NANs by context_bases to avoid regions extending outside of
     # valid regions over which statistics were computed
@@ -986,8 +1105,8 @@ def get_read_signif_shift_regions(
     signif_shift_regs = zip(signif_shift_chngpnts[:-1:2],
                             signif_shift_chngpnts[1::2])
     signif_shift_cntxt_regs = []
-    curr_start, curr_end = signif_shift_regs[0]
-    for reg_start, reg_end in signif_shift_regs[1:]:
+    curr_start, curr_end = next(signif_shift_regs)
+    for reg_start, reg_end in signif_shift_regs:
         # if next region overlaps the current region with context
         if reg_start - (context_bases * 2) <= curr_end:
             # extend the current region to cover both regions
@@ -1011,30 +1130,179 @@ def get_read_signif_shift_regions(
     return signif_shift_cntxt_regs

-################################
-##### Base-by-base Testing #####
-################################
+##########################
+##### Statistics I/O #####
+##########################

-def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars,
-                   reg_alt_means, reg_alt_vars):
+def parse_stats(stats_fn):
     """
-    Compute log likelihood ratio
+    Parse a tombo statistics file
     """
-    # TODO fix cython implementation
-    # compute log likelihood ratio
-    # negative value means standard base fits data better
-    # positive value means alternative base fits data better
-    return (np.sum(np.square(reg_means - reg_alt_means) / reg_alt_vars) +
-            np.sum(np.log(reg_alt_vars))) - (
-                np.sum(np.square(reg_means - reg_ref_means) / reg_ref_vars) +
-                np.sum(np.log(reg_ref_vars)))
+    if stats_fn is None or not os.path.isfile(stats_fn):
+        th._error_message_and_exit(
+            'Statistics file not provided or provided file does not exist.')

-def get_reads_ref(ctrl_reg_reads, reverse_strand, reg_start, region_size,
+    try:
+        with h5py.File(stats_fn, 'r') as stats_fp:
+            all_stats = stats_fp['stats'].value
+            try:
+                stat_type = stats_fp.attrs['stat_type']
+            except:
+                # if this is the old stats file assume sample compare
+                stat_type = SAMP_COMP_TXT
+    except:
+        th._error_message_and_exit(
+            'Attempt to load statistics file failed. May be an old ' +
+            'version of statistics file. Try deleting statistics ' +
+            'file and recalculating using current tombo version.')
+
+    return all_stats, stat_type
+
+def write_stats(all_stats, stats_bsnm, stat_type):
+    """
+    Write a tombo statistics file
+    """
+    if VERBOSE: sys.stderr.write(
+        'Saving signal shift significance testing results.\n')
+    if stat_type == ALT_MODEL_TXT:
+        # for alternative model testing, write one stats file per
+        # alternative model
+        for alt_name, alt_stats in all_stats:
+            with h5py.File(stats_bsnm + '.' + alt_name +
+                           '.tombo.stats', 'w') as stats_fp:
+                stats_fp.create_dataset(
+                    'stats', data=alt_stats, compression="gzip")
+                stats_fp.attrs['stat_type'] = stat_type
+    else:
+        with h5py.File(stats_bsnm + '.tombo.stats', 'w') as stats_fp:
+            stats_fp.create_dataset('stats', data=all_stats, compression="gzip")
+            stats_fp.attrs['stat_type'] = stat_type
+
+    return
+
+class PerReadStats(object):
+    def __init__(self, per_read_stats_fn, stat_type=None, region_size=None):
+        """
+        Open per-read statistics file. If stat_type and region_size are provided
+        the file is opened for writing, else it is opened for random access.
+
+        WARNING: If stat_type and region_size are provided the current file's
+        contents will be deleted.
+        """
+        if stat_type is None or region_size is None:
+            # open file for reading
+            try:
+                self.fp = h5py.File(per_read_stats_fn, 'r')
+                self.stat_type = self.fp.attrs['stat_type']
+                self.region_size = self.fp.attrs['block_size']
+                self.per_read_blocks = self.fp['Statistic_Blocks']
+                blocks_index = defaultdict(list)
+                for block_name, block_data in self.per_read_blocks.items():
+                    blocks_index[
+                        (block_data.attrs['chrm'],
+                         block_data.attrs['strand'])].append((
+                             block_data.attrs['start'],
+                             block_data.attrs['start'] + self.region_size,
+                             block_name))
+                self.blocks_index = dict(blocks_index)
+            except:
+                th._error_message_and_exit(
+                    'Non-existent or invalid per-read statistics file provided.')
+        else:
+            # set class attributes
+            self.stat_type = stat_type
+            self.region_size = region_size
+            self.curr_block_num = 0
+
+            # try to remove file for overwriting old results
+            try:
+                os.remove(per_read_stats_fn)
+            except:
+                pass
+            # open file for writing
+            self.fp = h5py.File(per_read_stats_fn, 'w')
+
+            # save attributes to file and open stats blocks group
+            self.fp.attrs['stat_type'] = stat_type
+            self.fp.attrs['block_size'] = region_size
+            self.per_read_blocks = self.fp.create_group('Statistic_Blocks')
+
+        self.are_pvals = self.stat_type != ALT_MODEL_TXT
+
+        return
+
+    def write_per_read_block(self, per_read_block, chrm, strand, start):
+        """
+        Write region statistics block to file.
+        """
+        try:
+            block_data = self.per_read_blocks.create_group(
+                'Block_' + unicode(self.curr_block_num))
+            self.curr_block_num += 1
+        except:
+            th._warning_message(
+                'Per-read statistics file not opened for writing.')
+            return
+
+        block_data.attrs['chrm'] = chrm
+        block_data.attrs['strand'] = strand
+        block_data.attrs['start'] = start
+        block_data.create_dataset(
+            'block_stats', data=per_read_block, compression="gzip")
+
+        return
+        """
+        try:
+            cs_blocks = self.blocks_index[(
+                interval_data.chrm, interval_data.strand)]
+        except KeyError:
+            return
+
+        int_block_stats = [
+            self.per_read_blocks[block_data[2]]['block_stats'].value
+            for block_data in cs_blocks
+            if not(interval_data.end < block_data[0] or
+                   interval_data.start > block_data[1])]
+        if len(int_block_stats) == 0:
+            return
+
+        if len(int_block_stats) == 1:
+            int_block_stats = int_block_stats[0]
+        else:
+            int_block_stats = np.concatenate(int_block_stats)
+
+        all_int_stats = int_block_stats[
+            (int_block_stats['pos'] >= interval_data.start) &
+            (int_block_stats['pos'] < interval_data.end)]
+
+        read_ids = set(all_int_stats['read_id'])
+        if num_reads is not None and num_reads < len(read_ids):
+            int_plot_reads = set(random.sample(read_ids, num_reads))
+            # build an element-wise boolean mask; a bare `in` test against a
+            # numpy array does not perform per-element membership
+            all_int_stats = all_int_stats[np.array([
+                r_id in int_plot_reads
+                for r_id in all_int_stats['read_id']])]
+
+        return all_int_stats
+
+    def close(self):
+        self.fp.close()
+        return
+
+
+################################
+##### Base-by-base Testing #####
+################################
+
+def get_reads_ref(ctrl_reg_reads, reg_start, region_size,
                   min_test_vals, fm_offset):
     """
     Get mean and standard deviation of levels from a sample across the genome
     """
-    ctrl_base_events = th.get_reads_events(ctrl_reg_reads, reverse_strand)
+    ctrl_base_events = th.get_reads_events(ctrl_reg_reads)
     if ctrl_base_events is None:
         raise NotImplementedError
     arr_size = region_size + (fm_offset * 2)
@@ -1042,7 +1310,7 @@ def get_reads_ref(ctrl_reg_reads, reverse_strand, reg_start, region_size,
     ctrl_means[:] = np.NAN
     ctrl_sds[:] = np.NAN
     ctrl_cov = {}
-    for pos, pos_events in sorted(ctrl_base_events.iteritems()):
+    for pos, pos_events in sorted(ctrl_base_events.items()):
         # if position is past the end of the region return
         if pos - fm_offset >= reg_start + region_size:
             break
@@ -1058,246 +1326,324 @@ def get_reads_ref(ctrl_reg_reads, reverse_strand, reg_start, region_size,
     return ctrl_means, ctrl_sds, ctrl_cov
 
-def get_region_stats(
-        chrm, strand, reg_start, reg_reads,
-        fm_offset, min_test_vals, region_size, single_read_thresh,
-        ctrl_reg_reads, kmer_ref, upstrm_bases, alt_ref, alt_base):
+def compute_sample_compare_read_stats(
+        r_data, ctrl_means, ctrl_sds, fm_offset, reg_start, region_size):
     """
-    Compute requested statistics for a specific region of the genome
+    Compute significance statistics using the comparison of two sequencing
+    samples method for a single read within a specified genomic region.
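+
+    Each read-centric level mean is converted to a z-score against the
+    control sample mean and standard deviation at that genomic position,
+    two-sided normal p-values are taken from those z-scores and, when
+    fm_offset > 0, p-values are combined over a sliding window with
+    Fisher's method.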
""" - if ctrl_reg_reads is None: - # compute begin and end lag wrt the genome from upstream and downstream - # which are wrt to the read - kmer_width = len(next(kmer_ref.iterkeys())) - dnstrm_bases = kmer_width - upstrm_bases - 1 - begin_lag = upstrm_bases if strand == '+' else dnstrm_bases - end_lag = dnstrm_bases if strand == '+' else upstrm_bases - - def get_read_comp_stats(r_data, ctrl_means, ctrl_sds): - def comp_clip_and_flip(): - r_means = th.get_single_slot_read_centric(r_data, 'norm_mean') - if r_means is None: - raise NotImplementedError, ( - 'Read does not contain re-squiggled level means.') - - read_start, read_end = r_data.start, r_data.end - if read_start + fm_offset < reg_start: - num_start_clip = reg_start - (read_start + fm_offset) - read_start = reg_start - fm_offset - if strand == '+': - r_means = r_means[num_start_clip:] - else: - r_means = r_means[:-num_start_clip] - if read_end - fm_offset > reg_start + region_size: - num_end_clip = (read_end - fm_offset) - (reg_start + region_size) - read_end = reg_start + region_size + fm_offset - if strand == '+': - r_means = r_means[:-num_end_clip] - else: - r_means = r_means[num_end_clip:] + def comp_clip_and_flip(): + with h5py.File(r_data.fn, 'r') as fast5_data: + r_means = th.get_single_slot_read_centric( + fast5_data, 'norm_mean', r_data.corr_group) + read_id = th.get_raw_read_slot(fast5_data).attrs['read_id'] + if r_means is None: + raise NotImplementedError( + 'Read does not contain re-squiggled level means.') + + read_start, read_end = r_data.start, r_data.end + if read_start + fm_offset < reg_start: + num_start_clip = reg_start - (read_start + fm_offset) + read_start = reg_start - fm_offset + if r_data.strand == '+': + r_means = r_means[num_start_clip:] + else: + r_means = r_means[:-num_start_clip] + if read_end - fm_offset > reg_start + region_size: + num_end_clip = (read_end - fm_offset) - (reg_start + region_size) + read_end = reg_start + region_size + fm_offset + if r_data.strand == '+': + r_means = r_means[:-num_end_clip] + else: + r_means = r_means[num_end_clip:] - # flip means to match genomic positions - if strand == '-': - r_means = r_means[::-1] - - return r_means, read_start, read_end - - def get_read_comp_z_score(r_means, read_start, read_end): - r_z_scores = np.abs( - r_means - ctrl_means[read_start-reg_start+fm_offset: - read_end-reg_start+fm_offset]) / ctrl_sds[ - read_start-reg_start+fm_offset: - read_end-reg_start+fm_offset] - - return r_z_scores - - def get_pvals(r_z_scores): - # mask out nan z-scores for efficient CDF computation - r_poss = np.where(np.logical_not(np.isnan(r_z_scores)))[0] - r_pvals = np.empty(r_z_scores.shape) - r_pvals[:] = np.NAN - valid_r_pvals = stats.norm.cdf(-r_z_scores[r_poss]) * 2.0 - r_pvals[r_poss] = valid_r_pvals - - return r_pvals, r_poss - - r_means, read_start, read_end = comp_clip_and_flip() - r_z_scores = get_read_comp_z_score(r_means, read_start, read_end) - - if np.sum(np.logical_not(np.isnan(r_z_scores))) == 0: - raise NotImplementedError, 'No valid z-scores in read.' 
- r_pvals, r_poss = get_pvals(r_z_scores) - if fm_offset > 0: - r_pvals = calc_window_fishers_method(r_pvals, fm_offset) - r_poss = np.where(np.logical_not(np.isnan(r_pvals)))[0] - r_pvals = r_pvals[r_poss] - else: - r_pvals = r_pvals[r_poss] + # flip means to match genomic positions + if r_data.strand == '-': + r_means = r_means[::-1] + + return r_means, read_start, read_end, read_id - r_poss += read_start + def get_read_comp_z_score(r_means, read_start, read_end): + r_z_scores = np.abs( + r_means - ctrl_means[read_start-reg_start+fm_offset: + read_end-reg_start+fm_offset]) / ctrl_sds[ + read_start-reg_start+fm_offset: + read_end-reg_start+fm_offset] + + return r_z_scores + + def get_pvals(r_z_scores): + # mask out nan z-scores for efficient CDF computation + r_poss = np.where(np.logical_not(np.isnan(r_z_scores)))[0] + r_pvals = np.empty(r_z_scores.shape) + r_pvals[:] = np.NAN + valid_r_pvals = stats.norm.cdf(-r_z_scores[r_poss]) * 2.0 + r_pvals[r_poss] = valid_r_pvals return r_pvals, r_poss - def clip_and_flip_data(r_data): - def get_mean_seq(): - r_means, r_seq = th.get_multiple_slots_read_centric( - r_data, ['norm_mean', 'base']) - if r_means is None or r_seq is None or len(r_seq) <= kmer_width: - raise NotImplementedError, ( - 'Read does not contain valid re-squiggled data.') - r_seq = ''.join(r_seq) - - read_start, read_end = r_data.start, r_data.end - # clip read if it extends outside the current genomic region, so - # stats are only computed within this region - if read_start + begin_lag + fm_offset < reg_start: - num_start_clip = reg_start - (read_start + begin_lag + fm_offset) - read_start = reg_start - begin_lag - fm_offset - if strand == '+': - r_means = r_means[num_start_clip:] - r_seq = r_seq[num_start_clip:] - else: - r_means = r_means[:-num_start_clip] - r_seq = r_seq[:-num_start_clip] - if read_end - end_lag - fm_offset > reg_start + region_size: - num_end_clip = (read_end - end_lag - fm_offset) - ( - reg_start + region_size) - read_end = reg_start + region_size + end_lag + fm_offset - if strand == '+': - r_means = r_means[:-num_end_clip] - r_seq = r_seq[:-num_end_clip] - else: - r_means = r_means[num_end_clip:] - r_seq = r_seq[num_end_clip:] + r_means, read_start, read_end, read_id = comp_clip_and_flip() + r_z_scores = get_read_comp_z_score(r_means, read_start, read_end) - # if this read does not cover enough of this region for stat - # computation raise an error to be handled below - if len(r_seq) < kmer_width: - raise NotImplementedError, ( - 'Read does not contain information in this region.') + if np.sum(np.logical_not(np.isnan(r_z_scores))) == 0: + raise NotImplementedError('No valid z-scores in read.') + r_pvals, r_poss = get_pvals(r_z_scores) + if fm_offset > 0: + r_pvals = calc_window_fishers_method(r_pvals, fm_offset) + r_poss = np.where(np.logical_not(np.isnan(r_pvals)))[0] + r_pvals = r_pvals[r_poss] + else: + r_pvals = r_pvals[r_poss] - return r_means, r_seq, read_start, read_end + r_poss += read_start - def get_refs(r_seq): - # get stat lookups from read in native strand then flip if - # read is on reverse strand - if strand == '-': - r_ref_means, r_ref_sds = map(np.array, zip(*[ - kmer_ref[r_seq[i:i+kmer_width]] - for i in range(len(r_seq)-kmer_width+1)][::-1])) - if alt_ref is not None: - r_alt_means, r_alt_sds = map(np.array, zip(*[ - alt_ref[r_seq[i:i+kmer_width]] - for i in range(len(r_seq)-kmer_width+1)][::-1])) + return r_pvals, r_poss, read_id + +def compute_de_novo_read_stats( + r_data, gnm_begin_lag, gnm_end_lag, fm_offset, reg_start, + region_size, 
std_ref):
+    """
+    Compute significance statistics using the de novo comparison to a
+    canonical model method for a single read within a specified genomic
+    region.
+    """
+    def de_novo_clip_and_flip():
+        with h5py.File(r_data.fn, 'r') as fast5_data:
+            r_means, r_seq = th.get_multiple_slots_read_centric(
+                fast5_data, ['norm_mean', 'base'], r_data.corr_group)
+            read_id = th.get_raw_read_slot(fast5_data).attrs['read_id']
+
+        if r_means is None or r_seq is None:
+            raise NotImplementedError(
+                'Read does not contain valid re-squiggled data.')
+        r_seq = b''.join(r_seq).decode()
+
+        read_start, read_end = r_data.start, r_data.end
+        # clip read if it extends outside the current genomic region, so
+        # stats are only computed within this region
+        if read_start + gnm_begin_lag + fm_offset < reg_start:
+            num_start_clip = reg_start - (read_start + gnm_begin_lag + fm_offset)
+            read_start = reg_start - gnm_begin_lag - fm_offset
+            if r_data.strand == '+':
+                r_means = r_means[num_start_clip:]
+                r_seq = r_seq[num_start_clip:]
+            else:
+                r_means = r_means[:-num_start_clip]
+                r_seq = r_seq[:-num_start_clip]
+        if read_end - gnm_end_lag - fm_offset > reg_start + region_size:
+            num_end_clip = (read_end - gnm_end_lag - fm_offset) - (
+                reg_start + region_size)
+            read_end = reg_start + region_size + gnm_end_lag + fm_offset
+            if r_data.strand == '+':
+                r_means = r_means[:-num_end_clip]
+                r_seq = r_seq[:-num_end_clip]
             else:
-            r_ref_means, r_ref_sds = map(np.array, zip(*[
-                kmer_ref[r_seq[i:i+kmer_width]]
-                for i in range(len(r_seq)-kmer_width+1)]))
-            if alt_ref is not None:
-                r_alt_means, r_alt_sds = map(np.array, zip(*[
-                    alt_ref[r_seq[i:i+kmer_width]]
-                    for i in range(len(r_seq)-kmer_width+1)]))
-        if alt_ref is None:
-            return r_ref_means, r_ref_sds, None, None
-        return r_ref_means, r_ref_sds, r_alt_means, r_alt_sds
-
-        r_means, r_seq, read_start, read_end = get_mean_seq()
-        r_ref_means, r_ref_sds, r_alt_means, r_alt_sds = get_refs(r_seq)
+                r_means = r_means[num_end_clip:]
+                r_seq = r_seq[num_end_clip:]
 
-        if strand == '-':
-            # reverse means and seq to match genomic order
+        # if this read does not cover enough of this region for stat
+        # computation raise an error to be handled below
+        if len(r_seq) < std_ref.kmer_width:
+            raise NotImplementedError(
+                'Read does not contain information in this region.')
+
+        r_ref_means, r_ref_sds, _, _ = get_ref_from_seq(
+            r_seq, std_ref, r_data.strand == '-')
+
+        if r_data.strand == '-':
+            # reverse means to match genomic order
             r_means = r_means[::-1]
-            r_seq = r_seq[::-1]
-        # clip means and seq that won't have model info due to only having read
-        # sequence
-        r_means = r_means[begin_lag:-end_lag]
-        r_seq = r_seq[begin_lag:-end_lag]
-        read_start += begin_lag
-        read_end -= end_lag
+        # clip means that don't have testing model data
+        # note that this is still extended by fm_offset
+        r_means = r_means[gnm_begin_lag:-gnm_end_lag]
+        read_start += gnm_begin_lag
+        read_end -= gnm_end_lag
+
+        return (r_means, r_ref_means, r_ref_sds, read_start,
+                read_end, read_id)
 
-        return (r_means, r_seq, r_ref_means, r_ref_sds, read_start,
-                read_end, r_alt_means, r_alt_sds)
 
-    def get_read_stats(r_data):
-        (r_means, r_seq, r_ref_means, r_ref_sds,
-         read_start, read_end, _, _) = clip_and_flip_data(r_data)
+    (r_means, r_ref_means, r_ref_sds,
+     read_start, read_end, read_id) = de_novo_clip_and_flip()
 
-        z_scores = np.abs(r_means - r_ref_means) / r_ref_sds
-        r_pvals = stats.norm.cdf(-z_scores) * 2.0
-        if fm_offset > 0:
-            r_pvals = calc_window_fishers_method(r_pvals, fm_offset)
+    z_scores = np.abs(r_means - r_ref_means) / r_ref_sds
+    r_pvals = stats.norm.cdf(-z_scores) * 2.0
+    if fm_offset > 0:
+        r_pvals = calc_window_fishers_method(r_pvals, fm_offset)
 
-        # ignore errors in max over NAN values if fisher's method was used
-        with np.errstate(invalid='ignore'):
-            r_pvals = np.maximum(r_pvals, th.SMALLEST_PVAL)
+    # ignore errors in max over NAN values if fisher's method was used
+    with np.errstate(invalid='ignore'):
+        r_pvals = np.maximum(r_pvals, SMALLEST_PVAL)
 
-        r_poss = np.arange(read_start, read_end)
+    r_poss = np.arange(read_start, read_end)
 
-        return r_pvals, r_poss
+    return r_pvals, r_poss, read_id
 
-    def get_read_alt_stats(r_data):
-        (r_means, r_seq, r_ref_means, r_ref_sds, read_start, read_end,
-         r_alt_means, r_alt_sds) = clip_and_flip_data(r_data)
-        r_ref_vars = np.square(r_ref_sds)
-        r_alt_vars = np.square(r_alt_sds)
-
-        alt_base_poss = []
-        log_lh_ratios = []
-        # note search space is clipped since all k-mers covering the position
-        # of interest must be valid
-        for alt_base_pos in re.finditer(alt_base, r_seq[end_lag:-begin_lag]):
-            alt_pos = alt_base_pos.start() + end_lag
-            alt_base_poss.append(alt_pos + read_start)
-            # TODO cython version needs to be checked for bugs
-            # probably overflow of typed cython variables
-            #pos_lh_ratio = c_calc_llh_ratio(
-            pos_lh_ratio = calc_llh_ratio(
-                r_means[alt_pos-end_lag:alt_pos+begin_lag],
-                r_ref_means[alt_pos-end_lag:alt_pos+begin_lag],
-                r_ref_vars[alt_pos-end_lag:alt_pos+begin_lag],
-                r_alt_means[alt_pos-end_lag:alt_pos+begin_lag],
-                r_alt_vars[alt_pos-end_lag:alt_pos+begin_lag])
-            log_lh_ratios.append(pos_lh_ratio)
-
-        return log_lh_ratios, alt_base_poss
-
-
-    if ctrl_reg_reads is not None:
-        try:
-            ctrl_means, ctrl_sds, ctrl_cov = get_reads_ref(
-                ctrl_reg_reads, strand == '-', reg_start, region_size,
-                min_test_vals, fm_offset)
-        except NotImplementedError:
-            # if there are no events in this window return
-            return None
+def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars,
+                   reg_alt_means, reg_alt_vars):
+    """
+    Compute log likelihood ratio
+
+    This is about 10X slower than the cython version in tombo.c_helper, but
+    has been kept for debugging purposes
+    """
+    # compute log likelihood ratio
+    # positive value means standard base fits data better
+    # negative value means alternative base fits data better
+    return (np.sum(np.square(reg_means - reg_alt_means) / reg_alt_vars) +
+            np.sum(np.log(reg_alt_vars))) - (
+                np.sum(np.square(reg_means - reg_ref_means) / reg_ref_vars) +
+                np.sum(np.log(reg_ref_vars)))
+
+def compute_alt_model_read_stats(
+        r_data, gnm_begin_lag, gnm_end_lag, reg_start, region_size,
+        std_ref, alt_ref):
+    """
+    Compute significance statistics using comparison of the read signal to
+    canonical and alternative base models for a single read within a
+    specified genomic region.
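+
+    The per-position statistic is the log likelihood ratio computed by
+    c_calc_llh_ratio (see calc_llh_ratio above for the formula and its sign
+    convention): positive values favor the canonical base and negative
+    values favor the alternative base.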
+    """
+    motif_width = gnm_begin_lag + gnm_end_lag + 1
+    def alt_clip_and_flip():
+        with h5py.File(r_data.fn, 'r') as fast5_data:
+            r_means, r_seq = th.get_multiple_slots_read_centric(
+                fast5_data, ['norm_mean', 'base'], r_data.corr_group)
+            read_id = th.get_raw_read_slot(fast5_data).attrs['read_id']
+
+        if r_means is None or r_seq is None:
+            raise NotImplementedError(
+                'Read does not contain valid re-squiggled data.')
+        r_seq = b''.join(r_seq).decode()
+
+        read_start = r_data.start
+        # clip read if it extends outside the current genomic region, so
+        # stats are only computed within this region
+        if read_start + motif_width - 1 < reg_start:
+            num_start_clip = reg_start - (read_start + motif_width - 1)
+            read_start = reg_start - (motif_width - 1)
+            if r_data.strand == '+':
+                r_means = r_means[num_start_clip:]
+                r_seq = r_seq[num_start_clip:]
+            else:
+                r_means = r_means[:-num_start_clip]
+                r_seq = r_seq[:-num_start_clip]
+        if r_data.end - (motif_width - 1) > reg_start + region_size:
+            num_end_clip = (r_data.end - (motif_width - 1)) - (
+                reg_start + region_size)
+            if r_data.strand == '+':
+                r_means = r_means[:-num_end_clip]
+                r_seq = r_seq[:-num_end_clip]
+            else:
+                r_means = r_means[num_end_clip:]
+                r_seq = r_seq[num_end_clip:]
+
+        # if this read does not cover enough of this region for stat
+        # computation raise an error to be handled below
+        if len(r_seq) < std_ref.kmer_width:
+            raise NotImplementedError(
+                'Read does not contain information in this region.')
 
-    reg_stats, reg_poss = [], []
+        r_ref_means, r_ref_sds, r_alt_means, r_alt_sds = get_ref_from_seq(
+            r_seq, std_ref, r_data.strand == '-', alt_ref)
+
+        if r_data.strand == '-':
+            # reverse means and seq to match genomic order
+            r_means = r_means[::-1]
+            r_seq = r_seq[::-1]
+        # clip means to individual tested positions
+        r_means = r_means[gnm_begin_lag:-gnm_end_lag]
+        # trim seq to positions with valid llh ratio test results
+        # this is shorter than the means and model
+        r_seq = r_seq[(motif_width - 1):-(motif_width - 1)]
+        read_start += motif_width - 1
+
+        return (r_means, r_seq, r_ref_means, r_ref_sds, read_start,
+                r_alt_means, r_alt_sds, read_id)
+
+
+    (r_means, r_seq, r_ref_means, r_ref_sds, read_start,
+     r_alt_means, r_alt_sds, read_id) = alt_clip_and_flip()
+    r_ref_vars = np.square(r_ref_sds)
+    r_alt_vars = np.square(r_alt_sds)
+
+    alt_base_poss = []
+    log_lh_ratios = []
+    # note search space is clipped since all k-mers covering the position
+    # of interest must be valid
+    for alt_base_pos in re.finditer(alt_ref.alt_base, r_seq):
+        alt_pos = alt_base_pos.start()
+        alt_base_poss.append(alt_pos + read_start)
+        pos_lh_ratio = c_calc_llh_ratio(
+            r_means[alt_pos:alt_pos + motif_width],
+            r_ref_means[alt_pos:alt_pos + motif_width],
+            r_ref_vars[alt_pos:alt_pos + motif_width],
+            r_alt_means[alt_pos:alt_pos + motif_width],
+            r_alt_vars[alt_pos:alt_pos + motif_width])
+        log_lh_ratios.append(pos_lh_ratio)
+
+    return np.array(log_lh_ratios), np.array(alt_base_poss), read_id
+
+def compute_read_stats(
+        chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals,
+        region_size, single_read_thresh, ctrl_reg_reads, std_ref,
+        alt_ref, per_read_q, stat_type):
+    if stat_type == SAMP_COMP_TXT:
+        ctrl_means, ctrl_sds, ctrl_cov = get_reads_ref(
+            ctrl_reg_reads, reg_start, region_size,
+            min_test_vals, fm_offset)
+    else:
+        ctrl_cov = None
+        # compute begin and end lags with respect to the genome; the model's
+        # upstream and downstream context is defined with respect to the read
+        dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1
+        gnm_begin_lag = std_ref.central_pos if strand == '+' else dnstrm_bases
+        gnm_end_lag = dnstrm_bases if strand == '+' else std_ref.central_pos
+
+    reg_read_stats, reg_poss, reg_ids = [], [], []
     for r_data in reg_reads:
         try:
-            if ctrl_reg_reads is not None:
-                r_stats, r_poss = get_read_comp_stats(
-                    r_data, ctrl_means, ctrl_sds)
-            elif alt_ref is None:
-                r_stats, r_poss = get_read_stats(r_data)
+            if stat_type == SAMP_COMP_TXT:
+                r_stats, r_poss, read_id = compute_sample_compare_read_stats(
+                    r_data, ctrl_means, ctrl_sds, fm_offset, reg_start,
+                    region_size)
+            elif stat_type == DE_NOVO_TXT:
+                r_stats, r_poss, read_id = compute_de_novo_read_stats(
+                    r_data, gnm_begin_lag, gnm_end_lag, fm_offset,
+                    reg_start, region_size, std_ref)
             else:
-                r_stats, r_poss = get_read_alt_stats(r_data)
+                r_stats, r_poss, read_id = compute_alt_model_read_stats(
+                    r_data, gnm_begin_lag, gnm_end_lag, reg_start, region_size,
+                    std_ref, alt_ref)
         except NotImplementedError:
             continue
         if r_stats is None: continue
-        reg_stats.append(r_stats)
+        reg_read_stats.append(r_stats)
         reg_poss.append(r_poss)
+        reg_ids.append(read_id)
 
-    if len(reg_stats) == 0:
-        return None
+    if len(reg_read_stats) == 0:
+        raise NotImplementedError
 
-    reg_stats = np.concatenate(reg_stats)
+    if per_read_q is not None:
+        # compile read_ids vector for per-read output
+        reg_ids = np.concatenate([list(repeat(r_id, r_poss.shape[0]))
+                                  for r_id, r_poss in zip(reg_ids, reg_poss)])
+    reg_read_stats = np.concatenate(reg_read_stats)
     reg_poss = np.concatenate(reg_poss)
     # remove nans possibly introduced by fisher's method calculations
-    valid_poss = ~np.isnan(reg_stats)
+    valid_poss = ~np.isnan(reg_read_stats)
     reg_poss = reg_poss[valid_poss]
-    reg_stats = reg_stats[valid_poss]
-    assert reg_poss.shape[0] == reg_stats.shape[0], '\t'.join(map(str, (
-        reg_poss.shape[0], reg_stats.shape[0])))
+    reg_read_stats = reg_read_stats[valid_poss]
+    assert reg_poss.shape[0] == reg_read_stats.shape[0], '\t'.join(map(str, (
+        reg_poss.shape[0], reg_read_stats.shape[0])))
+
+    if per_read_q is not None:
+        reg_ids = reg_ids[valid_poss]
+        assert reg_ids.shape[0] == reg_poss.shape[0]
+        per_read_block = np.array(
+            list(zip(reg_poss, reg_read_stats, reg_ids)),
+            dtype=[(str('pos'), 'u4'), (str('stat'), 'f8'),
                   (str('read_id'), h5py.special_dtype(vlen=str))])
+        per_read_q.put((per_read_block, chrm, strand, reg_start))
 
     # get order of all bases from position array
     as_reg_poss = np.argsort(reg_poss)
@@ -1308,61 +1654,76 @@ def get_read_alt_stats(r_data):
 
     # then sort the stats array by genomic position and
     # split into stats by genomic base position
-    reg_stats = np.split(
-        reg_stats[as_reg_poss],
+    reg_base_stats = np.split(
+        reg_read_stats[as_reg_poss],
         np.where(np.concatenate([[0,], np.diff(reg_poss)]) > 0)[0])
 
-    if alt_ref is None:
-        reg_combine_read_stats = calc_vectorized_fm_pvals(
-            reg_stats, filter_nan=False)
-    else:
-        with np.errstate(invalid='ignore'):
-            reg_combine_read_stats = [
-                np.mean(base_stats) for base_stats in reg_stats]
+    reg_cov = [base_stats.shape[0] for base_stats in reg_base_stats]
 
-    reg_cov = [base_stats.shape[0] for base_stats in reg_stats]
-
-    reg_frac_standard_base = np.array([
-        np.sum(base_stats > single_read_thresh) / float(base_stats.shape[0])
-        for base_stats in reg_stats])
-    if alt_ref is None:
-        reg_frac_alt_base = 1 - reg_frac_standard_base
+    if stat_type == ALT_MODEL_TXT:
+        # filter base statistics that fall between the upper and lower
+        # stat threshold for the log likelihood statistic
+        reg_base_stats = [base_stats[np.abs(base_stats) >= single_read_thresh]
+                          for base_stats in
reg_base_stats] + valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] else: - reg_frac_alt_base = np.array([ - np.sum(base_stats < -single_read_thresh) / float(base_stats.shape[0]) - for base_stats in reg_stats]) + valid_cov = reg_cov - if ctrl_reg_reads is None: - ctrl_cov = repeat(0) - else: + if stat_type == SAMP_COMP_TXT: ctrl_cov = [ctrl_cov[pos] if pos in ctrl_cov else 0 - for pos in us_reg_poss] + for pos in reg_poss] + else: + ctrl_cov = repeat(0) + + return reg_base_stats, us_reg_poss, reg_cov, ctrl_cov, valid_cov + +def get_region_stats( + chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, + region_size, single_read_thresh, ctrl_reg_reads, std_ref, + alt_ref, per_read_q, stat_type): + """ + Compute requested statistics for a specific region of the genome + """ + try: + (reg_base_stats, reg_poss, + reg_cov, ctrl_cov, valid_cov) = compute_read_stats( + chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, + region_size, single_read_thresh, ctrl_reg_reads, std_ref, + alt_ref, per_read_q, stat_type) + except NotImplementedError: + return None + + reg_frac_standard_base = np.array([ + np.greater_equal( + base_stats, single_read_thresh).sum() / base_stats.shape[0] + if base_stats.shape[0] > 0 else np.NAN + for base_stats in reg_base_stats]) reg_stats = np.array( [pos_stats for pos_stats in zip( - reg_combine_read_stats, repeat(np.NAN), - reg_frac_standard_base, reg_frac_alt_base, - us_reg_poss, repeat(chrm), repeat(strand), reg_cov, ctrl_cov) - if pos_stats[-2] >= min_test_vals and + reg_frac_standard_base, + reg_poss, repeat(chrm), repeat(strand), + reg_cov, ctrl_cov, valid_cov) + if pos_stats[-1] >= min_test_vals and not np.isnan(pos_stats[0])], - dtype=[('stat', 'f8'), ('mt_stat', 'f8'), - ('frac', 'f8'), ('alt_frac', 'f8'), - ('pos', 'u4'), ('chrm', 'S32'), ('strand', 'S1'), - ('cov', 'u4'), ('control_cov', 'u4')]) + dtype=[(str('frac'), 'f8'), (str('pos'), 'u4'), (str('chrm'), 'S32'), + (str('strand'), 'S1'), (str('cov'), 'u4'), + (str('control_cov'), 'u4'), (str('valid_cov'), 'u4')]) + if reg_stats.shape[0] == 0: return None return reg_stats def _test_signif_worker( - region_q, stats_q, raw_read_coverage, fm_offset, min_test_vals, - single_read_thresh, region_size, - ctrl_read_coverage, kmer_ref, upstrm_bases, alt_ref, alt_base): + region_q, stats_q, per_read_q, raw_read_coverage, fm_offset, + min_test_vals, single_read_thresh, region_size, ctrl_read_coverage, + std_ref, alt_ref, stat_type): ctrl_reg_reads = None while not region_q.empty(): try: chrm, strand, reg_start = region_q.get(block=False) - except Queue.Empty: + except queue.Empty: break reg_reads = [r_data for r_data in raw_read_coverage[(chrm, strand)] @@ -1380,10 +1741,9 @@ def _test_signif_worker( if not (r_data.start >= reg_start + region_size or r_data.end <= reg_start)] reg_stats = get_region_stats( - chrm, strand, reg_start, reg_reads, - fm_offset, min_test_vals, region_size, - single_read_thresh, ctrl_reg_reads, kmer_ref, upstrm_bases, - alt_ref, alt_base) + chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, + region_size, single_read_thresh, ctrl_reg_reads, std_ref, + alt_ref, per_read_q, stat_type) if reg_stats is not None: stats_q.put(reg_stats) if VERBOSE: @@ -1392,7 +1752,7 @@ def _test_signif_worker( return -if PROFILE_SIGNIF: +if _PROFILE_SIGNIF: _test_signif_wrapper = _test_signif_worker def _test_signif_worker(*args): import cProfile @@ -1402,14 +1762,15 @@ def _test_signif_worker(*args): def test_significance( raw_read_coverage, min_test_vals, fm_offset, 
single_read_thresh,
-        region_size, num_processes, ctrl_read_coverage=None,
-        kmer_ref=None, upstrm_bases=None, alt_ref=None, alt_base=None):
+        region_size, num_processes, per_read_bn, stat_type,
+        ctrl_read_coverage=None, std_ref=None, alt_ref=None, alt_name=None):
     """
     Test for significant shifted signal in multiprocessed batches
     """
     manager = mp.Manager()
     region_q = manager.Queue()
     stats_q = manager.Queue()
+    per_read_q = manager.Queue() if per_read_bn else None
     # split chromosomes into separate regions to process independently
     chrm_sizes = th.get_chrm_sizes(raw_read_coverage, ctrl_read_coverage)
     num_regions = 0
@@ -1425,103 +1786,62 @@ def test_significance(
             num_regions += 1
 
     if VERBOSE: sys.stderr.write(
-        'Performing significance testing across ' + str(num_regions) +
+        'Performing significance testing across ' + unicode(num_regions) +
         ' regions. (Will print a dot for each batch completed)\n')
     test_args = (
-        region_q, stats_q, raw_read_coverage, fm_offset, min_test_vals,
-        single_read_thresh, region_size,
-        ctrl_read_coverage, kmer_ref, upstrm_bases, alt_ref, alt_base)
+        region_q, stats_q, per_read_q, raw_read_coverage, fm_offset,
+        min_test_vals, single_read_thresh, region_size, ctrl_read_coverage,
+        std_ref, alt_ref, stat_type)
     test_ps = []
-    for p_id in xrange(num_processes):
+    for p_id in range(num_processes):
         p = mp.Process(target=_test_signif_worker, args=test_args)
         p.start()
         test_ps.append(p)
 
+    if per_read_bn is not None:
+        if stat_type == ALT_MODEL_TXT:
+            per_read_fn = per_read_bn + '.' + alt_name + '.tombo.per_read_stats'
+        else:
+            per_read_fn = per_read_bn + '.tombo.per_read_stats'
+        per_read_stats = PerReadStats(per_read_fn, stat_type, region_size)
+
     all_reg_stats = []
     while any(p.is_alive() for p in test_ps):
         try:
-            reg_stats = stats_q.get(block=False)
-            all_reg_stats.append(reg_stats)
-        except Queue.Empty:
-            sleep(1)
-            continue
+            if per_read_bn is None: raise queue.Empty
+            reg_read_stats = per_read_q.get(block=False)
+            per_read_stats.write_per_read_block(*reg_read_stats)
+        except queue.Empty:
+            try:
+                reg_stats = stats_q.get(block=False)
+                all_reg_stats.append(reg_stats)
+            except queue.Empty:
+                sleep(1)
+                continue
+
+    # Clear leftover values from queues
     while not stats_q.empty():
         reg_stats = stats_q.get(block=False)
         all_reg_stats.append(reg_stats)
-    if VERBOSE: sys.stderr.write('\n')
+    while per_read_bn is not None and not per_read_q.empty():
+        reg_read_stats = per_read_q.get(block=False)
+        per_read_stats.write_per_read_block(*reg_read_stats)
+
+    if VERBOSE: sys.stderr.write('\nTabulating all stats.\n')
+    if per_read_bn is not None:
+        per_read_stats.close()
 
     if len(all_reg_stats) == 0:
-        sys.stderr.write('********** ERROR *********\n\tNo genomic positions ' +
-                         'contain --minimum-test-reads.\n')
-        sys.exit()
+        th._error_message_and_exit(
+            'No genomic positions contain --minimum-test-reads.')
 
     # put all stats back together
     all_stats = np.concatenate(all_reg_stats)
-    if alt_ref is None:
-        if VERBOSE: sys.stderr.write('Performing multiple testing correction.\n')
-        all_stats = add_multiple_testing(all_stats)
-
     return all_stats
 
 
 ##########################
-##### Statistics I/O #####
-##########################
-
-def parse_stats(stats_fn):
-    """
-    Parse a tombo statistics file
-    """
-    if stats_fn is None or not os.path.isfile(stats_fn):
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: No statistics file provided.\n' +
-            '*' * 60 + '\n')
-        sys.exit()
-
-    try:
-        with h5py.File(stats_fn, 'r') as stats_fp:
-            all_stats = stats_fp['stats'].value
-            try:
-                stat_type = stats_fp.attrs['stat_type']
-            except:
-                # if this is the old stats file assume sample compare
-                stat_type = 'sample_compare'
-    except:
-        sys.stderr.write(
-            '*' * 60 + '\nERROR: Attempt to load statistics ' +
-            'file failed. May be an old version of statistics ' +
-            'file. Try deleting statistics file and ' +
-            'recalculating using current tombo version.\n' +
-            '*' * 60 + '\n')
-        sys.exit()
-
-    return all_stats, stat_type
-
-def write_stats(all_stats, stats_bsnm, stat_type):
-    """
-    Write a tombo statistics file
-    """
-    if VERBOSE: sys.stderr.write(
-        'Saving signal shift significance testing results.\n')
-    if stat_type == 'model_compare':
-        # for alternative model testing, write one stats file per
-        # alternative model
-        for alt_name, alt_stats in all_stats:
-            with h5py.File(stats_bsnm + '.' + alt_name +
-                           '.tombo.stats', 'w') as stats_fp:
-                stats_fp.create_dataset(
-                    'stats', data=alt_stats, compression="gzip")
-                stats_fp.attrs['stat_type'] = stat_type
-    else:
-        with h5py.File(stats_bsnm + '.tombo.stats', 'w') as stats_fp:
-            stats_fp.create_dataset('stats', data=all_stats, compression="gzip")
-            stats_fp.attrs['stat_type'] = stat_type
-
-    return
-
-
-##########################
-##### Main functions #####
+##### Main Functions #####
 ##########################
 
 def test_shifts_main(args):
@@ -1529,19 +1849,14 @@ def test_shifts_main(args):
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
 
-    # apply single read threshold defaults
-    single_read_thresh = args.single_read_threshold
-    if single_read_thresh is None:
-        if args.alternate_model_filenames is not None:
-            single_read_thresh = 2.0
-        else:
-            single_read_thresh = 0.01
-
     raw_read_coverage = th.parse_fast5s(
         args.fast5_basedirs, args.corrected_group, args.basecall_subgroups)
     # if second set of reads is provided, perform comparison testing
    if args.control_fast5_basedirs is not None:
-        stat_type = 'sample_compare'
+        stat_type = SAMP_COMP_TXT
+        single_read_thresh = (
+            args.single_read_threshold if args.single_read_threshold is not None
+            else HYPO_THRESH)
         if VERBOSE: sys.stderr.write(
             'Performing two-sample comparison significance testing.\n')
         ctrl_read_coverage = th.parse_fast5s(
@@ -1551,6 +1866,7 @@ def test_shifts_main(args):
             raw_read_coverage, args.minimum_test_reads,
             args.fishers_method_context, single_read_thresh,
             args.multiprocess_region_size, args.processes,
+            args.per_read_statistics_basename, stat_type,
             ctrl_read_coverage=ctrl_read_coverage)
     else:
         tb_model_fn = args.tombo_model_filename
@@ -1560,50 +1876,54 @@ def test_shifts_main(args):
         if tb_model_fn is None:
             tb_model_fn, bio_samp_type = get_default_standard_ref(
                 raw_read_coverage, bio_samp_type)
-        kmer_ref, upstrm_bases, _, _ = parse_tombo_model(tb_model_fn)
+        std_ref = TomboModel(tb_model_fn)
 
         # if no alt model provided perform de novo testing for shifts
         # from a standard model
         if (args.alternate_model_filenames is None and
            args.alternate_bases is None):
-            stat_type = 'de_novo'
+            stat_type = DE_NOVO_TXT
+            single_read_thresh = (
+                args.single_read_threshold if args.single_read_threshold
+                is not None else HYPO_THRESH)
             if VERBOSE: sys.stderr.write(
-                'Performing de novo model testing against a standard model\n')
+                'Performing de novo model testing against a ' +
+                'standard model\n')
             all_stats = test_significance(
                 raw_read_coverage, args.minimum_test_reads,
                 args.fishers_method_context, single_read_thresh,
                 args.multiprocess_region_size, args.processes,
-                kmer_ref=kmer_ref, upstrm_bases=upstrm_bases)
+                args.per_read_statistics_basename, stat_type,
+                std_ref=std_ref)
        # else perform
comparison model testing else: - stat_type = 'model_compare' + stat_type = ALT_MODEL_TXT + single_read_thresh = ( + args.single_read_threshold if args.single_read_threshold + is not None else LLR_THRESH) if VERBOSE: sys.stderr.write( 'Performing alternative model testing\n') - kmer_width = len(next(kmer_ref.iterkeys())) if args.alternate_model_filenames is not None: alt_refs = parse_tombo_models( - args.alternate_model_filenames, upstrm_bases, kmer_width) + args.alternate_model_filenames, std_ref) else: alt_refs = load_alt_refs( args.alternate_bases, raw_read_coverage, - upstrm_bases, kmer_width, bio_samp_type) + std_ref, bio_samp_type) if len(alt_refs) == 0: - sys.stderr.write( - '********* ERROR *********\n\tNo alternative models ' + - 'successfully loaded\n') - sys.exit() + th._error_message_and_exit( + 'No alternative models successfully loaded.') all_stats = [] - for alt_name, alt_ref, alt_base in alt_refs: + for alt_name, alt_ref in alt_refs.items(): if VERBOSE: sys.stderr.write( 'Performing alternative model testing against ' + alt_name + ' model\n') all_stats.append((alt_name, test_significance( raw_read_coverage, args.minimum_test_reads, 0, single_read_thresh, args.multiprocess_region_size, - args.processes, kmer_ref=kmer_ref, - upstrm_bases=upstrm_bases, alt_ref=alt_ref, - alt_base=alt_base))) + args.processes, args.per_read_statistics_basename, stat_type, + std_ref=std_ref, alt_ref=alt_ref, alt_name=alt_name))) # TODO add comparison to processed genome reference determined by # deep learning performed on the genomic sequence @@ -1617,10 +1937,9 @@ def est_ref_main(args): th.VERBOSE = VERBOSE if min(args.upstream_bases, args.downstream_bases) == 0: - sys.stderr.write( - '********** ERROR *********\n\tContext upstream and downstream ' + - 'must be greater than 0 for model estimation.\n') - sys.exit() + th._error_message_and_exit( + 'Context upstream and downstream must be greater ' + + 'than 0 for model estimation.') estimate_kmer_model( args.fast5_basedirs, args.corrected_group, args.basecall_subgroups, @@ -1644,7 +1963,9 @@ def est_alt_ref_main(args): args.alternate_model_base, args.alt_fraction_percentile, args.minimum_kmer_observations, args.save_density_basename, args.kernel_density_bandwidth, args.alternate_density_filename, - args.control_density_filename) + args.control_density_filename, args.processes) + # returns None when profiling method + if alt_ref is None: return write_tombo_model(alt_ref, args.alternate_model_filename, upstrm_bases, args.alternate_model_base, args.alternate_model_name) @@ -1658,32 +1979,28 @@ def estimate_scale_main(args): if VERBOSE: sys.stderr.write('Getting files list\n') try: if not os.path.isdir(args.fast5_basedir): - sys.stderr.write( - '*' * 60 + '\nERROR: Provided [fast5-basedir] is ' + - 'not a directory.\n' + '*' * 60 + '\n') - sys.exit() + th._error_message_and_exit( + 'Provided [fast5-basedir] is not a directory.') fast5_basedir = ( args.fast5_basedir if args.fast5_basedir.endswith('/') else args.fast5_basedir + '/') fast5_fns = th.get_files_list(fast5_basedir) except OSError: - sys.stderr.write( - '*' * 60 + '\nERROR: Reads base directory, a sub-directory ' + - 'or an old (hidden) index file does not appear to be ' + - 'accessible. Check directory permissions.\n' + '*' * 60 + '\n') - sys.exit() + th._error_message_and_exit( + 'Reads base directory, a sub-directory or an old (hidden) ' + + 'index file does not appear to be accessible. 
Check ' + + 'directory permissions.') if len(fast5_fns) < 1: - sys.stderr.write( - '*' * 60 + '\nERROR: No files identified in the specified ' + - 'directory or within immediate subdirectories.\n' + '*' * 60 + '\n') - sys.exit() + th._error_message_and_exit( + 'No files identified in the specified ' + + 'directory or within immediate subdirectories.') sys.stdout.write('Global scaling estimate: ' + - str(th.estimate_global_scale(fast5_fns)) + '\n') + unicode(th.estimate_global_scale(fast5_fns)) + '\n') return if __name__ == '__main__': - raise NotImplementedError, ( + raise NotImplementedError( 'This is a module. See commands with `tombo -h`')
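
For reference, the sketch below (illustrative only, not part of the patch) shows how the statistics I/O added in this change might be driven directly from Python. It assumes the module path tombo.tombo_stats, statistics file names produced by a prior `tombo test_significance --per-read-statistics-basename sample` run, and a hypothetical stand-in for the interval object, which only needs chrm, strand, start and end attributes.

# Illustrative usage sketch only -- file names are hypothetical and the
# module path is assumed to be tombo.tombo_stats.
from collections import namedtuple
from tombo import tombo_stats as ts

# genome-level statistics: a structured array plus its stat_type attribute
all_stats, stat_type = ts.parse_stats('sample.5mC.tombo.stats')

# per-read statistics: omitting stat_type/region_size opens for reading
per_read = ts.PerReadStats('sample.5mC.tombo.per_read_stats')

# minimal stand-in for the interval object get_region_stats expects
Interval = namedtuple('Interval', ('chrm', 'strand', 'start', 'end'))
reg_stats = per_read.get_region_stats(
    Interval('chromosome', '+', 1000, 2000), num_reads=10)
if reg_stats is not None:
    for rec in reg_stats:
        print(rec['pos'], rec['stat'], rec['read_id'])

per_read.close()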