diff --git a/.travis.yml b/.travis.yml index 99b5610..177b73d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,16 +4,11 @@ python: - "3.6" dist: trusty -cache: - directories: - - bincache - git: submodules: true before_install: - sudo apt-get -qq update - #TODO: add any ubuntu packages needed - sudo apt-get install -y python3-all-dev script: @@ -23,7 +18,6 @@ script: deploy: provider: pages skip_cleanup: true - #TODO: add this token to the project on travis (ask cwright) github_token: $GHPAGES_TOKEN local_dir: docs/_build/html target_branch: gh-pages diff --git a/Makefile b/Makefile index 88cce86..e866be2 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,10 @@ .PHONY: install docs - +OS := $(shell uname) +ifeq ($(OS), Darwin) +SEDI=sed -i '.bak' +else +SEDI=sed -i +endif venv: venv/bin/activate IN_VENV=. ./venv/bin/activate @@ -7,9 +12,9 @@ IN_VENV=. ./venv/bin/activate venv/bin/activate: test -d venv || virtualenv venv --python=python3 ${IN_VENV} && pip install pip --upgrade - ${IN_VENV} && pip install numpy # needs to get done before other things -install: venv | $(addprefix $(BINCACHEDIR)/, $(BINARIES)) +install: venv + cd bwa && ${SEDI} 's/int\ bwa_verbose\ =\ 3;/int\ bwa_verbose\ =\ 2;/' bwa.c && make libbwa.a ${IN_VENV} && pip install -r requirements.txt && python setup.py install diff --git a/README.md b/README.md index 8f3f703..8af0bb4 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,15 @@ -[//]: # (TODO: Add a title) -TITLE +bwapy ===== -[//]: # (TODO: fill in ) -[![Build Status](https://travis-ci.org/nanoporetech/.svg?branch=master)](https://travis-ci.org/nanoporetech/) +[![Build Status](https://travis-ci.org/nanoporetech/bwapy.svg?branch=master)](https://travis-ci.org/nanoporetech/bwapy) -[//]: # (TODO: Add a description) +Python bindings to `bwa mem` aligner; sufficient to load and index and perform +alignments of sequences to the index to obtain basic statistics. -[//]: # (TODO: fill in ) -Documentation can be found at https://nanoporetech.github.io//. +These python bindings are licensed under Mozilla Public License 2.0, bwa is licenced +under GNU General Public License v3.0. + +Documentation can be found at https://nanoporetech.github.io/bwapy/. Build ----- @@ -19,9 +20,30 @@ environment. To setup the environment run: - [//]: # (TODO: fill in x2) - git clone --recursive https://github.com/nanoporetech/.git - cd + git clone --recursive https://github.com/nanoporetech/bwapy.git + cd bwapy make install . ./venv/bin/activate + +Example +------- + +The `BwaAligner` class provides a pythonic interface to `bwa mem` aligner. It +takes as input a bwa index fileset on construction and can then be used to find +alignments of sequences given as strings: + +```python +from bwapy import BwaAligner +aligner = BwaAligner(args.index) +alignments = aligner.align_seq(seq) +print('Found {} alignments for input {}.'.format(len(alignments), i)) +for aln in alignments: + print(' ', aln) +``` + +The alignments are returned as a named tuple, e.g.: + +```python +Alignment(rname='yeast', orient='+', pos=0, mapq=60, cigar='915M3D29M3D27M3D13M', NM=12) +``` diff --git a/__TODO__ b/__TODO__ deleted file mode 100644 index db39511..0000000 --- a/__TODO__ +++ /dev/null @@ -1,4 +0,0 @@ -TODO: -add your code under a top level directory -grep for TODO in all files and do what they say -git rm __TODO__ TODO_PACKAGE diff --git a/bwapy/__init__.py b/bwapy/__init__.py new file mode 100644 index 0000000..358956c --- /dev/null +++ b/bwapy/__init__.py @@ -0,0 +1,3 @@ +__version__ = '0.1.0' + +from bwapy.libbwa import BwaAligner diff --git a/bwapy/libbwa.py b/bwapy/libbwa.py new file mode 100644 index 0000000..a07ed82 --- /dev/null +++ b/bwapy/libbwa.py @@ -0,0 +1,201 @@ +import argparse +from collections import namedtuple +import importlib +import imp +import os +import sys + +from cffi import FFI +ffi = FFI() + +"""High-level interface to bwa (mem) aligner.""" + + +def get_shared_lib(name): + """Cross-platform resolution of shared-object libraries, working + around vagueries of setuptools. + :param name: name of shared library to find. + + :returns: FFI shared library object. + """ + try: + # after 'python setup.py install' we should be able to do this + lib_file = importlib.import_module(name).__file__ + except Exception as e: + try: + # after 'python setup.py develop' this should work + lib_file = imp.find_module(name)[1] + except Exception as e: + raise ImportError('Cannot locate C library "{}".'.format(name)) + else: + lib_file = os.path.abspath(lib_file) + finally: + library = ffi.dlopen(lib_file) + return library + + +libbwa = get_shared_lib('bwalib') + +ffi.cdef(""" + //////////////////////////////// + // Alignment hit list structures + // + typedef struct { + int64_t rb, re; // [rb,re): reference sequence in the alignment + int qb, qe; // [qb,qe): query sequence in the alignment + int rid; // reference seq ID + int score; // best local SW score + int truesc; // actual score corresponding to the aligned region; possibly smaller than $score + int sub; // 2nd best SW score + int alt_sc; + int csub; // SW score of a tandem hit + int sub_n; // approximate number of suboptimal hits + int w; // actual band width used in extension + int seedcov; // length of regions coverged by seeds + int secondary; // index of the parent hit shadowing the current hit; <0 if primary + int secondary_all; + int seedlen0; // length of the starting seed + int n_comp:30, is_alt:2; // number of sub-alignments chained together + float frac_rep; + uint64_t hash; + } mem_alnreg_t; + + typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; + + typedef struct { // This struct is only used for the convenience of API. + int64_t pos; // forward strand 5'-end mapping position + int rid; // reference sequence index in bntseq_t; <0 for unmapped + int flag; // extra flag + uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance + int n_cigar; // number of CIGAR operations + uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + char *XA; // alternative mappings + + int score, sub, alt_sc; + } mem_aln_t; + + typedef struct { size_t n; mem_aln_t *aln; } mem_aln_v; + + void free_mem_aln_v (mem_aln_v *alns); + + /////////////////////// + // bwa index structures + // + typedef uint64_t bwtint_t; + + typedef struct { + bwtint_t primary; // S^{-1}(0), or the primary index of BWT + bwtint_t L2[5]; // C(), cumulative count + bwtint_t seq_len; // sequence length + bwtint_t bwt_size; // size of bwt, about seq_len/4 + uint32_t *bwt; // BWT + // occurance array, separated to two parts + uint32_t cnt_table[256]; + // suffix array + int sa_intv; + bwtint_t n_sa; + bwtint_t *sa; + } bwt_t; + + typedef struct { + int64_t offset; + int32_t len; + int32_t n_ambs; + uint32_t gi; + int32_t is_alt; + char *name, *anno; + } bntann1_t; + + typedef struct { + int64_t offset; + int32_t len; + char amb; + } bntamb1_t; + + typedef struct { + int64_t l_pac; + int32_t n_seqs; + uint32_t seed; + bntann1_t *anns; // n_seqs elements + int32_t n_holes; + bntamb1_t *ambs; // n_holes elements + FILE *fp_pac; + } bntseq_t; + + typedef struct { + bwt_t *bwt; // FM-index + bntseq_t *bns; // information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + + int is_shm; + int64_t l_mem; + uint8_t *mem; + } bwaidx_t; + + bwaidx_t *bwa_idx_load_all(const char *hint); + void bwa_idx_destroy(bwaidx_t *idx); + + /////////////////// + // Run an alignment + // + mem_aln_v *align(bwaidx_t * index, char * seq); +""") + + +Alignment = namedtuple('Alignment', [ + 'rname', 'orient', 'pos', 'mapq', 'cigar', 'NM' +]) + +class BwaAligner(object): + def __init__(self, index:str): + self.index_base = index.encode() + self.cigchar = "MIDSH" + self.index = libbwa.bwa_idx_load_all(self.index_base) + if self.index == ffi.NULL: + raise ValueError('Failed to load bwa index.') + + def __del__(self): + libbwa.bwa_idx_destroy(self.index) + + def _build_alignment(self, aln): + cigar = aln.cigar + cigar = ''.join( + # oplen + op + str(cigar[k]>>4) + self.cigchar[cigar[k] & 0xf] + for k in range(aln.n_cigar) + ) + return Alignment( + ffi.string(self.index.bns.anns[aln.rid].name).decode(), + '+-'[aln.is_rev], aln.pos, aln.mapq, cigar, aln.NM + ) + + + def align_seq(self, seq:str): + """Align a sequence to the index. + + :param seq: base sequence to align + + :returns: tuple of :class:`Alignment` + """ + alns = libbwa.align(self.index, seq.encode()) + alignments = tuple(self._build_alignment(alns.aln[i]) for i in range(alns.n)) + libbwa.free_mem_aln_v(alns) + return alignments + + +def get_parser(): + parser = argparse.ArgumentParser('Align a sequence with bwa mem.') + parser.add_argument('index', help='bwa index base path.') + parser.add_argument('sequence', nargs='+', help='base sequence') + return parser + + +def main(): + args = get_parser().parse_args() + aligner = BwaAligner(args.index) + for i, seq in enumerate(args.sequence, 1): + alignments = aligner.align_seq(seq) + print('Found {} alignments for input {}.'.format(len(alignments), i)) + for aln in alignments: + print(' ', aln) + diff --git a/bwapy/libbwapy.c b/bwapy/libbwapy.c new file mode 100644 index 0000000..fef67d0 --- /dev/null +++ b/bwapy/libbwapy.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include "bwamem.h" + + +static PyMethodDef module_functions[] = { + {NULL, NULL, 0, NULL} +}; + +#if PY_MAJOR_VERSION >= 3 + #define MOD_ERROR_VAL NULL + #define MOD_SUCCESS_VAL(val) val + #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ + ob = PyModule_Create(&moduledef); +#else + #define MOD_ERROR_VAL + #define MOD_SUCCESS_VAL(val) + #define MOD_INIT(name) void init##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + ob = Py_InitModule3(name, methods, doc); +#endif + +MOD_INIT(bwapy) +{ + PyObject *m; + + MOD_DEF(m, "bwalib", "High-level binding to bwa mem", + module_functions) + + if (m == NULL) + return MOD_ERROR_VAL; + return MOD_SUCCESS_VAL(m); + +} + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +# ifdef MODULE_API_EXPORTS +# define MODULE_API __declspec(dllexport) +# define restrict __restrict +# else +# define MODULE_API __declspec(dllimport) +# endif +#else +# define MODULE_API +#endif + +MODULE_API int module_init(); + +#ifdef __cplusplus +} +#endif + + +typedef struct { size_t n; mem_aln_t *aln; } mem_aln_v; + + +mem_aln_v *new_mem_aln_v (size_t n) { + // Allocate a mem_aln_t vector + mem_aln_v *alns = malloc (sizeof (mem_aln_v)); + if (alns == NULL) + return NULL; + alns->aln = malloc (n * sizeof (mem_aln_t)); + if (alns->aln == NULL) { + free (alns); + return NULL; + } + alns->n = n; + return alns; +} + +void free_mem_aln_v (mem_aln_v *alns) { + // free mem_aln_v and all its submembers + if (alns != NULL) { + for (size_t i = 0; i < alns->n; ++i) { + free(alns->aln[i].cigar); + } + free(alns); + } +} + +bwaidx_t *bwa_idx_load_all(const char *hint){ + return bwa_idx_load(hint, BWA_IDX_ALL); +} + +mem_aln_v *align(bwaidx_t * idx, char * seq) { + mem_opt_t *opt; + const size_t seq_len = strlen(seq); + + opt = mem_opt_init(); // default values + mem_alnreg_v ar; + ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, seq_len, seq); + + // We won't take secondary alignments + size_t primary = 0; + for (size_t i = 0; i < ar.n; ++i) { + if (ar.a[i].secondary >= 0) continue; + primary++; + } + mem_aln_v *alns = new_mem_aln_v(primary); + alns->n = primary; + + for (size_t i = 0; i < ar.n; ++i) { + if (ar.a[i].secondary >= 0) continue; + alns->aln[i] = mem_reg2aln(opt, idx->bns, idx->pac, seq_len, seq, &ar.a[i]); + } + + free(ar.a); + free(opt); + + return alns; +} diff --git a/docs/conf.py b/docs/conf.py index 5eec573..15cb797 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,8 +13,7 @@ # General information about the project. -#TODO: fill this in -__pkg_name__ = u'' +__pkg_name__ = u'bwapy' project = __pkg_name__.capitalize() copyright = u'2017, Oxford Nanopore Technologies' diff --git a/docs/examples.rst b/docs/examples.rst deleted file mode 100644 index a169412..0000000 --- a/docs/examples.rst +++ /dev/null @@ -1,89 +0,0 @@ -Pomoxis Examples -================ -Below you will find examples of some key tools and ways in which they may be -usefully combined. - -Read Alignment with bwa ------------------------ - -Pomoxis bundles bwa and samtools to accomplish fastq to bam conversion in a -streamlined manner with the `bwa_align` tool: - -.. code-block:: bash - - bwa_align [-h] [-r reference] [-i fastq] [-p prefix] - - Align fastq/a formatted reads to a genome using bwa. - - -h show this help text. - -r reference, should be a fasta file. If correspondng bwa indices - do not exist they will be created. (required). - -i fastq/a input reads (required). - -a aggresively extend gaps (sets -A1 -B2 -O2 -E1 for bwa mem). - -c chunk size. Input reads/contigs will be broken into chunks - prior to alignment. - -t alignment threads (default: 1). - -p output file prefix (default: reads). - -i and -r must be specified. - - -Fast de-novo assembly ---------------------- - -From a .fastq file containing basecalls one can perform an assembly and -consensus using the `mini_assemble` tool. - -.. code-block:: bash - - mini_assemble [-h] [-i fastq] [-o directory] [-p prefix] - - Assemble fastq formatted reads and perform POA consensus - - -h show this help text - -i fastq input reads (required) - -o output folder (default: assm) - -p output file prefix (default: reads) - -i must be specified. - - -For E.coli on a computer with 16 CPUs this should take around 5 minutes for -a 50-fold coverage dataset. - - -Distributed basecalling ------------------------ - -A distributed basecalling platform is included in the epi3me program. This -comprises a `dealer` and a `router` component. The former wil watch for .fast5 -files appearing a a given path; numerous instances of the latter can be started -(on networked machines) to perform the basecalling. A current restriction is -that all machines used for basecalling must shared a filesystem. - -To start watching a filesytem for new .fast5 files use the dealer component: - -.. code-block:: bash - - usage: epi3me dealer [-h] [--port PORT] [--output OUTPUT] path outpath - - positional arguments: - path Path to watch for new files. - outpath Path to move finished files. - - optional arguments: - -h, --help show this help message and exit - --port PORT port on which to run. - --output OUTPUT output fasta file. - -Once the dealer is running and picking up new files, one can start running -router jobs on the same or different machine: - -.. code-block:: bash - - usage: epi3me router [-h] [--addr ADDR] - - optional arguments: - -h, --help show this help message and exit - --addr ADDR Address to use, should include port. - -The platform was used to basecall the CliveOME2 dataset on an SGE cluster -within AWS EC2 with files being synced concurrently from AWS S3. diff --git a/docs/index.rst b/docs/index.rst index 756a567..3b408a1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,27 +1,50 @@ -.. TODO fill in name -Welcome to {}'s documentation! +Welcome to bwapy's documentation! ================================== -.. TODO: add description +Python bindings to `bwa mem` aligner; sufficient to load and index and perform +alignments of sequences to the index to obtain basic statistics. + +These python bindings are licensed under Mozilla Public License 2.0, bwa is licenced +under GNU General Public License v3.0. Installation ------------ -The package should be installed inside a virtual environment. A Makefile is -provided to fetch, compile and install all direct dependencies into an -environment. (Some additional build dependencies may not installed via this -Makefile, see `.travis.yml` for additional requirements if things fail). +The git source repository contains bwa as a submodule. The repository should therefore +be cloned using the recursive option. -To setup the environment run: +The package `setup.py` script requires `libbwa.a` to have been built in the submodule +directory before running. To build and install the package one should therefore run: .. code-block:: bash - #TODO: fill in x2 - git clone --recursive https://git/research/.git - cd - make install - . ./venv/bin/activate -See :doc:`examples` for common workflows. + git clone --recursive https://git/research/bwapy.git + cd bwapy/bwa && make libbwa.a && cd .. + python setup.py install + + +Performing Alignments +--------------------- + +The `BwaAligner` class provides a pythonic interface to `bwa mem` aligner. It +takes as input a bwa index fileset on construction and can then be used to find +alignments of sequences given as strings: + +.. code-block:: python + + from bwapy import BwaAligner + aligner = BwaAligner(args.index) + alignments = aligner.align_seq(seq) + print('Found {} alignments for input {}.'.format(len(alignments), i)) + for aln in alignments: + print(' ', aln) + +The alignments are returned as a named tuple, e.g.: + +.. code-block:: python + + Alignment(rname='yeast', orient='+', pos=0, mapq=60, cigar='915M3D29M3D27M3D13M', NM=12) + Contents -------- @@ -29,8 +52,6 @@ Contents .. toctree:: :maxdepth: 2 - examples -.. TODO: add more pages Full API reference ------------------ @@ -38,8 +59,7 @@ Full API reference .. toctree:: :maxdepth: 3 - pomoxis -.. TODO: change name above + bwapy Indices and tables ------------------ diff --git a/requirements.txt b/requirements.txt index e486fe5..710ba63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ # pomoxis requirements. # Add comments to keep track of why we are using particular versions -#TODO: add requirements - +cffi diff --git a/setup.py b/setup.py index 2f97554..580843a 100644 --- a/setup.py +++ b/setup.py @@ -11,11 +11,9 @@ import pkg_resources -#TODO: fill in these -__pkg_name__ = '' -__author__ = '' -__description__ = '' - +__pkg_name__ = 'bwapy' +__author__ = 'cwright' +__description__ = 'Bindings to bwa alinger.' __path__ = os.path.dirname(__file__) __pkg_path__ = os.path.join(os.path.join(__path__, __pkg_name__)) @@ -45,22 +43,17 @@ req.split('/')[-1].split('@')[0] install_requires.append(req) - -extra_requires = { - #TODO: any optional requirements -} - - +extra_requires = {} extensions = [] -#TODO: compile any extensions -#extensions.append(Extension( -# 'name', -# sources=[] -# include_dirs=[], -# extra_compile_args=['-pedantic', '-Wall', '-std=c99', '-march=native', '-ffast-math', '-DUSE_SSE2', '-DNDEBUG'], -# libraries=[] #e.g. 'blas' -#)) +extensions.append(Extension( + 'bwalib', + sources=[os.path.join('bwapy', 'libbwapy.c')], + include_dirs=['bwa'], + extra_compile_args=['-pedantic', '-Wall', '-std=c99', '-march=native', '-ffast-math', '-DUSE_SSE2', '-DNDEBUG'], + libraries=[], + extra_objects=[os.path.join('bwa','libbwa.a')] +)) setup( @@ -79,16 +72,11 @@ packages=find_packages(exclude=['*.test', '*.test.*', 'test.*', 'test']), package_data={}, zip_safe=False, - data_files=[ - #TODO: Probably won't have these, use package_data in most cases - ], + data_files=[], entry_points={ 'console_scripts': [ - #TODO: add entry points - #'name' = {}.package.module:function'.format(__pkg_name__) + 'bwamempy = {}.libbwa:main'.format(__pkg_name__) ] }, - scripts=[ - #TODO: Probably won't have these, use entry_points - ] + scripts=[] )