Skip to content

Commit

Permalink
Initial release
Browse files Browse the repository at this point in the history
  • Loading branch information
cjw85 committed Jul 9, 2017
1 parent 82ba4c0 commit 05e95e1
Show file tree
Hide file tree
Showing 12 changed files with 421 additions and 161 deletions.
6 changes: 0 additions & 6 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,11 @@ python:
- "3.6"
dist: trusty

cache:
directories:
- bincache

git:
submodules: true

before_install:
- sudo apt-get -qq update
#TODO: add any ubuntu packages needed
- sudo apt-get install -y python3-all-dev

script:
Expand All @@ -23,7 +18,6 @@ script:
deploy:
provider: pages
skip_cleanup: true
#TODO: add this token to the project on travis (ask cwright)
github_token: $GHPAGES_TOKEN
local_dir: docs/_build/html
target_branch: gh-pages
Expand Down
11 changes: 8 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
.PHONY: install docs

OS := $(shell uname)
ifeq ($(OS), Darwin)
SEDI=sed -i '.bak'
else
SEDI=sed -i
endif

venv: venv/bin/activate
IN_VENV=. ./venv/bin/activate

venv/bin/activate:
test -d venv || virtualenv venv --python=python3
${IN_VENV} && pip install pip --upgrade
${IN_VENV} && pip install numpy # needs to get done before other things

install: venv | $(addprefix $(BINCACHEDIR)/, $(BINARIES))
install: venv
cd bwa && ${SEDI} 's/int\ bwa_verbose\ =\ 3;/int\ bwa_verbose\ =\ 2;/' bwa.c && make libbwa.a
${IN_VENV} && pip install -r requirements.txt && python setup.py install


Expand Down
42 changes: 32 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
[//]: # (TODO: Add a title)
TITLE
bwapy
=====

[//]: # (TODO: fill in <project x2>)
[![Build Status](https://travis-ci.org/nanoporetech/<project>.svg?branch=master)](https://travis-ci.org/nanoporetech/<project>)
[![Build Status](https://travis-ci.org/nanoporetech/bwapy.svg?branch=master)](https://travis-ci.org/nanoporetech/bwapy)

[//]: # (TODO: Add a description)
Python bindings to the `bwa mem` aligner; sufficient to load an index and perform
alignments of sequences to the index to obtain basic statistics.

[//]: # (TODO: fill in <project>)
Documentation can be found at https://nanoporetech.github.io/<project>/.
These Python bindings are licensed under the Mozilla Public License 2.0; bwa is licensed
under the GNU General Public License v3.0.

Documentation can be found at https://nanoporetech.github.io/bwapy/.

Build
-----
Expand All @@ -19,9 +20,30 @@ environment.

To setup the environment run:

[//]: # (TODO: fill in <project> x2)
git clone --recursive https://github.com/nanoporetech/<project>.git
cd <project>
git clone --recursive https://github.com/nanoporetech/bwapy.git
cd bwapy
make install
. ./venv/bin/activate


Example
-------

The `BwaAligner` class provides a pythonic interface to the `bwa mem` aligner. It
takes as input a bwa index fileset on construction and can then be used to find
alignments of sequences given as strings:

```python
from bwapy import BwaAligner
aligner = BwaAligner(args.index)
alignments = aligner.align_seq(seq)
print('Found {} alignments for input {}.'.format(len(alignments), i))
for aln in alignments:
print(' ', aln)
```

Each alignment is returned as a named tuple, e.g.:

```python
Alignment(rname='yeast', orient='+', pos=0, mapq=60, cigar='915M3D29M3D27M3D13M', NM=12)
```
4 changes: 0 additions & 4 deletions __TODO__

This file was deleted.

3 changes: 3 additions & 0 deletions bwapy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
__version__ = '0.1.0'

from bwapy.libbwa import BwaAligner
201 changes: 201 additions & 0 deletions bwapy/libbwa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
import argparse
from collections import namedtuple
import importlib
import imp
import os
import sys

from cffi import FFI
ffi = FFI()

"""High-level interface to bwa (mem) aligner."""


def get_shared_lib(name):
    """Cross-platform resolution of shared-object libraries, working
    around vagaries of setuptools.

    :param name: name of shared library to find.

    :returns: FFI shared library object.

    :raises ImportError: if no file for `name` can be located.
    """
    # Imported here because ``import importlib`` on its own does not
    # guarantee that the ``importlib.util`` submodule has been loaded.
    import importlib.util

    try:
        # after 'python setup.py install' we should be able to do this
        lib_file = importlib.import_module(name).__file__
    except Exception:
        # after 'python setup.py develop' this should work: only locate the
        # file on the import path rather than importing it.
        try:
            spec = importlib.util.find_spec(name)
        except Exception:
            spec = None
        if spec is None or spec.origin is None:
            raise ImportError('Cannot locate C library "{}".'.format(name))
        lib_file = os.path.abspath(spec.origin)
    # Only reached once a path has been resolved, so a failed lookup surfaces
    # as the ImportError above instead of an unbound-variable error.
    return ffi.dlopen(lib_file)


# Resolve and open the 'bwalib' shared library once, at import time; the
# resulting handle is used by BwaAligner for every C call.
libbwa = get_shared_lib('bwalib')

# Declare to cffi the C structures and function prototypes that are accessed
# through `libbwa`: the alignment result records, the bwa index structures,
# and the load/destroy/align/free entry points.
ffi.cdef("""
////////////////////////////////
// Alignment hit list structures
//
typedef struct {
int64_t rb, re; // [rb,re): reference sequence in the alignment
int qb, qe; // [qb,qe): query sequence in the alignment
int rid; // reference seq ID
int score; // best local SW score
int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
int sub; // 2nd best SW score
int alt_sc;
int csub; // SW score of a tandem hit
int sub_n; // approximate number of suboptimal hits
int w; // actual band width used in extension
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int secondary_all;
int seedlen0; // length of the starting seed
int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep;
uint64_t hash;
} mem_alnreg_t;
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
typedef struct { // This struct is only used for the convenience of API.
int64_t pos; // forward strand 5'-end mapping position
int rid; // reference sequence index in bntseq_t; <0 for unmapped
int flag; // extra flag
uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
char *XA; // alternative mappings
int score, sub, alt_sc;
} mem_aln_t;
typedef struct { size_t n; mem_aln_t *aln; } mem_aln_v;
void free_mem_aln_v (mem_aln_v *alns);
///////////////////////
// bwa index structures
//
typedef uint64_t bwtint_t;
typedef struct {
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_table[256];
// suffix array
int sa_intv;
bwtint_t n_sa;
bwtint_t *sa;
} bwt_t;
typedef struct {
int64_t offset;
int32_t len;
int32_t n_ambs;
uint32_t gi;
int32_t is_alt;
char *name, *anno;
} bntann1_t;
typedef struct {
int64_t offset;
int32_t len;
char amb;
} bntamb1_t;
typedef struct {
int64_t l_pac;
int32_t n_seqs;
uint32_t seed;
bntann1_t *anns; // n_seqs elements
int32_t n_holes;
bntamb1_t *ambs; // n_holes elements
FILE *fp_pac;
} bntseq_t;
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
int is_shm;
int64_t l_mem;
uint8_t *mem;
} bwaidx_t;
bwaidx_t *bwa_idx_load_all(const char *hint);
void bwa_idx_destroy(bwaidx_t *idx);
///////////////////
// Run an alignment
//
mem_aln_v *align(bwaidx_t * index, char * seq);
""")


# Immutable record for a single alignment hit; the order of the field names
# below is the positional order of the tuple.
Alignment = namedtuple(
    'Alignment',
    ('rname', 'orient', 'pos', 'mapq', 'cigar', 'NM'),
)

class BwaAligner(object):
    """Aligner holding a loaded bwa index and exposing sequence alignment.

    The index is loaded through the C library on construction and destroyed
    when the instance is garbage collected.
    """

    def __init__(self, index:str):
        """Load a bwa index.

        :param index: bwa index base path.

        :raises ValueError: if the C library returns a NULL index.
        """
        # Defined before anything that can fail so that __del__ always finds
        # the attribute, even when construction is aborted part-way.
        self.index = None
        self.index_base = index.encode()
        # CIGAR operation characters, indexed by the numeric op code.
        self.cigchar = "MIDSH"
        handle = libbwa.bwa_idx_load_all(self.index_base)
        if handle == ffi.NULL:
            raise ValueError('Failed to load bwa index.')
        self.index = handle

    def __del__(self):
        """Destroy the loaded index, if there is one."""
        index = getattr(self, 'index', None)
        if index is not None:
            # Clear the attribute first so a repeated call cannot destroy
            # the same index twice.
            self.index = None
            libbwa.bwa_idx_destroy(index)

    def _build_alignment(self, aln):
        """Convert one C alignment record into an :class:`Alignment`.

        :param aln: a `mem_aln_t` record as exposed by cffi.

        :returns: :class:`Alignment` with the reference name, orientation,
            position, mapping quality, CIGAR string and edit distance.
        """
        cigar = aln.cigar
        cigar = ''.join(
            # oplen + op: each entry packs the length in the high bits and
            # the operation code in the low four bits.
            str(cigar[k] >> 4) + self.cigchar[cigar[k] & 0xf]
            for k in range(aln.n_cigar)
        )
        # NOTE(review): the C declaration documents rid < 0 for unmapped
        # records; this lookup assumes the library only returns mapped
        # records here -- confirm against the C `align` implementation.
        return Alignment(
            ffi.string(self.index.bns.anns[aln.rid].name).decode(),
            '+-'[aln.is_rev], aln.pos, aln.mapq, cigar, aln.NM
        )

    def align_seq(self, seq:str):
        """Align a sequence to the index.

        :param seq: base sequence to align

        :returns: tuple of :class:`Alignment`
        """
        alns = libbwa.align(self.index, seq.encode())
        try:
            return tuple(
                self._build_alignment(alns.aln[i]) for i in range(alns.n)
            )
        finally:
            # Release the C result vector even if conversion raised.
            libbwa.free_mem_aln_v(alns)


def get_parser():
    """Build the command-line parser for the alignment program.

    :returns: :class:`argparse.ArgumentParser` accepting an index base path
        followed by one or more sequences.
    """
    # The first positional parameter of ArgumentParser is `prog`; the text
    # must be passed as `description` to appear as help text rather than as
    # the program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Align a sequence with bwa mem.')
    parser.add_argument('index', help='bwa index base path.')
    parser.add_argument('sequence', nargs='+', help='base sequence')
    return parser


def main():
    """Command-line entry point.

    Parses the arguments, loads the index, aligns every given sequence in
    turn and prints the alignments found for each (inputs numbered from 1).
    """
    options = get_parser().parse_args()
    bwa = BwaAligner(options.index)
    for number, sequence in enumerate(options.sequence, start=1):
        hits = bwa.align_seq(sequence)
        summary = 'Found {} alignments for input {}.'.format(len(hits), number)
        print(summary)
        for hit in hits:
            print(' ', hit)

Loading

0 comments on commit 05e95e1

Please sign in to comment.