Skip to content

Commit

Permalink
Merge branch 'release/0.3.4'
Browse files Browse the repository at this point in the history
  • Loading branch information
maehler committed Nov 8, 2015
2 parents e93e34f + 39cd117 commit a483405
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 21 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/dist/
/*.egg-info
/*.egg
/.eggs
/build/
/cover/
/.coverage
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ and operon extraction in prokaryotes. The secondary purpose of seqpoet is to be
a Python package that can be used for handling sequence data in the form of
FASTA and GenBank files.

## Documentation

For full documentation, please see http://seqpoet.readthedocs.org.

## Requirements

Currently, the only requirement is Python >= 2.7, but Python 3 is not supported
Expand Down
111 changes: 101 additions & 10 deletions bin/seqpoet
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ from __future__ import print_function
import argparse
import itertools
import os
import stat
import sys

import seqpoet
Expand All @@ -22,6 +23,7 @@ def get_probe(fname):
exit(1)
elif len(seqs) > 2:
print('ERROR: probe file contains too many sequences', file=sys.stderr)
exit(1)

return seqs

Expand Down Expand Up @@ -213,6 +215,9 @@ def match_primer(primers, seqs, mismatches=2, minus_revcomp=True,
except seqpoet.genbank.ParsingError as pe:
print('ERROR: parsing failed in {0}: {1}'.format(fname, pe.message))
sys.exit(1)
except ValueError as ve:
print('ERROR: parsing failed in {0}: {1}'.format(fname, ve.message))
sys.exit(1)

return matches

Expand All @@ -231,14 +236,18 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True,
max(1, m['hitstart'] - extend_downstream),
m['hitend'] + extend_upstream)

match_is_complement = m['strand'] == '-'

features = filter(lambda x: x.location.is_complement == \
(m['strand'] == '-'), locus.features_at_location(location))
(match_is_complement), locus.features_at_location(location))

if len(features) == 0:
print('WARNING: no gene for match in locus {0}'.format(m['seqname']),
file=sys.stderr)
continue

upstream_edge = False
downstream_edge = False
operon_genes = []
for f in features:
# Find upstream genes
Expand All @@ -248,6 +257,15 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True,
operon_genes.append(us)
us = locus.next_upstream(us)

if us is None:
# We hit the edge of the sequence.
if match_is_complement and \
len(locus.seq) - operon_genes[-1].location.end < max_distance:
upstream_edge = True
if not match_is_complement and \
operon_genes[-1].location.start < max_distance:
upstream_edge = True

# Add the current feature
operon_genes.append(f)

Expand All @@ -258,26 +276,53 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True,
operon_genes.append(ds)
ds = locus.next_downstream(ds)

if ds is None:
if match_is_complement and \
operon_genes[-1].location.start < max_distance:
downstream_edge = True
if not match_is_complement and \
len(locus.seq) - operon_genes[-1].location.end < max_distance:
downstream_edge = True

if len(operon_genes) == 1:
print('WARNING: only one gene found for match in locus '
'{0}, not an operon'.format(m['seqname']), file=sys.stderr)

operon_genes = sorted(operon_genes, key=lambda x: x.location.start)
min_start = operon_genes[0].location.start
max_end = operon_genes[-1].location.end

operon_seq = locus.seq[min_start:max_end]
if match_is_complement:
if upstream_edge:
max_end = len(locus.seq)
else:
max_end = operon_genes[-1].location.end + 1
if downstream_edge:
min_start = 1
else:
min_start = operon_genes[0].location.start + 1
else:
if upstream_edge:
min_start = 1
else:
min_start = operon_genes[0].location.start + 1
if downstream_edge:
max_end = len(locus.seq)
else:
max_end = operon_genes[-1].location.end + 1

operon_seq = locus.seq[(min_start - 1):max_end]

if minus_revcomp and m['strand'] == '-':
operon_seq = operon_seq.revcomp()

match_operon.append({
'filename': m['filename'],
'seqname': m['seqname'],
'hitstart': m['hitstart'],
'hitend': m['hitend'],
'hitstart': min_start,
'hitend': max_end,
'length': m['length'],
'strand': m['strand'],
'upstream_edge': upstream_edge,
'downstream_edge': downstream_edge,
'operon': operon_genes,
'seq': operon_seq
})
Expand All @@ -303,7 +348,8 @@ def write_fasta(matches, filename=sys.stdout):
f.close()

def parse_args():
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description='For more detailed '
'documentation, see http://seqpoet.readthedocs.org')

parser.add_argument('genomedir', help=('directory containing the genome '
'files to use (FASTA or GenBank format) or a single GenBank or '
Expand Down Expand Up @@ -363,6 +409,10 @@ def parse_args():
if not os.path.exists(args.probe):
parser.error('file or directory not found: {}'.format(args.probe))

if not os.path.isfile(args.probe) and \
not stat.S_ISFIFO(os.stat(args.probe).st_mode):
parser.error('probe is not a file: {}'. format(args.probe))

if not isinstance(args.out, file):
args.out = os.path.abspath(args.out)
if not os.path.exists(os.path.dirname(args.out)):
Expand Down Expand Up @@ -412,13 +462,16 @@ def main():
matches = match_probe(probe[0], seqs, mismatches=args.mismatches,
minus_revcomp=args.minus_revcomp)

print('Found {0} match{1}'.format(len(matches),
'es' if len(matches) > 1 or len(matches) == 0 else ''), file=sys.stderr)

if len(matches) == 0:
print('WARNING: no matches found', file=sys.stderr)
exit(0)

match_files = set([x['filename'] for x in matches])

print('Found {0} match{1} in {2} file{3}'.format(len(matches),
'es' if len(matches) > 1 or len(matches) == 0 else '',
len(match_files), 's' if len(match_files) > 1 else ''), file=sys.stderr)

# In silico PCR results
if is_primer and args.pcr:
write_fasta(matches, filename=args.out)
Expand All @@ -434,6 +487,44 @@ def main():
print('WARNING: no operons found', file=sys.stderr)
exit(0)

# Some statistics
edges_upstream = sum(x['upstream_edge'] for x in match_features)
edges_downstream = sum(x['downstream_edge'] for x in match_features)

print('Found {0} operon{1} in {2} file{3}'.format(
len(match_features), 's' if len(match_features) != 1 else '',
len(seqs), 's' if len(seqs) != 1 else ''), file=sys.stderr)
print('{0} operon{1} reached the sequence edge upstream of the match'.format(
edges_upstream, 's' if edges_upstream != 1 else ''), file=sys.stderr)
print('{0} operon{1} reached the sequence edge downstream of the match'.format(
edges_downstream, 's' if edges_downstream != 1 else ''), file=sys.stderr)

operon_files = set([x['filename'] for x in match_features])
for s in seqs.itervalues():
fname = s.filename
if fname in match_files:
n_matches = sum([x['filename'] == fname for x in matches])
print('{0}:'.format(os.path.basename(fname)), file=sys.stderr)
print('\t{1} match{2}'.format(fname, n_matches,
'es' if n_matches > 1 else ''), file=sys.stderr)
if fname in operon_files:
operons = [x for x in match_features if x['filename'] == fname]
n_operons = len(operons)
print('\t{0} operon{1}'.format(n_operons,
's' if n_operons > 1 else ''), file=sys.stderr)
for o in operons:
if o['upstream_edge']:
print('operon in {0} hit upstream edge'.format(
o['seqname']), file=sys.stderr)
if o['downstream_edge']:
print('\toperon in {0} hit downstream edge'.format(
o['seqname']), file=sys.stderr)
else:
print('\t0 operons', file=sys.stderr)
else:
print('{0}:\n\tno matches'.format(os.path.basename(fname)),
file=sys.stderr)

write_fasta(match_features, filename=args.out)

if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ Currently, the easiest way of installing seqpoet is to download the `latest
release from GitHub <https://github.com/maehler/seqpoet/releases/latest>`_
and install it manually::

> unzip seqpoet-0.3.3.zip # or tar zxvf seqpoet-0.3.3.tar.gz
> cd seqpoet-0.3.3
> unzip seqpoet-0.3.4.zip # or tar zxvf seqpoet-0.3.4.tar.gz
> cd seqpoet-0.3.4
> python setup.py install

If you want the cutting edge version (potentially unstable), you can clone
Expand Down
2 changes: 1 addition & 1 deletion seqpoet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from sequence import Sequence
import search

__version__ = '0.3.3'
__version__ = '0.3.4'
2 changes: 2 additions & 0 deletions seqpoet/genbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ class GenBankLocus(object):
- **name:** locus name.
- **seq:** a Sequence object with the sequence of the locus.
- **features:** a dictionary containing the features of the locus.
- **header:** a dictionary containing a genbank header.
:param name: the name of the locus.
:param seq: a Sequence object representing the sequence of the locus.
Expand All @@ -426,6 +427,7 @@ def __init__(self, name, seq, features=None, header=None):
name: the name of the locus.
seq: a Sequence object representing the sequence of the locus.
features: a dictionary containing features of the locus.
header: a dictionary containing a genbank header
"""
self.name = name
self.seq = seq
Expand Down
4 changes: 2 additions & 2 deletions seqpoet/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def __init__(self, seq):
"""
self.seq = seq.lower()
if not re.match(r'^[acgtn]*$', self.seq):
raise ValueError('illegal characters in sequence, currently ',
'only supports DNA sequences')
raise ValueError('illegal characters in sequence, '
'currently only supports DNA sequences')

def revcomp(self):
"""Get the reverse complement of the sequence.
Expand Down
14 changes: 8 additions & 6 deletions seqpoet/tests/test_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
class TestFindOperon:

def setup(self):
gb_dir = ('/Users/niklasm/Dropbox/operon_extractor/data_genbank')
gb_dir = os.path.expanduser('~/Dropbox/operon_extractor/data_genbank')
gb_fname = os.path.join(gb_dir, 'LMG718-cremoris.gb')
if not os.path.exists(gb_fname):
raise SkipTest
Expand All @@ -25,8 +25,7 @@ def setup(self):
gb_fname: gb
}
self.matches = [{
'filename': ('/Users/niklasm/Dropbox/operon_extractor/'
'data_genbank/LMG718-cremoris.gb'),
'filename': gb_fname,
'hitend': 3360,
'hitstart': 3311,
'length': 50,
Expand All @@ -37,7 +36,7 @@ def setup(self):
'strand': '+'
}]
self.minus_matches = [{
'filename': '/Users/niklasm/Dropbox/operon_extractor/data_genbank/LMG718-cremoris.gb',
'filename': gb_fname,
'hitend': 3360,
'hitstart': 3311,
'length': 50,
Expand All @@ -60,7 +59,7 @@ def test_operon_find_extend_upstream(self):
assert len(res[0]['operon']) == 2

operon_len = len(res[0]['seq'])
assert operon_len == 3296, 'length is {0}'.format(operon_len)
assert operon_len == 3303, 'length is {0}'.format(operon_len)

def test_operon_find_extend_downstream(self):
res = seqpoet_script.find_operon(self.matches, self.seqs,
Expand All @@ -83,7 +82,10 @@ def test_revcomp_operon_find_extend_upstream(self):
assert len(res) == 1
assert len(res[0]['operon']) == 2

assert not res[0]['downstream_edge']
assert res[0]['upstream_edge']

assert all(x.location.is_complement for x in res[0]['operon'])

operon_len = len(res[0]['seq'])
assert operon_len == 2135, 'length is {0}'.format(operon_len)
assert operon_len == 2378, 'length is {0}'.format(operon_len)

0 comments on commit a483405

Please sign in to comment.