From 20b99bcf9b176c7d649910457785fbe213a4ae1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 11 Oct 2015 20:57:31 +0200 Subject: [PATCH 01/15] Ignore .eggs directory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0125fc4..f27dcb0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /dist/ /*.egg-info /*.egg +/.eggs /build/ /cover/ /.coverage From 8ce784c384fd2e1ce6153a2ef4df4bf8085f25f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 11 Oct 2015 21:00:11 +0200 Subject: [PATCH 02/15] Exit if probe file has too many sequences --- bin/seqpoet | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/seqpoet b/bin/seqpoet index 9969d89..fc3cce9 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -22,6 +22,7 @@ def get_probe(fname): exit(1) elif len(seqs) > 2: print('ERROR: probe file contains too many sequences', file=sys.stderr) + exit(1) return seqs From a40fc70677a8c8d7ff742b7123d285ca5c0e060b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Mon, 12 Oct 2015 13:19:23 +0200 Subject: [PATCH 03/15] Documentation --- seqpoet/genbank.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py index 6a736ed..c69c9ca 100644 --- a/seqpoet/genbank.py +++ b/seqpoet/genbank.py @@ -413,6 +413,7 @@ class GenBankLocus(object): - **name:** locus name. - **seq:** a Sequence object with the sequence of the locus. - **features:** a dictionary containing the features of the locus. + - **header:** a dictionary containing a genbank header. :param name: the name of the locus. :param seq: a Sequence object representing the sequence of the locus. @@ -426,6 +427,7 @@ def __init__(self, name, seq, features=None, header=None): name: the name of the locus. seq: a Sequence object representing the sequence of the locus. features: a dictionary containing features of the locus. + header: a dictionary containing a genbank header """ self.name = name self.seq = seq From c1c238d6514e37bb64ca39c06dfabaf241e0fd3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 25 Oct 2015 19:03:39 +0100 Subject: [PATCH 04/15] Check that probe is a file --- bin/seqpoet | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/seqpoet b/bin/seqpoet index fc3cce9..4aec8d7 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -364,6 +364,9 @@ def parse_args(): if not os.path.exists(args.probe): parser.error('file or directory not found: {}'.format(args.probe)) + if not os.path.isfile(args.probe): + parser.error('probe is not a file: {}'. format(args.probe)) + if not isinstance(args.out, file): args.out = os.path.abspath(args.out) if not os.path.exists(os.path.dirname(args.out)): From 5113db75794c51c596d9c380987b836e652f6f2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Wed, 4 Nov 2015 21:52:31 +0100 Subject: [PATCH 05/15] Output statistics Right now we only get numbers on how many of the operons fell outside the edges of the matching sequences. This should be extended to include statistics on file and perhaps also sequence level. --- bin/seqpoet | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/bin/seqpoet b/bin/seqpoet index 4aec8d7..faf78d2 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -240,6 +240,8 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, file=sys.stderr) continue + upstream_edge = False + downstream_edge = False operon_genes = [] for f in features: # Find upstream genes @@ -249,6 +251,10 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, operon_genes.append(us) us = locus.next_upstream(us) + if us is None: + # We hit the edge of the sequence. + upstream_edge = True + # Add the current feature operon_genes.append(f) @@ -259,6 +265,9 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, operon_genes.append(ds) ds = locus.next_downstream(ds) + if ds is None: + downstream_edge = True + if len(operon_genes) == 1: print('WARNING: only one gene found for match in locus ' '{0}, not an operon'.format(m['seqname']), file=sys.stderr) @@ -279,6 +288,8 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, 'hitend': m['hitend'], 'length': m['length'], 'strand': m['strand'], + 'upstream_edge': upstream_edge, + 'downstream_edge': downstream_edge, 'operon': operon_genes, 'seq': operon_seq }) @@ -438,6 +449,18 @@ def main(): print('WARNING: no operons found', file=sys.stderr) exit(0) + # Some statistics + edges_upstream = sum(x['upstream_edge'] for x in match_features) + edges_downstream = sum(x['downstream_edge'] for x in match_features) + + print('Found {0} operon{1} in {2} file{3}'.format( + len(match_features), 's' if len(match_features) != 1 else '', + len(seqs), 's' if len(seqs) != 1 else ''), file=sys.stderr) + print('{0} operon{1} reached the sequence edge upstream of the match'.format( + edges_upstream, 's' if edges_upstream != 1 else ''), file=sys.stderr) + print('{0} operon{1} reached the sequence edge downstream of the match'.format( + edges_downstream, 's' if edges_downstream != 1 else ''), file=sys.stderr) + write_fasta(match_features, filename=args.out) if __name__ == '__main__': From f6cca29805959fd913643f1cb33565e30cf56ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Wed, 4 Nov 2015 21:57:31 +0100 Subject: [PATCH 06/15] Better error message for illegal characters --- bin/seqpoet | 3 +++ seqpoet/sequence.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/seqpoet b/bin/seqpoet index faf78d2..c7e7a3b 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -214,6 +214,9 @@ def match_primer(primers, seqs, mismatches=2, minus_revcomp=True, except seqpoet.genbank.ParsingError as pe: print('ERROR: parsing failed in {0}: {1}'.format(fname, pe.message)) sys.exit(1) + except ValueError as ve: + print('ERROR: parsing failed in {0}: {1}'.format(fname, ve.message)) + sys.exit(1) return matches diff --git a/seqpoet/sequence.py b/seqpoet/sequence.py index fc62217..ceec66f 100644 --- a/seqpoet/sequence.py +++ b/seqpoet/sequence.py @@ -33,8 +33,8 @@ def __init__(self, seq): """ self.seq = seq.lower() if not re.match(r'^[acgtn]*$', self.seq): - raise ValueError('illegal characters in sequence, currently ', - 'only supports DNA sequences') + raise ValueError('illegal characters in sequence, ' + 'currently only supports DNA sequences') def revcomp(self): """Get the reverse complement of the sequence. From 4a078c38c9632979a76ea5ccd0a7090a6e4eb408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Thu, 5 Nov 2015 23:02:18 +0100 Subject: [PATCH 07/15] Output more stats --- bin/seqpoet | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/bin/seqpoet b/bin/seqpoet index c7e7a3b..00eeca5 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -430,13 +430,16 @@ def main(): matches = match_probe(probe[0], seqs, mismatches=args.mismatches, minus_revcomp=args.minus_revcomp) - print('Found {0} match{1}'.format(len(matches), - 'es' if len(matches) > 1 or len(matches) == 0 else ''), file=sys.stderr) - if len(matches) == 0: print('WARNING: no matches found', file=sys.stderr) exit(0) + match_files = set([x['filename'] for x in matches]) + + print('Found {0} match{1} in {2} file{3}'.format(len(matches), + 'es' if len(matches) > 1 or len(matches) == 0 else '', + len(match_files), 's' if len(match_files) > 1 else ''), file=sys.stderr) + # In silico PCR results if is_primer and args.pcr: write_fasta(matches, filename=args.out) @@ -464,6 +467,32 @@ def main(): print('{0} operon{1} reached the sequence edge downstream of the match'.format( edges_downstream, 's' if edges_downstream != 1 else ''), file=sys.stderr) + operon_files = set([x['filename'] for x in match_features]) + for s in seqs.itervalues(): + fname = s.filename + if fname in match_files: + n_matches = sum([x['filename'] == fname for x in matches]) + print('{0}:'.format(os.path.basename(fname)), file=sys.stderr) + print('\t{1} match{2}'.format(fname, n_matches, + 'es' if n_matches > 1 else ''), file=sys.stderr) + if fname in operon_files: + operons = [x for x in match_features if x['filename'] == fname] + n_operons = len(operons) + print('\t{0} operon{1}'.format(n_operons, + 's' if n_operons > 1 else ''), file=sys.stderr) + for o in operons: + if o['upstream_edge']: + print('operon in {0} hit upstream edge'.format( + o['seqname']), file=sys.stderr) + if o['downstream_edge']: + print('\toperon in {0} hit downstream edge'.format( + o['seqname']), file=sys.stderr) + else: + print('\t0 operons', file=sys.stderr) + else: + print('{0}:\n\tno matches'.format(os.path.basename(fname)), + file=sys.stderr) + write_fasta(match_features, filename=args.out) if __name__ == '__main__': From 55ced6e778a46565c18331ae761a668e165acbd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Fri, 6 Nov 2015 14:25:57 +0100 Subject: [PATCH 08/15] Add link to readthedocs in help output --- bin/seqpoet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/seqpoet b/bin/seqpoet index 00eeca5..46f8119 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -318,7 +318,8 @@ def write_fasta(matches, filename=sys.stdout): f.close() def parse_args(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(description='For more detailed ' + 'documentation, see http://seqpoet.readthedocs.org') parser.add_argument('genomedir', help=('directory containing the genome ' 'files to use (FASTA or GenBank format) or a single GenBank or ' From f657ed36a1dbc43ad21b436c42827cbb51c83f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Fri, 6 Nov 2015 19:39:49 +0100 Subject: [PATCH 09/15] Fix #12 --- bin/seqpoet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/seqpoet b/bin/seqpoet index 46f8119..c72b981 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -287,8 +287,8 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, match_operon.append({ 'filename': m['filename'], 'seqname': m['seqname'], - 'hitstart': m['hitstart'], - 'hitend': m['hitend'], + 'hitstart': min_start + 1, + 'hitend': max_end + 1, 'length': m['length'], 'strand': m['strand'], 'upstream_edge': upstream_edge, From f3fb3fdce71d91177ae92de893c87c058402ae42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sat, 7 Nov 2015 18:01:43 +0100 Subject: [PATCH 10/15] Enable testing on more systems --- seqpoet/tests/test_script.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/seqpoet/tests/test_script.py b/seqpoet/tests/test_script.py index 19ab874..3d661a2 100644 --- a/seqpoet/tests/test_script.py +++ b/seqpoet/tests/test_script.py @@ -15,7 +15,7 @@ class TestFindOperon: def setup(self): - gb_dir = ('/Users/niklasm/Dropbox/operon_extractor/data_genbank') + gb_dir = os.path.expanduser('~/Dropbox/operon_extractor/data_genbank') gb_fname = os.path.join(gb_dir, 'LMG718-cremoris.gb') if not os.path.exists(gb_fname): raise SkipTest @@ -25,8 +25,7 @@ def setup(self): gb_fname: gb } self.matches = [{ - 'filename': ('/Users/niklasm/Dropbox/operon_extractor/' - 'data_genbank/LMG718-cremoris.gb'), + 'filename': gb_fname, 'hitend': 3360, 'hitstart': 3311, 'length': 50, @@ -37,7 +36,7 @@ def setup(self): 'strand': '+' }] self.minus_matches = [{ - 'filename': '/Users/niklasm/Dropbox/operon_extractor/data_genbank/LMG718-cremoris.gb', + 'filename': gb_fname, 'hitend': 3360, 'hitstart': 3311, 'length': 50, From 265aa26f5d8038db7d1388d91aae0a3dcec78d37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sat, 7 Nov 2015 18:02:32 +0100 Subject: [PATCH 11/15] Probe/primers can be FIFO --- bin/seqpoet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/seqpoet b/bin/seqpoet index c72b981..bf95dd7 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -3,6 +3,7 @@ from __future__ import print_function import argparse import itertools import os +import stat import sys import seqpoet @@ -379,7 +380,8 @@ def parse_args(): if not os.path.exists(args.probe): parser.error('file or directory not found: {}'.format(args.probe)) - if not os.path.isfile(args.probe): + if not os.path.isfile(args.probe) and \ + not stat.S_ISFIFO(os.stat(args.probe).st_mode): parser.error('probe is not a file: {}'. format(args.probe)) if not isinstance(args.out, file): From a7362bd7a8bbdf459e6468c81fc0bff0ee67f759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sat, 7 Nov 2015 18:04:09 +0100 Subject: [PATCH 12/15] Better edge checks for operon search The edge detection introduced with 5113db7 wasn't completely correct. This should now be fixed, and appropriate tests have been formulated. --- bin/seqpoet | 45 +++++++++++++++++++++++++++++------- seqpoet/tests/test_script.py | 7 ++++-- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/bin/seqpoet b/bin/seqpoet index bf95dd7..deca9ed 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -236,8 +236,10 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, max(1, m['hitstart'] - extend_downstream), m['hitend'] + extend_upstream) + match_is_complement = m['strand'] == '-' + features = filter(lambda x: x.location.is_complement == \ - (m['strand'] == '-'), locus.features_at_location(location)) + (match_is_complement), locus.features_at_location(location)) if len(features) == 0: print('WARNING: no gene for match in locus {0}'.format(m['seqname']), @@ -257,7 +259,12 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, if us is None: # We hit the edge of the sequence. - upstream_edge = True + if match_is_complement and \ + len(locus.seq) - operon_genes[-1].location.end < max_distance: + upstream_edge = True + if not match_is_complement and \ + operon_genes[-1].location.start < max_distance: + upstream_edge = True # Add the current feature operon_genes.append(f) @@ -270,17 +277,39 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, ds = locus.next_downstream(ds) if ds is None: - downstream_edge = True + if match_is_complement and \ + operon_genes[-1].location.start < max_distance: + downstream_edge = True + if not match_is_complement and \ + len(locus.seq) - operon_genes[-1].location.end < max_distance: + downstream_edge = True if len(operon_genes) == 1: print('WARNING: only one gene found for match in locus ' '{0}, not an operon'.format(m['seqname']), file=sys.stderr) operon_genes = sorted(operon_genes, key=lambda x: x.location.start) - min_start = operon_genes[0].location.start - max_end = operon_genes[-1].location.end - operon_seq = locus.seq[min_start:max_end] + if match_is_complement: + if upstream_edge: + max_end = len(locus.seq) + else: + max_end = operon_genes[-1].location.end + 1 + if downstream_edge: + min_start = 1 + else: + min_start = operon_genes[0].location.start + 1 + else: + if upstream_edge: + min_start = 1 + else: + operon_genes[0].location.start + 1 + if downstream_edge: + max_end = len(locus.seq) + else: + max_end = operon_genes[-1].location.end + 1 + + operon_seq = locus.seq[(min_start - 1):max_end] if minus_revcomp and m['strand'] == '-': operon_seq = operon_seq.revcomp() @@ -288,8 +317,8 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, match_operon.append({ 'filename': m['filename'], 'seqname': m['seqname'], - 'hitstart': min_start + 1, - 'hitend': max_end + 1, + 'hitstart': min_start, + 'hitend': max_end, 'length': m['length'], 'strand': m['strand'], 'upstream_edge': upstream_edge, diff --git a/seqpoet/tests/test_script.py b/seqpoet/tests/test_script.py index 3d661a2..21815fc 100644 --- a/seqpoet/tests/test_script.py +++ b/seqpoet/tests/test_script.py @@ -59,7 +59,7 @@ def test_operon_find_extend_upstream(self): assert len(res[0]['operon']) == 2 operon_len = len(res[0]['seq']) - assert operon_len == 3296, 'length is {0}'.format(operon_len) + assert operon_len == 3303, 'length is {0}'.format(operon_len) def test_operon_find_extend_downstream(self): res = seqpoet_script.find_operon(self.matches, self.seqs, @@ -82,7 +82,10 @@ def test_revcomp_operon_find_extend_upstream(self): assert len(res) == 1 assert len(res[0]['operon']) == 2 + assert not res[0]['downstream_edge'] + assert res[0]['upstream_edge'] + assert all(x.location.is_complement for x in res[0]['operon']) operon_len = len(res[0]['seq']) - assert operon_len == 2135, 'length is {0}'.format(operon_len) + assert operon_len == 2378, 'length is {0}'.format(operon_len) From 351387ae1967992914e189395e87b8656a7f39a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sat, 7 Nov 2015 18:38:44 +0100 Subject: [PATCH 13/15] Fix small error --- bin/seqpoet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/seqpoet b/bin/seqpoet index deca9ed..d2ff942 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -303,7 +303,7 @@ def find_operon(matches, seqs, max_distance=500, minus_revcomp=True, if upstream_edge: min_start = 1 else: - operon_genes[0].location.start + 1 + min_start = operon_genes[0].location.start + 1 if downstream_edge: max_end = len(locus.seq) else: From 8370305df659206f52b419c1517090cfe98f3ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 8 Nov 2015 19:06:06 +0100 Subject: [PATCH 14/15] Add link to readthedocs --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index dcd1775..e3e9b23 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ and operon extraction in prokaryotes. The secondary purpose of seqpoet is to be a Python package that can be used for handling sequence data in the form of FASTA and GenBank files. +## Documentation + +For full documentation, please see http://seqpoet.readthedocs.org. + ## Requirements Currently, the only requirement is Python >= 2.7, but Python 3 is not supported From 39cd117d251f9196b2dec7cd64abfe3e32e0a5a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 8 Nov 2015 19:09:34 +0100 Subject: [PATCH 15/15] Bump version number --- docs/installation.rst | 4 ++-- seqpoet/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index 81edc90..b966c10 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -9,8 +9,8 @@ Currently, the easiest way of installing seqpoet is to download the `latest release from GitHub `_ and install it manually:: - > unzip seqpoet-0.3.3.zip # or tar zxvf seqpoet-0.3.3.tar.gz - > cd seqpoet-0.3.3 + > unzip seqpoet-0.3.4.zip # or tar zxvf seqpoet-0.3.4.tar.gz + > cd seqpoet-0.3.4 > python setup.py install If you want the cutting edge version (potentially unstable), you can clone diff --git a/seqpoet/__init__.py b/seqpoet/__init__.py index 40bd8b7..5644704 100644 --- a/seqpoet/__init__.py +++ b/seqpoet/__init__.py @@ -3,4 +3,4 @@ from sequence import Sequence import search -__version__ = '0.3.3' +__version__ = '0.3.4'