From 601b6624f0ac0b630b4cfac4bc29d03f62759266 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:25:36 +0200
Subject: [PATCH 01/40] Ignore compiled script

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 96e4050..de16f90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@
 /.coverage
 /docs/_build
 /data
+/bin/propexc

From 3716caf3f9efc79e53f5e130aa0700b04c8b7a88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 16:13:55 +0200
Subject: [PATCH 02/40] More general parsing of genbank features

---
 propex/genbank.py | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index a4f0ce8..167ed8c 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -283,9 +283,7 @@ def features_at_location(self, location):
                   overlapping the location.
         """
         features = []
-        for feat in GenBank.features:
-            if feat not in self.features:
-                continue
+        for feat in self.features.iterkeys():
             for feature in self.features[feat]:
                 if feature.location.overlaps(location):
                     features.append(feature)
@@ -380,9 +378,6 @@ class GenBank(object):
     :raises: ValueError if parsing fails.
     """
 
-    #: List of supported features.
-    features = ['CDS']
-
     def __init__(self, fname):
         """GenBank constructor.
 
@@ -390,7 +385,7 @@ def __init__(self, fname):
             fname: filename of the GenBank file.
         """
         self.filename = fname
-        self.index = self._index()
+        self.index, self.features = self._index()
 
     def _index(self):
         """Create and index of a the GenBank object.
@@ -399,7 +394,9 @@ def _index(self):
             a list of dictionaries where each element in the list
             represents a locus.
         """
+        features = set()
         indexdicts = []
+        in_features = False
         with open(self.filename) as f:
             offset = 0
             for lineno, line in enumerate(f):
@@ -411,24 +408,30 @@ def _index(self):
                     indexdicts.append({})
                     indexdicts[-1]['name'] = current_locus
                     indexdicts[-1]['offset'] = offset
-                if line.strip().split()[0] == 'CDS':
-                    if 'CDS' not in indexdicts[-1]:
-                        indexdicts[-1]['CDS'] = []
-                    indexdicts[-1]['CDS'].append({
+                if line.strip().split()[0] == 'ORIGIN':
+                    indexdicts[-1]['ORIGIN'] = offset + len(line)
+                    in_features = False
+                if in_features and line[5] != ' ':
+                    feature = line.strip().split()[0]
+                    features.add(feature)
+                    if feature not in indexdicts[-1]:
+                        indexdicts[-1][feature] = []
+                    indexdicts[-1][feature].append({
                         'offset': offset,
                         'location': Location(line.strip().split()[1])
                     })
-                if line.strip().split()[0] == 'ORIGIN':
-                    indexdicts[-1]['ORIGIN'] = offset + len(line)
+                if line.startswith('FEATURES'):
+                    in_features = True
                 offset += len(line)
 
-        # Sort the CDS according to start position
-        for s in indexdicts:
-            if 'CDS' not in s:
-                continue
-            s['CDS'] = sorted(s['CDS'], key=lambda x: x['location'].start)
+        # Sort the features according to start position
+        for f in features:
+            for s in indexdicts:
+                if f not in s:
+                    continue
+                s[f] = sorted(s[f], key=lambda x: x['location'].start)
 
-        return indexdicts
+        return indexdicts, features
 
     def __getitem__(self, index):
         """Get a specific GenBankLocus object.

From 3d3a8181298480bd962493e8f6c7f22d41331ce4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:02:37 +0200
Subject: [PATCH 03/40] Add custom exceptions

---
 bin/propex                   |  2 +-
 propex/genbank.py            | 14 ++++++++++----
 propex/tests/test_genbank.py |  4 ++--
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/bin/propex b/bin/propex
index 31d5024..7fe112f 100644
--- a/bin/propex
+++ b/bin/propex
@@ -32,7 +32,7 @@ def get_single_sequence(fname, genbank_only=False, stop_on_error=False):
     try:
         seq = propex.GenBank(fname)
         genbank_success = True
-    except ValueError:
+    except propex.genbank.ParsingError:
         pass
 
     if not genbank_success and not genbank_only:
diff --git a/propex/genbank.py b/propex/genbank.py
index 167ed8c..7a5fc17 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -10,6 +10,9 @@
 
 from propex.sequence import Sequence
 
+class LocationError(Exception):
+    pass
+
 class Location(object):
 
     """Represent a GenBank feature location.
@@ -93,7 +96,7 @@ def _parse(self):
             is located on the complement strand. Returned positions
             are 0-based.
         Raises:
-            ValueError: if the location string is not valid.
+            LocationError: if the location string is not valid.
         """
         locstring = self.locstring
         re_name = None
@@ -107,7 +110,7 @@ def _parse(self):
                 re_name = name
                 regex = r
         if re_name is None:
-            raise ValueError('unknown location string: {0}'.format(self.locstring))
+            raise LocationError('unknown location string: {0}'.format(self.locstring))
 
         if re_name == 'single':
             start = end = int(regex.match(locstring).group(1))
@@ -366,6 +369,9 @@ def _neighbor(self, feature, downstream=True):
 
         return self.features[ftype][findex]
 
+class ParsingError(Exception):
+    pass
+
 class GenBank(object):
 
     """Represent a GenBank file.
@@ -375,7 +381,7 @@ class GenBank(object):
         * index: a list of dictionaries representing an index of the file.
 
     :param fname: filename of the GenBank file.
-    :raises: ValueError if parsing fails.
+    :raises: :py:exc:`.ParsingError` if parsing fails.
     """
 
     def __init__(self, fname):
@@ -401,7 +407,7 @@ def _index(self):
             offset = 0
             for lineno, line in enumerate(f):
                 if lineno == 0 and not line.strip().startswith('LOCUS'):
-                    raise ValueError('does not look like a GenBank file: {0}' \
+                    raise ParsingError('does not look like a GenBank file: {0}' \
                         .format(self.filename))
                 if line.strip().split()[0] == 'LOCUS':
                     current_locus = line.strip().split()[1]
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 30c58e5..d25a90d 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -62,7 +62,7 @@ def test_get_locus_from_name(self):
         assert len(loci) > 0
         assert len(loci[0].seq) == 8967
 
-    @raises(ValueError)
+    @raises(propex.genbank.ParsingError)
     def test_parse_fasta(self):
         gb = propex.GenBank(os.path.join(self.genbankdir, '..', 'data_fasta',
             'LMG718-cremoris.fasta'))
@@ -331,7 +331,7 @@ def test_overlap(self):
         assert not loc4.overlaps(loc3)
         assert not loc1.overlaps(loc5)
 
-    @raises(ValueError)
+    @raises(propex.genbank.LocationError)
     def test_invalid_location(self):
         loc = Location('123..noloc')
 

From b3682453911fd00c24def000d30e2cac9e02c978 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:07:48 +0200
Subject: [PATCH 04/40] Make location regexes private

---
 propex/genbank.py            | 29 ++++++++++++++---------------
 propex/tests/test_genbank.py | 20 ++++++++++----------
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 7a5fc17..c70733a 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -21,7 +21,7 @@ class Location(object):
     http://www.insdc.org/files/feature_table.html#3.4
 
     Location (and GenBank files) are using 1-based positions. To make
-    location-bases string handling easier, the Location class represents
+    location-based string handling easier, the Location class represents
     the locations internally as 0-based::
 
         >>> loc = Location('42..84')
@@ -50,18 +50,17 @@ class Location(object):
     """
 
     #: Regular expression for finding complement locations.
-    loc_complement = re.compile(r'^complement\((.+)\)$')
-
+    _re_complement = re.compile(r'^complement\((.+)\)$')
     #: Regular expression for single base locations.
-    loc_single = re.compile(r'^(\d+)$')
+    _re_single = re.compile(r'^(\d+)$')
     #: Regular expression for range locations,
-    loc_range = re.compile(r'^(\d+)\.\.(\d+)$')
+    _re_range = re.compile(r'^(\d+)\.\.(\d+)$')
     #: Regular expression for locations with unknown lower boundary.
-    loc_lower_unknown = re.compile(r'^<(\d+)\.\.(\d+)$')
+    _re_lower_unknown = re.compile(r'^<(\d+)\.\.(\d+)$')
     #: Regular expression for locations with unknown upper boundary.
-    loc_upper_unknown = re.compile(r'^(\d+)\.\.>(\d+)$')
+    _re_upper_unknown = re.compile(r'^(\d+)\.\.>(\d+)$')
     #: Regular expression for single base locations within a range.
-    loc_one_of = re.compile(r'^(\d+)\.(\d+)$')
+    _re_one_of = re.compile(r'^(\d+)\.(\d+)$')
 
     def __init__(self, locstring):
         """Location constructor.
@@ -80,11 +79,11 @@ def _regex_dict(self):
             expression.
         """
         return {
-            'single': Location.loc_single,
-            'range': Location.loc_range,
-            'upper_unknown': Location.loc_lower_unknown,
-            'lower_unknown': Location.loc_upper_unknown,
-            'one_of': Location.loc_one_of
+            'single': Location._re_single,
+            'range': Location._re_range,
+            'upper_unknown': Location._re_lower_unknown,
+            'lower_unknown': Location._re_upper_unknown,
+            'one_of': Location._re_one_of
         }
 
     def _parse(self):
@@ -102,9 +101,9 @@ def _parse(self):
         re_name = None
         regex = None
         is_complement = False
-        if Location.loc_complement.match(locstring):
+        if Location._re_complement.match(locstring):
             is_complement = True
-            locstring = Location.loc_complement.match(locstring).group(1)
+            locstring = Location._re_complement.match(locstring).group(1)
         for name, r in self._regex_dict().iteritems():
             if r.match(locstring) is not None:
                 re_name = name
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index d25a90d..8df9d77 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -218,30 +218,30 @@ def setUp(self):
         self.complement2 = 'complement(467)'
 
     def test_complement(self):
-        match = Location.loc_complement.match(self.complement)
+        match = Location._re_complement.match(self.complement)
         assert match
         assert match.group(1) == '340..565'
-        match = Location.loc_complement.match(self.complement2)
+        match = Location._re_complement.match(self.complement2)
         assert match
         assert match.group(1) == '467'
-        match = Location.loc_complement.match(self.one_of)
+        match = Location._re_complement.match(self.one_of)
         assert match is None
 
     def test_range_regex(self):
-        match = Location.loc_range.match(self.range)
-        assert Location.loc_one_of.match(self.range) is None
+        match = Location._re_range.match(self.range)
+        assert Location._re_one_of.match(self.range) is None
         assert match
         assert match.group(1) == '340'
         assert match.group(2) == '565'
 
     def test_single_regex(self):
-        match = Location.loc_single.match(self.single)
+        match = Location._re_single.match(self.single)
         assert match
         assert match.group(1) == '467'
 
     def test_lower_unknown(self):
-        match1 = Location.loc_lower_unknown.match(self.lower_unknown)
-        match2 = Location.loc_lower_unknown.match(self.lower_unknown2)
+        match1 = Location._re_lower_unknown.match(self.lower_unknown)
+        match2 = Location._re_lower_unknown.match(self.lower_unknown2)
         assert match1
         assert match1.group(1) == '345'
         assert match1.group(2) == '500'
@@ -250,13 +250,13 @@ def test_lower_unknown(self):
         assert match2.group(2) == '888'
 
     def test_upper_unknown(self):
-        match = Location.loc_upper_unknown.match(self.upper_unknown)
+        match = Location._re_upper_unknown.match(self.upper_unknown)
         assert match
         assert match.group(1) == '1'
         assert match.group(2) == '888'
 
     def test_one_of(self):
-        match = Location.loc_one_of.match(self.one_of)
+        match = Location._re_one_of.match(self.one_of)
         assert match
         assert match.group(1) == '102'
         assert match.group(2) == '110'

From 6f72b1628cf6ee944cc178e91a7730db16d81b62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:20:12 +0200
Subject: [PATCH 05/40] Add missing case of unknown boundaries

There was no way of identifying a location with both lower and upper
boundaries unknown. This could definitely be done more elegantly,
but that is something for later.
---
 propex/genbank.py            | 3 +++
 propex/tests/test_genbank.py | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/propex/genbank.py b/propex/genbank.py
index c70733a..19fbca0 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -59,6 +59,8 @@ class Location(object):
     _re_lower_unknown = re.compile(r'^<(\d+)\.\.(\d+)$')
     #: Regular expression for locations with unknown upper boundary.
     _re_upper_unknown = re.compile(r'^(\d+)\.\.>(\d+)$')
+    #: Regular expression for locations with unknown upper and lower boundary.
+    _re_lower_upper_unknown = re.compile(r'^<(\d+)\.\.>(\d+)$')
     #: Regular expression for single base locations within a range.
     _re_one_of = re.compile(r'^(\d+)\.(\d+)$')
 
@@ -83,6 +85,7 @@ def _regex_dict(self):
             'range': Location._re_range,
             'upper_unknown': Location._re_lower_unknown,
             'lower_unknown': Location._re_upper_unknown,
+            'lower_upper_unkown': Location._re_lower_upper_unknown,
             'one_of': Location._re_one_of
         }
 
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 8df9d77..75ccbc3 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -213,6 +213,7 @@ def setUp(self):
         self.lower_unknown = '<345..500'
         self.lower_unknown2 = '<1..888'
         self.upper_unknown = '1..>888'
+        self.lower_upper_unkown = '<1..>888'
         self.one_of = '102.110'
         self.complement = 'complement(340..565)'
         self.complement2 = 'complement(467)'
@@ -255,6 +256,12 @@ def test_upper_unknown(self):
         assert match.group(1) == '1'
         assert match.group(2) == '888'
 
+    def test_lower_upper_unknown(self):
+        match = Location._re_lower_upper_unknown.match(self.lower_upper_unkown)
+        assert match
+        assert match.group(1) == '1'
+        assert match.group(2) == '888'
+
     def test_one_of(self):
         match = Location._re_one_of.match(self.one_of)
         assert match

From e4764c29e3026d663cfe05c62970a2df81c74ae6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:21:32 +0200
Subject: [PATCH 06/40] Add safeguard for empty lines

---
 propex/genbank.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/propex/genbank.py b/propex/genbank.py
index 19fbca0..ad721f5 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -411,6 +411,8 @@ def _index(self):
                 if lineno == 0 and not line.strip().startswith('LOCUS'):
                     raise ParsingError('does not look like a GenBank file: {0}' \
                         .format(self.filename))
+                if len(line.strip()) == 0:
+                    continue
                 if line.strip().split()[0] == 'LOCUS':
                     current_locus = line.strip().split()[1]
                     indexdicts.append({})

From 1026a107dcb6509d79b23b836462352a005131b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:23:09 +0200
Subject: [PATCH 07/40] Split genbank tests into general and local

The local class requires some files that I cannot put in the repo.
Instead I split the class into two: one for these local files and
one for the files I add to the repo. Currently there's only one
file in the repo.
---
 propex/tests/data/U49845.gb  | 167 +++++++++++++++++++++++++++++++++++
 propex/tests/test_genbank.py |  10 +++
 2 files changed, 177 insertions(+)
 create mode 100644 propex/tests/data/U49845.gb

diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb
new file mode 100644
index 0000000..c56f129
--- /dev/null
+++ b/propex/tests/data/U49845.gb
@@ -0,0 +1,167 @@
+LOCUS       SCU49845                5028 bp    DNA     linear   PLN 23-MAR-2010
+DEFINITION  Saccharomyces cerevisiae TCP1-beta gene, partial cds; and Axl2p
+            (AXL2) and Rev7p (REV7) genes, complete cds.
+ACCESSION   U49845
+VERSION     U49845.1  GI:1293613
+KEYWORDS    .
+SOURCE      Saccharomyces cerevisiae (baker's yeast)
+  ORGANISM  Saccharomyces cerevisiae
+            Eukaryota; Fungi; Dikarya; Ascomycota; Saccharomycotina;
+            Saccharomycetes; Saccharomycetales; Saccharomycetaceae;
+            Saccharomyces.
+REFERENCE   1  (bases 1 to 5028)
+  AUTHORS   Roemer,T., Madden,K., Chang,J. and Snyder,M.
+  TITLE     Selection of axial growth sites in yeast requires Axl2p, a novel
+            plasma membrane glycoprotein
+  JOURNAL   Genes Dev. 10 (7), 777-793 (1996)
+   PUBMED   8846915
+REFERENCE   2  (bases 1 to 5028)
+  AUTHORS   Roemer,T.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (22-FEB-1996) Biology, Yale University, New Haven, CT
+            06520, USA
+FEATURES             Location/Qualifiers
+     source          1..5028
+                     /organism="Saccharomyces cerevisiae"
+                     /mol_type="genomic DNA"
+                     /db_xref="taxon:4932"
+                     /chromosome="IX"
+     mRNA            <1..>206
+                     /product="TCP1-beta"
+     CDS             <1..206
+                     /codon_start=3
+                     /product="TCP1-beta"
+                     /protein_id="AAA98665.1"
+                     /db_xref="GI:1293614"
+                     /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
+                     AEVLLRVDNIIRARPRTANRQHM"
+     gene            <687..>3158
+                     /gene="AXL2"
+     mRNA            <687..>3158
+                     /gene="AXL2"
+                     /product="Axl2p"
+     CDS             687..3158
+                     /gene="AXL2"
+                     /note="plasma membrane glycoprotein"
+                     /codon_start=1
+                     /product="Axl2p"
+                     /protein_id="AAA98666.1"
+                     /db_xref="GI:1293615"
+                     /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
+                     TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
+                     VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
+                     VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
+                     TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
+                     YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
+                     DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
+                     DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
+                     NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
+                     CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
+                     NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
+                     SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
+                     YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
+                     HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
+                     VDFSNKSNVNVGQVKDIHGRIPEML"
+     gene            complement(<3300..>4037)
+                     /gene="REV7"
+     mRNA            complement(<3300..>4037)
+                     /gene="REV7"
+                     /product="Rev7p"
+     CDS             complement(3300..4037)
+                     /gene="REV7"
+                     /codon_start=1
+                     /product="Rev7p"
+                     /protein_id="AAA98667.1"
+                     /db_xref="GI:1293616"
+                     /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
+                     FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
+                     KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
+                     RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
+                     LISGDDKILNGVYSQYEEGESIFGSLF"
+ORIGIN      
+        1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
+       61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
+      121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
+      181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
+      241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
+      301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
+      361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
+      421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
+      481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
+      541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
+      601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
+      661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
+      721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
+      781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
+      841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
+      901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
+      961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
+     1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
+     1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
+     1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
+     1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
+     1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
+     1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
+     1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
+     1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
+     1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
+     1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
+     1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
+     1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
+     1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
+     1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
+     1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
+     1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
+     1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
+     2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
+     2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
+     2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
+     2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
+     2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
+     2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
+     2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
+     2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
+     2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
+     2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
+     2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
+     2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
+     2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
+     2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
+     2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
+     2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
+     3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
+     3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
+     3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
+     3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
+     3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
+     3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
+     3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
+     3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
+     3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
+     3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
+     3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
+     3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
+     3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
+     3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
+     3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
+     3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
+     3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
+     4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
+     4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
+     4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
+     4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
+     4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
+     4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
+     4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
+     4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
+     4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
+     4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
+     4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
+     4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
+     4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
+     4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
+     4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
+     4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
+     4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
+//
+
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 75ccbc3..140452f 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -7,6 +7,16 @@
 
 class TestGenBank:
 
+    def setUp(self):
+        self.testdir = os.path.dirname(__file__)
+        self.sc = os.path.join(self.testdir, 'data', 'U49845.gb')
+
+    def test_sequence_length(self):
+        gb = propex.GenBank(self.sc)
+        assert len(gb[0].seq) == 5028
+
+class TestGenBankLocal:
+
     def setUp(self):
         self.testdir = os.path.dirname(__file__)
         self.genbankdir = os.path.join(os.path.expanduser('~'), 'Dropbox',

From a170badac60d10b9561dba2c5bd32381f38fcec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 21:33:03 +0200
Subject: [PATCH 08/40] Add all features to the genbank locus

---
 propex/genbank.py            | 24 +++++++++++++++---------
 propex/tests/test_genbank.py | 16 ++++++++++++++--
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index ad721f5..6de0457 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -289,6 +289,11 @@ def features_at_location(self, location):
         """
         features = []
         for feat in self.features.iterkeys():
+            # Skip the source feature since it always will
+            # overlap (assuming that the feature location
+            # is within the sequence boundaries).
+            if feat == 'source':
+                continue
             for feature in self.features[feat]:
                 if feature.location.overlaps(location):
                     features.append(feature)
@@ -452,20 +457,21 @@ def __getitem__(self, index):
         locus_index = self.index[index]
         locus_offset = locus_index['offset']
         origin_offset = locus_index['ORIGIN']
-        features = {'CDS': []}
+        features = collections.defaultdict(list)
         with open(self.filename) as f:
-            # Get the CDSs
-            if 'CDS' in locus_index:
-                for cds in locus_index['CDS']:
-                    f.seek(cds['offset'])
-                    cds_string = f.readline()
+            for ftype in self.features:
+                if ftype not in locus_index:
+                    continue
+                for feature in locus_index[ftype]:
+                    f.seek(feature['offset'])
+                    feature_string = f.readline()
                     line = f.readline()
                     while line[5] == ' ':
-                        cds_string += line
+                        feature_string += line
                         line = f.readline()
-                    features['CDS'].append(
+                    features[ftype].append(
                         GenBankFeature.from_string(locus_index['name'],
-                            cds_string))
+                            feature_string))
 
             # Get the sequence
             f.seek(origin_offset)
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 140452f..06b46c8 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -10,10 +10,22 @@ class TestGenBank:
     def setUp(self):
         self.testdir = os.path.dirname(__file__)
         self.sc = os.path.join(self.testdir, 'data', 'U49845.gb')
+        self.gb = propex.GenBank(self.sc)
 
     def test_sequence_length(self):
-        gb = propex.GenBank(self.sc)
-        assert len(gb[0].seq) == 5028
+        assert len(self.gb[0].seq) == 5028
+
+    def test_mRNA(self):
+        assert len(self.gb[0].features['mRNA']) == 3
+
+    def test_next_downstream(self):
+        locus = self.gb[0]
+        gbf = locus.features['mRNA'][0]
+        assert gbf is not None
+        assert str(gbf.location) == '<1..>206'
+        next = locus.next_downstream(gbf)
+        assert next is not None
+        assert str(next.location) == '<687..>3158'
 
 class TestGenBankLocal:
 

From 608d58b2170482200e32c858c4b257ddcd7b93b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 12 Apr 2015 20:21:32 +0200
Subject: [PATCH 09/40] Add safeguard for empty lines

---
 propex/genbank.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/propex/genbank.py b/propex/genbank.py
index a4f0ce8..bf03c40 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -406,6 +406,8 @@ def _index(self):
                 if lineno == 0 and not line.strip().startswith('LOCUS'):
                     raise ValueError('does not look like a GenBank file: {0}' \
                         .format(self.filename))
+                if len(line.strip()) == 0:
+                    continue
                 if line.strip().split()[0] == 'LOCUS':
                     current_locus = line.strip().split()[1]
                     indexdicts.append({})

From 8ebc84b04fe3e7114655239f31e4f1655be93907 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 14 Apr 2015 20:58:07 +0200
Subject: [PATCH 10/40] Fix empty line bug

---
 propex/genbank.py           | 1 +
 propex/tests/data/U49845.gb | 1 +
 2 files changed, 2 insertions(+)

diff --git a/propex/genbank.py b/propex/genbank.py
index 6de0457..55d9b77 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -417,6 +417,7 @@ def _index(self):
                     raise ParsingError('does not look like a GenBank file: {0}' \
                         .format(self.filename))
                 if len(line.strip()) == 0:
+                    offset += len(line)
                     continue
                 if line.strip().split()[0] == 'LOCUS':
                     current_locus = line.strip().split()[1]
diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb
index c56f129..c053364 100644
--- a/propex/tests/data/U49845.gb
+++ b/propex/tests/data/U49845.gb
@@ -62,6 +62,7 @@ FEATURES             Location/Qualifiers
                      YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
                      HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
                      VDFSNKSNVNVGQVKDIHGRIPEML"
+                     
      gene            complement(<3300..>4037)
                      /gene="REV7"
      mRNA            complement(<3300..>4037)

From 64c1c2cb81e8ee4d61ab87838a7dac2f4095a0ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 14 Apr 2015 20:58:07 +0200
Subject: [PATCH 11/40] Fix empty line bug

Conflicts:
	propex/tests/data/U49845.gb
---
 propex/genbank.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/propex/genbank.py b/propex/genbank.py
index bf03c40..cc24b64 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -407,6 +407,7 @@ def _index(self):
                     raise ValueError('does not look like a GenBank file: {0}' \
                         .format(self.filename))
                 if len(line.strip()) == 0:
+                    offset += len(line)
                     continue
                 if line.strip().split()[0] == 'LOCUS':
                     current_locus = line.strip().split()[1]

From f2846be56478f2bad9cbcedf3a6913a75c8cb333 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Thu, 16 Apr 2015 20:39:23 +0200
Subject: [PATCH 12/40] Allow directories in input directory

---
 propex/tests/test_genbank.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 06b46c8..257b8f3 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -59,7 +59,8 @@ def test_iteration(self):
 
     def test_load_directory(self):
         gbs = [propex.GenBank(os.path.join(self.genbankdir, x)) \
-            for x in os.listdir(self.genbankdir)]
+            for x in os.listdir(self.genbankdir) \
+            if os.path.isfile(os.path.join(self.genbankdir, x))]
 
     def test_features_at_location(self):
         gb = propex.GenBank(self.lmg718)

From afbe591ddb8dcd80cdc0683ca10ec734974c7bcb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Fri, 17 Apr 2015 15:41:31 +0200
Subject: [PATCH 13/40] Parse GenBank header

The results are not yet used, but should go into the GenBankLocus
class.
---
 propex/genbank.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/propex/genbank.py b/propex/genbank.py
index 55d9b77..68f8a7f 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -449,6 +449,78 @@ def _index(self):
 
         return indexdicts, features
 
+    def _parse_header(self, hstring):
+        """Parse a GenBank header string into a nested dictionary.
+        """
+        head_data = collections.OrderedDict()
+
+        header_lines = iter(hstring.splitlines(True))
+
+        line = header_lines.next()
+
+        header = line.strip().split()
+
+        name = header[1]
+        length = ' '.join(header[2:4])
+        molecule = header[4]
+        molecule_type = header[5]
+
+        if len(header) == 8:
+            division = header[6]
+            date = header[7]
+        elif len(header) == 7:
+            division = ''
+            date = header[6]
+
+        head_data['LOCUS'] = {
+            'name': name,
+            'length': length,
+            'molecule': molecule,
+            'molecule_type': molecule_type,
+            'genbank_division': division,
+            'modification_data': date
+        }
+
+        last_key = None
+        line = header_lines.next()
+        while True:
+            if line[0] != ' ':
+                key = line[:11].strip()
+                last_key = key
+                if key in head_data:
+                    old_entry = head_data[key]
+                    if not isinstance(old_entry, list):
+                        head_data[key] = [(old_entry,
+                            collections.OrderedDict())]
+                    head_data[key].append((line[11:].strip(),
+                        collections.OrderedDict()))
+                else:
+                    head_data[key] = line[11:].strip()
+            elif len(line[:11].strip()) != 0:
+                sub_key = line[:11].strip()
+                old_entry = head_data[last_key]
+                if not isinstance(old_entry, list):
+                    head_data[last_key] = [(old_entry,
+                        collections.OrderedDict())]
+                old_entry = head_data[key][-1]
+                old_entry[1][sub_key] = line[11:].strip()
+                head_data[key][-1] = (old_entry[0], old_entry[1])
+            else:
+                if isinstance(head_data[last_key], list):
+                    sub_key = head_data[last_key][-1][1].keys()[-1]
+                    head_data[last_key][-1][1][sub_key] = \
+                        '\n'.join([head_data[last_key][-1][1][sub_key],
+                            line.strip()])
+                else:
+                    head_data[last_key] = '\n'.join([head_data[last_key],
+                        line.strip()])
+            try:
+                line = header_lines.next()
+            except StopIteration:
+                break
+
+        return head_data
+
     def __getitem__(self, index):
         """Get a specific GenBankLocus object.
 
@@ -459,7 +531,20 @@ def __getitem__(self, index):
         locus_offset = locus_index['offset']
         origin_offset = locus_index['ORIGIN']
         features = collections.defaultdict(list)
+
+        headstring = ''
+
         with open(self.filename) as f:
+            f.seek(locus_offset)
+            headstring += f.readline()
+
+            line = f.readline()
+            while not line.startswith('FEATURES'):
+                headstring += line
+                line = f.readline()
+
+            head_data = self._parse_header(headstring)
+
             for ftype in self.features:
                 if ftype not in locus_index:
                     continue

From 731426d39f0b47a502929ad75d2fb19f9848f5e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Fri, 17 Apr 2015 15:42:48 +0200
Subject: [PATCH 14/40] Shorten long lines

---
 propex/genbank.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 68f8a7f..70f08e5 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -112,7 +112,8 @@ def _parse(self):
                 re_name = name
                 regex = r
         if re_name is None:
-            raise LocationError('unknown location string: {0}'.format(self.locstring))
+            raise LocationError('unknown location string: {0}' \
+                .format(self.locstring))
 
         if re_name == 'single':
             start = end = int(regex.match(locstring).group(1))
@@ -134,7 +135,8 @@ def min_distance(self, other):
         if self.overlaps(other):
             return 0
         else:
-            return min(abs(self.start - other.end), abs(self.end - other.start))
+            return min(abs(self.start - other.end),
+                abs(self.end - other.start))
 
     @classmethod
     def from_int(cls, start, end=None, strand='+'):
@@ -165,7 +167,8 @@ class GenBankFeature(object):
 
     **Class attributes:**
         * **feature_type**: a string with the feature key.
-        * **location**: a Location object representing the location of the feature.
+        * **location**: a Location object representing the location of
+                        the feature.
         * **qualifiers**: a dictionary of qualifiers of the feature.
 
     :param locus: the name of the locus that the feature belongs to.
@@ -181,7 +184,8 @@ def __init__(self, locus, feature_type, location, qualifiers=None):
         Args:
             locus: the locus that the feature belongs to.
             feature_type: the key of the feature, e.g. 'CDS' or 'tRNA'.
-            location: a Location object representing the location of the feature
+            location: a Location object representing the location of the
+                      feature
             qualifiers: a dictionary of qualifiers with the qualifier names
                         as keys and the qualifier values as values.
         """
@@ -362,7 +366,8 @@ def _neighbor(self, feature, downstream=True):
                 break
 
 
-        if findex is None or findex >= len(self.features[ftype]) or findex < 0:
+        if findex is None or findex >= len(self.features[ftype]) or \
+                findex < 0:
             return None
 
         # Make sure the feature is on the same strand
@@ -414,8 +419,9 @@ def _index(self):
             offset = 0
             for lineno, line in enumerate(f):
                 if lineno == 0 and not line.strip().startswith('LOCUS'):
-                    raise ParsingError('does not look like a GenBank file: {0}' \
-                        .format(self.filename))
+                    raise ParsingError(
+                        'does not look like a GenBank file: {0}' \
+                            .format(self.filename))
                 if len(line.strip()) == 0:
                     offset += len(line)
                     continue

From 39527343bfa08214a2ae81e513d1b896615f6ee2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Fri, 17 Apr 2015 15:56:49 +0200
Subject: [PATCH 15/40] Add header to GenBankLocus + test

---
 propex/genbank.py            |  9 +++++++--
 propex/tests/test_genbank.py | 15 +++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 70f08e5..1117d7f 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -267,7 +267,7 @@ class GenBankLocus(object):
     :param features: a dictionary containing features of the locus.
     """
 
-    def __init__(self, name, seq, features=None):
+    def __init__(self, name, seq, features=None, header=None):
         """GenBankLocus constructor.
 
         Args:
@@ -281,6 +281,10 @@ def __init__(self, name, seq, features=None):
             self.features = {}
         else:
             self.features = features
+        if header is None:
+            self.header = {}
+        else:
+            self.header = header
 
     def features_at_location(self, location):
         """Get features at a location.
@@ -573,7 +577,8 @@ def __getitem__(self, index):
                 line = f.readline()
                 seq += ''.join(line.strip().split()[1:])
 
-        return GenBankLocus(locus_index['name'], Sequence(seq), features)
+        return GenBankLocus(locus_index['name'], Sequence(seq), features,
+            head_data)
 
     def get_locus_from_name(self, name):
         """Get a specific GenBankLocus object from the locus name.
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 257b8f3..ed8896c 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -27,6 +27,21 @@ def test_next_downstream(self):
         assert next is not None
         assert str(next.location) == '<687..>3158'
 
+    def test_header(self):
+        header = self.gb[0].header
+
+        assert all(x in header for x in ['LOCUS', 'DEFINITION',
+            'ACCESSION', 'VERSION', 'KEYWORDS', 'SOURCE', 'REFERENCE'])
+
+        assert header['LOCUS']['molecule'] == 'DNA'
+
+        assert header['ACCESSION'] == 'U49845'
+
+        assert len(header['REFERENCE']) == 2
+        assert header['REFERENCE'][0][0] == '1  (bases 1 to 5028)'
+        assert all(x in header['REFERENCE'][0][1] for x in ['AUTHORS',
+            'TITLE', 'JOURNAL', 'PUBMED'])
+
 class TestGenBankLocal:
 
     def setUp(self):

From 2d7eef0639d721495a0d0d3045a330e83c29de63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20M=C3=A4hler?= <niklas.mahler@gmail.com>
Date: Sat, 18 Apr 2015 22:46:24 +0200
Subject: [PATCH 16/40] Handle join locations

The genbank module is now able to handle join locations, including
join locations that are wrapped with complement. The comparison
operator is currently a bit misleading: it compares the raw location
strings, so the locations join(complement(...),complement(...)) and
complement(join(...,...)) will not be equal. This will probably be
fixed in the future.
---
 propex/genbank.py            | 141 +++++++++++++++++++++++++++++++++--
 propex/tests/test_genbank.py |  86 ++++++++++++++++++++-
 2 files changed, 219 insertions(+), 8 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 1117d7f..a34f78f 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -6,6 +6,7 @@
 """
 
 import collections
+import itertools
 import re
 
 from propex.sequence import Sequence
@@ -13,6 +14,120 @@
 class LocationError(Exception):
     pass
 
+def parse_location(locstring):
+    """Parse a location string and return a :py:class:`.Location`
+    or :py:class:`.JoinLocation` object.
+
+    :param locstring: a GenBank location string.
+    :raises: :py:class:`.LocationError` if parsing fails.
+    """
+    if locstring.startswith('join') or \
+            locstring.startswith('complement(join'):
+        return JoinLocation(locstring)
+    else:
+        return Location(locstring)
+
+class JoinLocation(object):
+
+    """Represent a "join" GenBank feature location.
+
+    For more information on locations, see
+    http://www.insdc.org/files/feature_table.html#3.4
+
+    For information on how locations work, see :py:class:`.Location`.
+
+    :param locstring: a GenBank location string.
+    :raises: :py:class:`.LocationError` if parsing fails.
+    """
+
+    def __init__(self, locstring):
+        self.locstring = locstring
+        self.loctype = 'join'
+        self.locations, self.start, self.end, self.is_complement = \
+            self._get_locations()
+
+    def _get_locations(self):
+        complement_wrap = False
+        if self.locstring.startswith('complement'):
+            compmatch = re.match(r'complement\((join\(.+\))\)', self.locstring)
+            if compmatch is None:
+                raise LocationError('invalid join location: {0}' \
+                    .format(self.locstring))
+            locstring = compmatch.group(1)
+            complement_wrap = True
+        else:
+            locstring = self.locstring
+
+        match = re.match(r'^join\((.+)\)$', locstring)
+        if match is None:
+            raise LocationError('invalid join location: {0}' \
+                .format(self.locstring))
+        locstring = match.group(1)
+        locations = [Location(x.strip()) for x in locstring.split(',')]
+        if len(set(x.is_complement for x in locations)) != 1:
+            raise LocationError('joint location is located on both strands')
+        start = min(x.start for x in locations)
+        end = max(x.end for x in locations)
+        if not complement_wrap:
+            is_complement = locations[0].is_complement
+        else:
+            is_complement = True
+        return locations, start, end, is_complement
+
+    def overlaps(self, other):
+        """Test whether the location overlaps with another location.
+
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
+        :returns: True if the locations overlap with at least one base,
+            otherwise False.
+        """
+        if isinstance(other, JoinLocation):
+            for loc1, loc2 in itertools.product(self.locations,
+                    other.locations):
+                if loc1.overlaps(loc2):
+                    return True
+        else:
+            for loc in self.locations:
+                if loc.overlaps(other):
+                    return True
+        return False
+
+    def min_distance(self, other):
+        """Get the minimum distance to another location.
+
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
+        :returns: the minimum distance between the locations.
+        """
+        min_acc = []
+        if isinstance(other, JoinLocation):
+            for loc1, loc2 in itertools.product(self.locations,
+                    other.locations):
+                d = loc1.min_distance(loc2)
+                if d == 0:
+                    return 0
+                min_acc.append(d)
+        else:
+            for loc in self.locations:
+                d = loc.min_distance(other)
+                if d == 0:
+                    return 0
+                min_acc.append(d)
+        return min(min_acc)
+
+    def __str__(self):
+        return self.locstring
+
+    def __eq__(self, other):
+        return str(self) == str(other)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        return '<JoinLocation: {0}>'.format(repr(self.locstring))
+
 class Location(object):
 
     """Represent a GenBank feature location.
@@ -47,6 +162,7 @@ class Location(object):
           the complement of the sequence.
 
     :param locstring: a GenBank location string.
+    :raises: :py:class:`.LocationError` if parsing fails.
     """
 
     #: Regular expression for finding complement locations.
@@ -77,7 +193,7 @@ def _regex_dict(self):
 
         Returns:
             a dictionary where the keys correspond to the type of
-            location and the values are the correpsponding regular
+            location and the values are the corresponding regular
             expression.
         """
         return {
@@ -122,16 +238,27 @@ def _parse(self):
 
         return re_name, start - 1, end - 1, is_complement
 
-    def overlaps(self, location):
+    def overlaps(self, other):
         """Test whether the location overlaps with another location.
 
-        :param location: a Location object.
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
         :returns: True if the locations overlap with at least one base,
             otherwise False.
         """
-        return self.start <= location.end and location.start <= self.end
+        if isinstance(other, JoinLocation):
+            return other.overlaps(self)
+        return self.start <= other.end and other.start <= self.end
 
     def min_distance(self, other):
+        """Get the minimum distance to another location.
+
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
+        :returns: the minimum distance between the locations.
+        """
+        if isinstance(other, JoinLocation):
+            return other.min_distance(self)
         if self.overlaps(other):
             return 0
         else:
@@ -238,14 +365,14 @@ def from_string(cls, locus, feature_string):
                 value = qualifiers[-1][1] + line.strip('"')
                 qualifiers[-1] = (key, value)
 
-        return cls(locus, ftype, Location(location), dict(qualifiers))
+        return cls(locus, ftype, parse_location(location), dict(qualifiers))
 
     def get_qualifier(self, qualifier_name):
         """Get a feature qualifier.
 
         :param qualifier_name: a string representing a qualifier.
         :returns: the value of the qualifier.
-        :raises: KeyError if the feature does not have a qualifier called
+        :raises: :py:class:`KeyError` if the feature does not have a qualifier called
                       ``qualifier_name``.
         """
         if qualifier_name not in self.qualifiers:
@@ -444,7 +571,7 @@ def _index(self):
                         indexdicts[-1][feature] = []
                     indexdicts[-1][feature].append({
                         'offset': offset,
-                        'location': Location(line.strip().split()[1])
+                        'location': parse_location(line.strip().split()[1])
                     })
                 if line.startswith('FEATURES'):
                     in_features = True
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index ed8896c..824c46e 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -3,7 +3,7 @@
 import os
 
 import propex
-from propex.genbank import Location
+from propex.genbank import Location, JoinLocation
 
 class TestGenBank:
 
@@ -203,6 +203,20 @@ def test_parse_feature(self):
         assert gbf_inference == ['ab initio prediction:Prodigal:2.60',
                                  'similar to AA sequence:UniProtKB:Q9RVE0']
 
+    def test_feature_join_location(self):
+        feature = '''     CDS             join(52625..53704,54000..55000)
+                     /gene="recF"
+                     /locus_tag="LMG718_02589"
+                     /inference="ab initio prediction:Prodigal:2.60"
+                     /inference="similar to AA sequence:UniProtKB:Q9RVE0"
+                     /codon_start=1
+                     /transl_table=11'''
+        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+
+        assert gbf.feature_type == 'CDS'
+        assert gbf.get_qualifier('gene') == 'recF'
+        assert len(gbf.get_qualifier('inference')) == 2
+
     def test_empty_qualifiers(self):
         feature = '''     CDS             complement(52625..53704)
                      /gene="recF"
@@ -408,3 +422,73 @@ def test_from_int(self):
         assert str(Location.from_int(100, 200)) == '100..200'
         assert str(Location.from_int(100, 200, '-')) == 'complement(100..200)'
         assert str(Location.from_int(100, strand='-')) == 'complement(100)'
+
+class TestJoinLocation:
+
+    def setUp(self):
+        self.jloc1 = JoinLocation('join(1..200,300..400)')
+        self.jloc2 = JoinLocation('join(1..100, 200..300)')
+        self.jloc3 = JoinLocation('join(150..175,180..190,310..320)')
+        self.jloc4 = JoinLocation('join(complement(100),complement(200))')
+
+    def test_instance(self):
+        assert isinstance(self.jloc1, JoinLocation)
+        assert isinstance(self.jloc2, JoinLocation)
+
+    @raises(propex.genbank.LocationError)
+    def test_invalid_location(self):
+        jloc = JoinLocation('join(1..200,300..400')
+
+    @raises(propex.genbank.LocationError)
+    def test_invalid_strands(self):
+        jloc = JoinLocation('join(complement(100),200)')
+
+    def test_start(self):
+        # Remember, 0-indexed
+        assert self.jloc1.start == 0
+        assert self.jloc2.start == 0
+        assert self.jloc3.start == 149
+
+    def test_end(self):
+        assert self.jloc1.end == 399
+        assert self.jloc2.end == 299
+        assert self.jloc3.end == 319
+
+    def test_loctype(self):
+        assert self.jloc1.loctype == 'join'
+        assert self.jloc2.loctype == 'join'
+        assert self.jloc3.loctype == 'join'
+
+    def test_complement(self):
+        assert not self.jloc1.is_complement
+        assert self.jloc4.is_complement
+
+    def test_complement_wrap(self):
+        jloc = JoinLocation('complement(join(380844..381260,382591..382872))')
+        assert isinstance(jloc, JoinLocation)
+        assert jloc.is_complement
+        assert jloc.start == 380843
+        assert jloc.end == 382871
+
+    def test_overlap(self):
+        assert self.jloc1.overlaps(self.jloc2)
+        assert self.jloc1.overlaps(Location('200..300'))
+        assert self.jloc1.overlaps(Location('150..250'))
+        assert not self.jloc3.overlaps(self.jloc2)
+        assert self.jloc3.overlaps(self.jloc1)
+        assert Location('150..250').overlaps(self.jloc1)
+
+    def test_str(self):
+        assert str(self.jloc1) == 'join(1..200,300..400)'
+        assert str(self.jloc2) == 'join(1..100, 200..300)'
+
+    def test_repr(self):
+        assert repr(self.jloc1) == '<JoinLocation: \'join(1..200,300..400)\'>'
+        assert repr(self.jloc2) == '<JoinLocation: \'join(1..100, 200..300)\'>'
+
+    def test_min_distance(self):
+        assert self.jloc1.min_distance(self.jloc2) == 0
+        assert self.jloc1.min_distance(self.jloc3) == 0
+        assert self.jloc2.min_distance(self.jloc3) == 10
+        assert self.jloc1.min_distance(Location('250')) == 50
+        assert Location('250').min_distance(self.jloc1) == 50

From a969830335830de7e9d3c8c0a21bee160dbdacbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20M=C3=A4hler?= <niklas.mahler@gmail.com>
Date: Sat, 18 Apr 2015 22:46:24 +0200
Subject: [PATCH 17/40] Handle join locations

The genbank module is now able to handle join locations, including
join locations that are wrapped with complement. The comparison
operator is currently a bit misleading: it compares the raw location
strings, so the locations join(complement(...),complement(...)) and
complement(join(...,...)) will not be equal. This will probably be
fixed in the future.
---
 propex/genbank.py            | 141 +++++++++++++++++++++++++++++++++--
 propex/tests/test_genbank.py |  90 +++++++++++++++++++++-
 2 files changed, 223 insertions(+), 8 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 1117d7f..1a2e899 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -6,6 +6,7 @@
 """
 
 import collections
+import itertools
 import re
 
 from propex.sequence import Sequence
@@ -13,6 +14,120 @@
 class LocationError(Exception):
     pass
 
+def parse_location(locstring):
+    """Parse a location string and return a :py:class:`.Location`
+    or :py:class:`.JoinLocation` object.
+
+    :param locstring: a GenBank location string.
+    :raises: :py:class:`.LocationError` if parsing fails.
+    """
+    if locstring.startswith('join') or \
+            locstring.startswith('complement(join'):
+        return JoinLocation(locstring)
+    else:
+        return Location(locstring)
+
+class JoinLocation(object):
+
+    """Represent a "join" GenBank feature location.
+
+    For more information on locations, see
+    http://www.insdc.org/files/feature_table.html#3.4
+
+    For information on how locations work, see :py:class:`.Location`.
+
+    :param locstring: a GenBank location string.
+    :raises: :py:class:`.LocationError` if parsing fails.
+    """
+
+    def __init__(self, locstring):
+        self.locstring = locstring
+        self.loctype = 'join'
+        self.locations, self.start, self.end, self.is_complement = \
+            self._get_locations()
+
+    def _get_locations(self):
+        complement_wrap = False
+        if self.locstring.startswith('complement'):
+            compmatch = re.match(r'^complement\((join\(.+\))\)$', self.locstring)
+            if compmatch is None:
+                raise LocationError('invalid join location: {0}' \
+                    .format(self.locstring))
+            locstring = compmatch.group(1)
+            complement_wrap = True
+        else:
+            locstring = self.locstring
+
+        match = re.match(r'^join\((.+)\)$', locstring)
+        if match is None:
+            raise LocationError('invalid join location: {0}' \
+                .format(self.locstring))
+        locstring = match.group(1)
+        locations = [Location(x.strip()) for x in locstring.split(',')]
+        if len(set(x.is_complement for x in locations)) != 1:
+            raise LocationError('joint location is located on both strands')
+        start = min(x.start for x in locations)
+        end = max(x.end for x in locations)
+        if not complement_wrap:
+            is_complement = locations[0].is_complement
+        else:
+            is_complement = True
+        return locations, start, end, is_complement
+
+    def overlaps(self, other):
+        """Test whether the location overlaps with another location.
+
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
+        :returns: True if the locations overlap with at least one base,
+            otherwise False.
+        """
+        if isinstance(other, JoinLocation):
+            for loc1, loc2 in itertools.product(self.locations,
+                    other.locations):
+                if loc1.overlaps(loc2):
+                    return True
+        else:
+            for loc in self.locations:
+                if loc.overlaps(other):
+                    return True
+        return False
+
+    def min_distance(self, other):
+        """Get the minimum distance to another location.
+
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
+        :returns: the minimum distance between the locations.
+        """
+        min_acc = []
+        if isinstance(other, JoinLocation):
+            for loc1, loc2 in itertools.product(self.locations,
+                    other.locations):
+                d = loc1.min_distance(loc2)
+                if d == 0:
+                    return 0
+                min_acc.append(d)
+        else:
+            for loc in self.locations:
+                d = loc.min_distance(other)
+                if d == 0:
+                    return 0
+                min_acc.append(d)
+        return min(min_acc)
+
+    def __str__(self):
+        return self.locstring
+
+    def __eq__(self, other):
+        return str(self) == str(other)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        return '<JoinLocation: {0}>'.format(repr(self.locstring))
+
 class Location(object):
 
     """Represent a GenBank feature location.
@@ -47,6 +162,7 @@ class Location(object):
           the complement of the sequence.
 
     :param locstring: a GenBank location string.
+    :raises: :py:class:`.LocationError` if parsing fails.
     """
 
     #: Regular expression for finding complement locations.
@@ -77,7 +193,7 @@ def _regex_dict(self):
 
         Returns:
             a dictionary where the keys correspond to the type of
-            location and the values are the correpsponding regular
+            location and the values are the corresponding regular
             expression.
         """
         return {
@@ -122,16 +238,27 @@ def _parse(self):
 
         return re_name, start - 1, end - 1, is_complement
 
-    def overlaps(self, location):
+    def overlaps(self, other):
         """Test whether the location overlaps with another location.
 
-        :param location: a Location object.
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
         :returns: True if the locations overlap with at least one base,
             otherwise False.
         """
-        return self.start <= location.end and location.start <= self.end
+        if isinstance(other, JoinLocation):
+            return other.overlaps(self)
+        return self.start <= other.end and other.start <= self.end
 
     def min_distance(self, other):
+        """Get the minimum distance to another location.
+
+        :param other: a :py:class:`.Location` or :py:class:`.JoinLocation`
+            object.
+        :returns: the minimum distance between the locations.
+        """
+        if isinstance(other, JoinLocation):
+            return other.min_distance(self)
         if self.overlaps(other):
             return 0
         else:
@@ -238,14 +365,14 @@ def from_string(cls, locus, feature_string):
                 value = qualifiers[-1][1] + line.strip('"')
                 qualifiers[-1] = (key, value)
 
-        return cls(locus, ftype, Location(location), dict(qualifiers))
+        return cls(locus, ftype, parse_location(location), dict(qualifiers))
 
     def get_qualifier(self, qualifier_name):
         """Get a feature qualifier.
 
         :param qualifier_name: a string representing a qualifier.
         :returns: the value of the qualifier.
-        :raises: KeyError if the feature does not have a qualifier called
+        :raises: :py:class:`KeyError` if the feature does not have a qualifier called
                       ``qualifier_name``.
         """
         if qualifier_name not in self.qualifiers:
@@ -444,7 +571,7 @@ def _index(self):
                         indexdicts[-1][feature] = []
                     indexdicts[-1][feature].append({
                         'offset': offset,
-                        'location': Location(line.strip().split()[1])
+                        'location': parse_location(line.strip().split()[1])
                     })
                 if line.startswith('FEATURES'):
                     in_features = True
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index ed8896c..9f24e5e 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -3,7 +3,7 @@
 import os
 
 import propex
-from propex.genbank import Location
+from propex.genbank import Location, JoinLocation
 
 class TestGenBank:
 
@@ -203,6 +203,20 @@ def test_parse_feature(self):
         assert gbf_inference == ['ab initio prediction:Prodigal:2.60',
                                  'similar to AA sequence:UniProtKB:Q9RVE0']
 
+    def test_feature_join_location(self):
+        feature = '''     CDS             join(52625..53704,54000..55000)
+                     /gene="recF"
+                     /locus_tag="LMG718_02589"
+                     /inference="ab initio prediction:Prodigal:2.60"
+                     /inference="similar to AA sequence:UniProtKB:Q9RVE0"
+                     /codon_start=1
+                     /transl_table=11'''
+        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+
+        assert gbf.feature_type == 'CDS'
+        assert gbf.get_qualifier('gene') == 'recF'
+        assert len(gbf.get_qualifier('inference')) == 2
+
     def test_empty_qualifiers(self):
         feature = '''     CDS             complement(52625..53704)
                      /gene="recF"
@@ -408,3 +422,77 @@ def test_from_int(self):
         assert str(Location.from_int(100, 200)) == '100..200'
         assert str(Location.from_int(100, 200, '-')) == 'complement(100..200)'
         assert str(Location.from_int(100, strand='-')) == 'complement(100)'
+
+class TestJoinLocation:
+
+    def setUp(self):
+        self.jloc1 = JoinLocation('join(1..200,300..400)')
+        self.jloc2 = JoinLocation('join(1..100, 200..300)')
+        self.jloc3 = JoinLocation('join(150..175,180..190,310..320)')
+        self.jloc4 = JoinLocation('join(complement(100),complement(200))')
+
+    def test_instance(self):
+        assert isinstance(self.jloc1, JoinLocation)
+        assert isinstance(self.jloc2, JoinLocation)
+
+    @raises(propex.genbank.LocationError)
+    def test_invalid_location(self):
+        jloc = JoinLocation('join(1..200,300..400')
+
+    @raises(propex.genbank.LocationError)
+    def test_invalid_strands(self):
+        jloc = JoinLocation('join(complement(100),200)')
+
+    @raises(propex.genbank.LocationError)
+    def test_messed_up_location(self):
+        jloc = JoinLocation('complement(join(687..700,800..900,1000..1100))mRNA            <687..>3158')
+
+    def test_start(self):
+        # Remember, 0-indexed
+        assert self.jloc1.start == 0
+        assert self.jloc2.start == 0
+        assert self.jloc3.start == 149
+
+    def test_end(self):
+        assert self.jloc1.end == 399
+        assert self.jloc2.end == 299
+        assert self.jloc3.end == 319
+
+    def test_loctype(self):
+        assert self.jloc1.loctype == 'join'
+        assert self.jloc2.loctype == 'join'
+        assert self.jloc3.loctype == 'join'
+
+    def test_complement(self):
+        assert not self.jloc1.is_complement
+        assert self.jloc4.is_complement
+
+    def test_complement_wrap(self):
+        jloc = JoinLocation('complement(join(380844..381260,382591..382872))')
+        assert isinstance(jloc, JoinLocation)
+        assert jloc.is_complement
+        assert jloc.start == 380843
+        assert jloc.end == 382871
+
+    def test_overlap(self):
+        assert self.jloc1.overlaps(self.jloc2)
+        assert self.jloc1.overlaps(Location('200..300'))
+        assert self.jloc1.overlaps(Location('150..250'))
+        assert not self.jloc3.overlaps(self.jloc2)
+        assert self.jloc3.overlaps(self.jloc1)
+        assert Location('150..250').overlaps(self.jloc1)
+
+    def test_str(self):
+        assert str(self.jloc1) == 'join(1..200,300..400)'
+        assert str(self.jloc2) == 'join(1..100, 200..300)'
+
+    def test_repr(self):
+        assert repr(self.jloc1) == '<JoinLocation: \'join(1..200,300..400)\'>'
+        assert repr(self.jloc2) == '<JoinLocation: \'join(1..100, 200..300)\'>'
+
+    def test_min_distance(self):
+        assert self.jloc1.min_distance(self.jloc2) == 0
+        assert self.jloc1.min_distance(self.jloc3) == 0
+        assert self.jloc2.min_distance(self.jloc3) == 10
+        assert self.jloc1.min_distance(Location('250')) == 50
+        assert Location('250').min_distance(self.jloc1) == 50

From 32a8dfcd4f97a562408ae8f114f5e45e86e1cb09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20M=C3=A4hler?= <niklas.mahler@gmail.com>
Date: Sun, 19 Apr 2015 15:30:14 +0200
Subject: [PATCH 18/40] Handle multiline location strings

---
 propex/genbank.py            | 29 +++++++++++++++++++++++++++--
 propex/tests/data/U49845.gb  |  6 ++++++
 propex/tests/test_genbank.py | 13 +++++++++++++
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 1a2e899..0249cc0 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -339,10 +339,20 @@ def from_string(cls, locus, feature_string):
         """
         lines = [x.strip() for x in feature_string.splitlines()]
         ftype, location = lines[0].strip().split()
+        # Multiline location string
+        i = 1
+        line = lines[i]
+        while not line.startswith('/'):
+            i += 1
+            location += line
+            try:
+                line = lines[i]
+            except IndexError:
+                break
 
         qualifiers = []
 
-        for line in lines[1:]:
+        for line in lines[i:]:
             if line.startswith('/'):
                 # New qualifier
                 i = line.find('=')
@@ -569,10 +579,25 @@ def _index(self):
                     features.add(feature)
                     if feature not in indexdicts[-1]:
                         indexdicts[-1][feature] = []
+
+                    locstring = line.strip().split()[1]
+
+                    nl = f.next()
+                    loc_offset = offset + len(line)
+                    while len(nl[:21].strip()) == 0 and not nl.strip().startswith('/'):
+                        locstring += nl.strip()
+                        loc_offset += len(nl)
+                        nl = f.next()
+
+                    f.seek(loc_offset)
+
                     indexdicts[-1][feature].append({
                         'offset': offset,
-                        'location': parse_location(line.strip().split()[1])
+                        'location': parse_location(locstring)
                     })
+
+                    offset = loc_offset
+                    continue
                 if line.startswith('FEATURES'):
                     in_features = True
                 offset += len(line)
diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb
index c053364..a523cdd 100644
--- a/propex/tests/data/U49845.gb
+++ b/propex/tests/data/U49845.gb
@@ -37,6 +37,12 @@ FEATURES             Location/Qualifiers
                      AEVLLRVDNIIRARPRTANRQHM"
      gene            <687..>3158
                      /gene="AXL2"
+     gene            complement(join(687..700,800..900,
+                     1000..1100))
+     gene            complement(join(687..700,800..900,
+                     1000..1100))
+                     /gene="testGene"
+                     /product="blargh"
      mRNA            <687..>3158
                      /gene="AXL2"
                      /product="Axl2p"
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index 9f24e5e..c4f501d 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -217,6 +217,19 @@ def test_feature_join_location(self):
         assert gbf.get_qualifier('gene') == 'recF'
         assert len(gbf.get_qualifier('inference')) == 2
 
+    def test_multiline_location(self):
+        feature = '''     CDS             complement(join(1294426..1294992,1294992..1295141,
+                     1295140..1295322))
+                     /gene="insZ"
+                     /locus_tag="b4573"'''
+        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+
+        assert gbf.feature_type == 'CDS'
+        assert isinstance(gbf.location, JoinLocation)
+        assert len(gbf.location.locations) == 3
+        assert gbf.get_qualifier('gene') == 'insZ'
+        assert gbf.get_qualifier('locus_tag') == 'b4573'
+
     def test_empty_qualifiers(self):
         feature = '''     CDS             complement(52625..53704)
                      /gene="recF"

From 7e545488ac38cc958445d065a2f24d66a1471627 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20M=C3=A4hler?= <niklas.mahler@gmail.com>
Date: Sun, 19 Apr 2015 15:36:44 +0200
Subject: [PATCH 19/40] Test three-line location string

---
 propex/tests/test_genbank.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index c4f501d..cc59c32 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -230,6 +230,18 @@ def test_multiline_location(self):
         assert gbf.get_qualifier('gene') == 'insZ'
         assert gbf.get_qualifier('locus_tag') == 'b4573'
 
+        feature = '''     CDS             complement(join(1294426..1294992,
+                     1294992..1295141,
+                     1295140..1295322))
+                     /gene="insZ"
+                     /locus_tag="b4573"'''
+
+        assert gbf.feature_type == 'CDS'
+        assert isinstance(gbf.location, JoinLocation)
+        assert len(gbf.location.locations) == 3
+        assert gbf.get_qualifier('gene') == 'insZ'
+        assert gbf.get_qualifier('locus_tag') == 'b4573'
+
     def test_empty_qualifiers(self):
         feature = '''     CDS             complement(52625..53704)
                      /gene="recF"

From f283ab0a7af76923dbc8b8b265fdf74b631c9470 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20M=C3=A4hler?= <niklas.mahler@gmail.com>
Date: Sun, 19 Apr 2015 15:41:27 +0200
Subject: [PATCH 20/40] Better handling of empty lines

This solution is not very pretty, but it works for now.
---
 propex/genbank.py           | 4 ++++
 propex/tests/data/U49845.gb | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index 0249cc0..e2c830c 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -714,9 +714,13 @@ def __getitem__(self, index):
                     f.seek(feature['offset'])
                     feature_string = f.readline()
                     line = f.readline()
+                    while len(line) < 6:
+                        line = f.readline()
                     while line[5] == ' ':
                         feature_string += line
                         line = f.readline()
+                        while len(line) < 6:
+                            line = f.readline()
                     features[ftype].append(
                         GenBankFeature.from_string(locus_index['name'],
                             feature_string))
diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb
index a523cdd..47979fb 100644
--- a/propex/tests/data/U49845.gb
+++ b/propex/tests/data/U49845.gb
@@ -68,7 +68,7 @@ FEATURES             Location/Qualifiers
                      YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
                      HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
                      VDFSNKSNVNVGQVKDIHGRIPEML"
-                     
+
      gene            complement(<3300..>4037)
                      /gene="REV7"
      mRNA            complement(<3300..>4037)
@@ -85,7 +85,7 @@ FEATURES             Location/Qualifiers
                      KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
                      RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
                      LISGDDKILNGVYSQYEEGESIFGSLF"
-ORIGIN      
+ORIGIN
         1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
        61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
       121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa

From e10646f8f9556f824627775ddc10f8e14727d25e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20M=C3=A4hler?= <niklas.mahler@gmail.com>
Date: Sun, 19 Apr 2015 16:07:53 +0200
Subject: [PATCH 21/40] Fix duplicate qualifier bug

---
 propex/genbank.py            | 11 +++++++++--
 propex/tests/test_genbank.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/propex/genbank.py b/propex/genbank.py
index e2c830c..b0ddcdc 100644
--- a/propex/genbank.py
+++ b/propex/genbank.py
@@ -366,13 +366,20 @@ def from_string(cls, locus, feature_string):
 
                 if len(qualifiers) > 0 and key == qualifiers[-1][0]:
                     # Multiple qualifiers with the same key
-                    qualifiers[-1] = (key, [qualifiers[-1][1], value])
+                    if isinstance(qualifiers[-1][1], list):
+                        qualifiers[-1][1].append(value)
+                    else:
+                        qualifiers[-1] = (key, [qualifiers[-1][1], value])
                 else:
                     qualifiers.append((key, value))
             else:
                 # Continuation of qualifier
                 key = qualifiers[-1][0]
-                value = qualifiers[-1][1] + line.strip('"')
+                if isinstance(qualifiers[-1][1], list):
+                    value = qualifiers[-1][1]
+                    value[-1] += ' ' + line.strip('"')
+                else:
+                    value = qualifiers[-1][1] + ' ' + line.strip('"')
                 qualifiers[-1] = (key, value)
 
         return cls(locus, ftype, parse_location(location), dict(qualifiers))
diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py
index cc59c32..e267239 100644
--- a/propex/tests/test_genbank.py
+++ b/propex/tests/test_genbank.py
@@ -203,6 +203,34 @@ def test_parse_feature(self):
         assert gbf_inference == ['ab initio prediction:Prodigal:2.60',
                                  'similar to AA sequence:UniProtKB:Q9RVE0']
 
+    def test_multiple_qualifiers(self):
+        feature = '''     ncRNA           476448..476561
+                     /ncRNA_class="SRP_RNA"
+                     /gene="ffs"
+                     /locus_tag="b0455"
+                     /gene_synonym="ECK0449"
+                     /gene_synonym="JWR0009"
+                     /product="4.5S sRNA component of Signal Recognition
+                     Particle (SRP)"
+                     /note="4.5S RNA; component of ribonucleoprotein particle;
+                     works with the Ffh protein;
+                     adjusted endpoints to reflect the mature 4.5S RNA (114
+                     nt)"
+                     /function="2.2.6 information transfer; RNA related; rRNA,
+                     stable RNA"
+                     /function="2.3.2 information transfer; protein related;
+                     translation"
+                     /function="7.1 location of gene products; cytoplasm"
+                     /function="component of Signal Recognition Particle (SRP)
+                     with the Ffh protein; involved in co-translational
+                     targeting of proteins to membranes"
+                     /function="RNA; Ribosomal and stable RNAs"
+                     /db_xref="ASAP:ABE-0001579"
+                     /db_xref="EcoGene:EG30027"'''
+        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        func = gbf.get_qualifier('function')
+        assert len(func) == 5
+
     def test_feature_join_location(self):
         feature = '''     CDS             join(52625..53704,54000..55000)
                      /gene="recF"

From c2071b2e7da99ffa56c9139b53c8c00b3133e801 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 19 Apr 2015 18:57:09 +0200
Subject: [PATCH 22/40] Change name from propex to seqpoet, close #7

I will update the name of the GitHub repo when I merge these
changes onto the master branch.
---
 bin/{propex => seqpoet}                       | 28 ++++----
 {propex => seqpoet}/__init__.py               |  0
 {propex => seqpoet}/fasta.py                  |  2 +-
 {propex => seqpoet}/genbank.py                |  2 +-
 {propex => seqpoet}/search.py                 |  0
 {propex => seqpoet}/sequence.py               |  0
 {propex => seqpoet}/tests/data/U49845.gb      |  0
 {propex => seqpoet}/tests/data/dups.fasta     |  0
 {propex => seqpoet}/tests/data/dups.fasta.fai |  0
 .../tests/data/dups_noindex.fasta             |  0
 .../tests/data/empty.fasta.fai                |  0
 .../tests/data/empty_sequence.fasta           |  0
 .../tests/data/sample_primers.txt             |  0
 .../tests/data/sample_sequence.fa             |  0
 {propex => seqpoet}/tests/data/uneven.fasta   |  0
 .../tests/data/valid_index.fasta              |  0
 .../tests/data/valid_index.fasta.fai          |  0
 .../tests/data/valid_noindex.fasta            |  0
 {propex => seqpoet}/tests/test_fasta.py       | 58 +++++++--------
 {propex => seqpoet}/tests/test_genbank.py     | 72 +++++++++----------
 {propex => seqpoet}/tests/test_search.py      |  8 +--
 {propex => seqpoet}/tests/test_sequence.py    | 22 +++---
 setup.py                                      | 16 ++---
 23 files changed, 104 insertions(+), 104 deletions(-)
 rename bin/{propex => seqpoet} (93%)
 rename {propex => seqpoet}/__init__.py (100%)
 rename {propex => seqpoet}/fasta.py (99%)
 rename {propex => seqpoet}/genbank.py (99%)
 rename {propex => seqpoet}/search.py (100%)
 rename {propex => seqpoet}/sequence.py (100%)
 rename {propex => seqpoet}/tests/data/U49845.gb (100%)
 rename {propex => seqpoet}/tests/data/dups.fasta (100%)
 rename {propex => seqpoet}/tests/data/dups.fasta.fai (100%)
 rename {propex => seqpoet}/tests/data/dups_noindex.fasta (100%)
 rename {propex => seqpoet}/tests/data/empty.fasta.fai (100%)
 rename {propex => seqpoet}/tests/data/empty_sequence.fasta (100%)
 rename {propex => seqpoet}/tests/data/sample_primers.txt (100%)
 rename {propex => seqpoet}/tests/data/sample_sequence.fa (100%)
 rename {propex => seqpoet}/tests/data/uneven.fasta (100%)
 rename {propex => seqpoet}/tests/data/valid_index.fasta (100%)
 rename {propex => seqpoet}/tests/data/valid_index.fasta.fai (100%)
 rename {propex => seqpoet}/tests/data/valid_noindex.fasta (100%)
 rename {propex => seqpoet}/tests/test_fasta.py (79%)
 rename {propex => seqpoet}/tests/test_genbank.py (91%)
 rename {propex => seqpoet}/tests/test_search.py (94%)
 rename {propex => seqpoet}/tests/test_sequence.py (72%)

diff --git a/bin/propex b/bin/seqpoet
similarity index 93%
rename from bin/propex
rename to bin/seqpoet
index 7fe112f..5528e67 100644
--- a/bin/propex
+++ b/bin/seqpoet
@@ -5,12 +5,12 @@ import itertools
 import os
 import sys
 
-import propex
+import seqpoet
 
 def get_probe(fname):
     with open(fname) as f:
         try:
-            seqs = [propex.sequence.Sequence(line.strip()) for line in f \
+            seqs = [seqpoet.sequence.Sequence(line.strip()) for line in f \
                 if len(line.strip()) > 0]
         except ValueError:
             print('ERROR: probe file does not contain valid sequences',
@@ -30,14 +30,14 @@ def get_single_sequence(fname, genbank_only=False, stop_on_error=False):
     genbank_success = False
     fasta_success = False
     try:
-        seq = propex.GenBank(fname)
+        seq = seqpoet.GenBank(fname)
         genbank_success = True
-    except propex.genbank.ParsingError:
+    except seqpoet.genbank.ParsingError:
         pass
 
     if not genbank_success and not genbank_only:
         try:
-            seq = propex.Fasta(fname)
+            seq = seqpoet.Fasta(fname)
             fasta_success = True
         except ValueError:
             pass
@@ -100,10 +100,10 @@ def match_probe(probe, seqs, mismatches=2):
     pl = len(probe)
     for f in seqs.itervalues():
         for i, record in enumerate(f):
-            res1 = propex.search.search(str(probe), str(record.seq),
+            res1 = seqpoet.search.search(str(probe), str(record.seq),
                 mismatches=mismatches)
             res2 = [len(record.seq) - x - pl for x in \
-                propex.search.search(str(probe), str(record.seq.revcomp()),
+                seqpoet.search.search(str(probe), str(record.seq.revcomp()),
                     mismatches=mismatches)]
 
             if len(res1) > 0:
@@ -143,16 +143,16 @@ def match_primer(primers, seqs, mismatches=2,
     pl2 = len(primers[1])
     for f in seqs.itervalues():
         for i, record in enumerate(f):
-            res1_1 = propex.search.search(str(primers[0]), str(record.seq),
+            res1_1 = seqpoet.search.search(str(primers[0]), str(record.seq),
                 mismatches=mismatches)
             res1_2 = [len(record.seq) - x - pl1 for x in \
-                propex.search.search(str(primers[0]), str(record.seq.revcomp()),
+                seqpoet.search.search(str(primers[0]), str(record.seq.revcomp()),
                     mismatches=mismatches)]
 
-            res2_1 = propex.search.search(str(primers[1]), str(record.seq),
+            res2_1 = seqpoet.search.search(str(primers[1]), str(record.seq),
                 mismatches=mismatches)
             res2_2 = [len(record.seq) - x - pl2 for x in \
-                propex.search.search(str(primers[1]), str(record.seq.revcomp()),
+                seqpoet.search.search(str(primers[1]), str(record.seq.revcomp()),
                     mismatches=mismatches)]
 
             # Match res1_1 with res2_2 and res2_1 with res1_2 to get primer
@@ -208,7 +208,7 @@ def find_operon(matches, seqs, max_distance=500):
     for m in matches:
         gb = seqs[m['filename']]
         locus = gb[m['seqindex']]
-        location = propex.genbank.Location.from_int(m['hitstart'], m['hitend'])
+        location = seqpoet.genbank.Location.from_int(m['hitstart'], m['hitend'])
         features = locus.features_at_location(location)
         if len(features) == 0:
             print('WARNING: no gene for match in locus {0}'.format(m['seqname']),
@@ -268,7 +268,7 @@ def write_fasta(matches, filename=sys.stdout):
 
     for m in matches:
         m['filename'] = os.path.basename(m['filename'])
-        s = propex.fasta.FastaRecord(m['seq'],
+        s = seqpoet.fasta.FastaRecord(m['seq'],
             '{filename}:{seqname}:{hitstart}:{hitend}:{length}:{strand}' \
                 .format(**m))
         print(s, file=f)
@@ -311,7 +311,7 @@ def parse_args():
         default=sys.stdout)
 
     parser.add_argument('--version', help=('print version and exit'),
-        action='version', version='%(prog)s v{0}'.format(propex.__version__))
+        action='version', version='%(prog)s v{0}'.format(seqpoet.__version__))
 
     args = parser.parse_args()
 
diff --git a/propex/__init__.py b/seqpoet/__init__.py
similarity index 100%
rename from propex/__init__.py
rename to seqpoet/__init__.py
diff --git a/propex/fasta.py b/seqpoet/fasta.py
similarity index 99%
rename from propex/fasta.py
rename to seqpoet/fasta.py
index a86de1b..eb0d50e 100644
--- a/propex/fasta.py
+++ b/seqpoet/fasta.py
@@ -7,7 +7,7 @@
 import os
 import textwrap
 
-from propex.sequence import Sequence
+from seqpoet.sequence import Sequence
 
 class FastaIndex(object):
     """Represents an index for a FASTA file.
diff --git a/propex/genbank.py b/seqpoet/genbank.py
similarity index 99%
rename from propex/genbank.py
rename to seqpoet/genbank.py
index b0ddcdc..9f17a3b 100644
--- a/propex/genbank.py
+++ b/seqpoet/genbank.py
@@ -9,7 +9,7 @@
 import itertools
 import re
 
-from propex.sequence import Sequence
+from seqpoet.sequence import Sequence
 
 class LocationError(Exception):
     pass
diff --git a/propex/search.py b/seqpoet/search.py
similarity index 100%
rename from propex/search.py
rename to seqpoet/search.py
diff --git a/propex/sequence.py b/seqpoet/sequence.py
similarity index 100%
rename from propex/sequence.py
rename to seqpoet/sequence.py
diff --git a/propex/tests/data/U49845.gb b/seqpoet/tests/data/U49845.gb
similarity index 100%
rename from propex/tests/data/U49845.gb
rename to seqpoet/tests/data/U49845.gb
diff --git a/propex/tests/data/dups.fasta b/seqpoet/tests/data/dups.fasta
similarity index 100%
rename from propex/tests/data/dups.fasta
rename to seqpoet/tests/data/dups.fasta
diff --git a/propex/tests/data/dups.fasta.fai b/seqpoet/tests/data/dups.fasta.fai
similarity index 100%
rename from propex/tests/data/dups.fasta.fai
rename to seqpoet/tests/data/dups.fasta.fai
diff --git a/propex/tests/data/dups_noindex.fasta b/seqpoet/tests/data/dups_noindex.fasta
similarity index 100%
rename from propex/tests/data/dups_noindex.fasta
rename to seqpoet/tests/data/dups_noindex.fasta
diff --git a/propex/tests/data/empty.fasta.fai b/seqpoet/tests/data/empty.fasta.fai
similarity index 100%
rename from propex/tests/data/empty.fasta.fai
rename to seqpoet/tests/data/empty.fasta.fai
diff --git a/propex/tests/data/empty_sequence.fasta b/seqpoet/tests/data/empty_sequence.fasta
similarity index 100%
rename from propex/tests/data/empty_sequence.fasta
rename to seqpoet/tests/data/empty_sequence.fasta
diff --git a/propex/tests/data/sample_primers.txt b/seqpoet/tests/data/sample_primers.txt
similarity index 100%
rename from propex/tests/data/sample_primers.txt
rename to seqpoet/tests/data/sample_primers.txt
diff --git a/propex/tests/data/sample_sequence.fa b/seqpoet/tests/data/sample_sequence.fa
similarity index 100%
rename from propex/tests/data/sample_sequence.fa
rename to seqpoet/tests/data/sample_sequence.fa
diff --git a/propex/tests/data/uneven.fasta b/seqpoet/tests/data/uneven.fasta
similarity index 100%
rename from propex/tests/data/uneven.fasta
rename to seqpoet/tests/data/uneven.fasta
diff --git a/propex/tests/data/valid_index.fasta b/seqpoet/tests/data/valid_index.fasta
similarity index 100%
rename from propex/tests/data/valid_index.fasta
rename to seqpoet/tests/data/valid_index.fasta
diff --git a/propex/tests/data/valid_index.fasta.fai b/seqpoet/tests/data/valid_index.fasta.fai
similarity index 100%
rename from propex/tests/data/valid_index.fasta.fai
rename to seqpoet/tests/data/valid_index.fasta.fai
diff --git a/propex/tests/data/valid_noindex.fasta b/seqpoet/tests/data/valid_noindex.fasta
similarity index 100%
rename from propex/tests/data/valid_noindex.fasta
rename to seqpoet/tests/data/valid_noindex.fasta
diff --git a/propex/tests/test_fasta.py b/seqpoet/tests/test_fasta.py
similarity index 79%
rename from propex/tests/test_fasta.py
rename to seqpoet/tests/test_fasta.py
index e844609..f3c9d31 100644
--- a/propex/tests/test_fasta.py
+++ b/seqpoet/tests/test_fasta.py
@@ -2,7 +2,7 @@
 from nose.plugins.skip import SkipTest
 import os
 
-import propex
+import seqpoet
 
 class TestFastaIndex:
 
@@ -13,61 +13,61 @@ def setUp(self):
         self.empty_index = os.path.join(testdir, 'data', 'empty.fasta.fai')
 
     def test_faidx_length(self):
-        faidx = propex.FastaIndex(self.valid_index)
+        faidx = seqpoet.FastaIndex(self.valid_index)
         assert len(faidx) == 4
 
     def test_faidx_order(self):
-        faidx = propex.FastaIndex(self.valid_index)
+        faidx = seqpoet.FastaIndex(self.valid_index)
         assert faidx[0]['name'] == 'seq1'
         assert faidx[1]['name'] == 'seq2'
         assert faidx[2]['name'] == 'aaa'
         assert faidx[3]['name'] == 'bbb'
 
     def test_str(self):
-        faidx = propex.FastaIndex(self.valid_index)
+        faidx = seqpoet.FastaIndex(self.valid_index)
         assert str(faidx) == '\n'.join(['seq1\t78\t6\t28\t29',
                                         'seq2\t28\t93\t28\t29',
                                         'aaa\t44\t127\t28\t29',
                                         'bbb\t73\t178\t28\t29'])
 
     def test_keys(self):
-        faidx = propex.FastaIndex(self.valid_index)
+        faidx = seqpoet.FastaIndex(self.valid_index)
         assert faidx.keys() == ['seq1', 'seq2', 'aaa', 'bbb']
 
     def test_iter(self):
-        faidx = propex.FastaIndex(self.valid_index)
+        faidx = seqpoet.FastaIndex(self.valid_index)
         for k, v in faidx:
             pass
 
     def test_repr(self):
-        faidx = propex.FastaIndex(self.valid_index)
+        faidx = seqpoet.FastaIndex(self.valid_index)
         assert repr(faidx) == '<FastaIndex for {0}>' \
             .format(os.path.splitext(self.valid_index)[0])
 
     @raises(ValueError)
     def test_nonexisting_index(self):
-        faidx = propex.FastaIndex(self.invalid_index)
+        faidx = seqpoet.FastaIndex(self.invalid_index)
 
     @raises(ValueError)
     def test_empty_index(self):
-        faidx = propex.FastaIndex(self.empty_index)
+        faidx = seqpoet.FastaIndex(self.empty_index)
 
     @raises(ValueError)
     def test_incorrect_filetype(self):
-        faidx = propex.FastaIndex(os.path.splitext(self.invalid_index)[0])
+        faidx = seqpoet.FastaIndex(os.path.splitext(self.invalid_index)[0])
 
 class TestFastaRecord:
 
     def test_string_sequence(self):
-        fr = propex.FastaRecord('accaggata', 'test')
+        fr = seqpoet.FastaRecord('accaggata', 'test')
 
     @raises(ValueError)
     def test_invalid_sequence(self):
-        fr = propex.FastaRecord('thisisnotdna', 'test')
+        fr = seqpoet.FastaRecord('thisisnotdna', 'test')
 
     @raises(TypeError)
     def test_wrong_sequence_type(self):
-        fr = propex.FastaRecord(['a', 'c', 'g', 't'], 'test')
+        fr = seqpoet.FastaRecord(['a', 'c', 'g', 't'], 'test')
 
 class TestFasta:
 
@@ -78,11 +78,11 @@ def setUp(self):
         self.dups_fname = os.path.join(testdir, 'data', 'dups.fasta')
 
     def test_fasta_length(self):
-        fasta = propex.Fasta(self.valid_index)
+        fasta = seqpoet.Fasta(self.valid_index)
         assert len(fasta) == 4, 'unexpected number of sequences'
 
     def test_fasta_headers(self):
-        fasta = propex.Fasta(self.valid_index)
+        fasta = seqpoet.Fasta(self.valid_index)
         headers = ['seq1', 'seq2', 'aaa', 'bbb']
         for i, record in enumerate(fasta):
             assert record.name == headers[i], \
@@ -91,14 +91,14 @@ def test_fasta_headers(self):
                 'spaces in sequence'
 
     def test_sequence_length(self):
-        fasta = propex.Fasta(self.valid_index)
+        fasta = seqpoet.Fasta(self.valid_index)
         lens = [78, 28, 44, 73]
         for i, record in enumerate(fasta):
             assert len(record) == lens[i], \
                 'sequence length ({0}) is not {1}'.format(len(record), lens[i])
 
     def test_indexing(self):
-        fasta = propex.Fasta(self.valid_index)
+        fasta = seqpoet.Fasta(self.valid_index)
         assert len(fasta[1]) == 28
         assert fasta[1].seq == 'cacaggaggatagaccagatgacagata'
         assert repr(fasta[1]) == '<FastaRecord \'seq2\': <Sequence: cacag...> (28 nt)>', \
@@ -106,12 +106,12 @@ def test_indexing(self):
 
     @raises(IndexError)
     def test_invalid_index(self):
-        fasta = propex.Fasta(self.valid_index)
+        fasta = seqpoet.Fasta(self.valid_index)
         fasta[4]
 
     @raises(ValueError)
     def test_parse_duplicate_fasta(self):
-        fasta = propex.Fasta(self.dups_fname)
+        fasta = seqpoet.Fasta(self.dups_fname)
 
 class TestFastaWithoutIndex:
 
@@ -125,11 +125,11 @@ def tearDown(self):
             os.unlink(self.valid_noindex + '.fai')
 
     def test_fasta_length(self):
-        fasta = propex.Fasta(self.valid_noindex)
+        fasta = seqpoet.Fasta(self.valid_noindex)
         assert len(fasta) == 4, 'found {0} seqs, expected 4'.format(len(fasta))
 
     def test_fasta_headers(self):
-        fasta = propex.Fasta(self.valid_noindex)
+        fasta = seqpoet.Fasta(self.valid_noindex)
         headers = ['seq1', 'seq2', 'aaa', 'bbb']
         for i, record in enumerate(fasta):
             assert record.name == headers[i], \
@@ -138,14 +138,14 @@ def test_fasta_headers(self):
                 'spaces in sequence'
 
     def test_sequence_length(self):
-        fasta = propex.Fasta(self.valid_noindex)
+        fasta = seqpoet.Fasta(self.valid_noindex)
         lens = [78, 28, 44, 73]
         for i, record in enumerate(fasta):
             assert len(record) == lens[i], \
                 'sequence length ({0}) is not {1}'.format(len(record), lens[i])
 
     def test_record_repr(self):
-        fasta = propex.Fasta(self.valid_noindex)
+        fasta = seqpoet.Fasta(self.valid_noindex)
         headers = ['seq1', 'seq2', 'aaa', 'bbb']
         seqs = ['actaa', 'cacag', 'actga', 'acatc']
         lens = [78, 28, 44, 73]
@@ -156,7 +156,7 @@ def test_record_repr(self):
 
     @raises(ValueError)
     def test_duplicate_headers(self):
-        fasta = propex.Fasta(self.dups_noindex)
+        fasta = seqpoet.Fasta(self.dups_noindex)
 
 class TestInvalidFasta:
 
@@ -168,10 +168,10 @@ def setUp(self):
             'operon_extractor', 'data_genbank', 'LMG718-cremoris.gb')
 
     def test_empty_sequence(self):
-        fasta = propex.Fasta(self.empty_sequence)
+        fasta = seqpoet.Fasta(self.empty_sequence)
 
     def test_fasta_headers(self):
-        fasta = propex.Fasta(self.empty_sequence)
+        fasta = seqpoet.Fasta(self.empty_sequence)
         headers = ['seq1', 'seq2', 'empty', 'aaa', 'bbb']
         for i, record in enumerate(fasta):
             assert record.name == headers[i], \
@@ -180,7 +180,7 @@ def test_fasta_headers(self):
                 'spaces in sequence'
 
     def test_sequence_length(self):
-        fasta = propex.Fasta(self.empty_sequence)
+        fasta = seqpoet.Fasta(self.empty_sequence)
         lens = [78, 28, 0, 44, 73]
         for i, record in enumerate(fasta):
             assert len(record) == lens[i], \
@@ -190,8 +190,8 @@ def test_sequence_length(self):
     def test_genbank(self):
         if not os.path.isfile(self.gb):
             raise SkipTest
-        fasta = propex.Fasta(self.gb)
+        fasta = seqpoet.Fasta(self.gb)
 
     @raises(ValueError)
     def test_uneven_rows(self):
-        fasta = propex.Fasta(self.uneven)
+        fasta = seqpoet.Fasta(self.uneven)
diff --git a/propex/tests/test_genbank.py b/seqpoet/tests/test_genbank.py
similarity index 91%
rename from propex/tests/test_genbank.py
rename to seqpoet/tests/test_genbank.py
index e267239..d9f6593 100644
--- a/propex/tests/test_genbank.py
+++ b/seqpoet/tests/test_genbank.py
@@ -2,15 +2,15 @@
 from nose.plugins.skip import SkipTest
 import os
 
-import propex
-from propex.genbank import Location, JoinLocation
+import seqpoet
+from seqpoet.genbank import Location, JoinLocation
 
 class TestGenBank:
 
     def setUp(self):
         self.testdir = os.path.dirname(__file__)
         self.sc = os.path.join(self.testdir, 'data', 'U49845.gb')
-        self.gb = propex.GenBank(self.sc)
+        self.gb = seqpoet.GenBank(self.sc)
 
     def test_sequence_length(self):
         assert len(self.gb[0].seq) == 5028
@@ -56,29 +56,29 @@ def setUp(self):
         self.lmga18 = os.path.join(self.genbankdir, 'LMGA18-cremoris.gb')
 
     def test_index_length(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         assert len(gb) == 251, 'unexpected number of loci: {0}'.format(len(gb))
 
     def test_duplicate_locus_length(self):
-        gb = propex.GenBank(self.lmga18)
+        gb = seqpoet.GenBank(self.lmga18)
         assert len(gb) == 231, 'unexpected number of loci: {0}'.format(len(gb))
 
     def test_sequence_length(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         assert len(gb[0].seq) == 1522
 
     def test_iteration(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         for locus in gb:
             pass
 
     def test_load_directory(self):
-        gbs = [propex.GenBank(os.path.join(self.genbankdir, x)) \
+        gbs = [seqpoet.GenBank(os.path.join(self.genbankdir, x)) \
             for x in os.listdir(self.genbankdir) \
             if os.path.isfile(os.path.join(self.genbankdir, x))]
 
     def test_features_at_location(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_100_c')[0]
         f = locus.features_at_location(Location('800'))
         assert len(f) == 1, 'found {0} features, expected 1'.format(len(f))
@@ -95,39 +95,39 @@ def test_features_at_location(self):
         assert f[1].get_qualifier('locus_tag') == 'LMG718_00020'
 
     def test_get_locus_from_name(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         loci = gb.get_locus_from_name('718_Contig_106_c')
         assert len(loci) > 0
         assert len(loci[0].seq) == 8967
 
-    @raises(propex.genbank.ParsingError)
+    @raises(seqpoet.genbank.ParsingError)
     def test_parse_fasta(self):
-        gb = propex.GenBank(os.path.join(self.genbankdir, '..', 'data_fasta',
+        gb = seqpoet.GenBank(os.path.join(self.genbankdir, '..', 'data_fasta',
             'LMG718-cremoris.fasta'))
 
     def test_next_downstream(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_10_co')[0]
         gbf = locus.features_at_location(Location('1355'))[0]
         next = locus.next_downstream(gbf)
         assert str(next.location) == '2532..2819'
 
     def test_next_downstream_duplicate_loci(self):
-        gb = propex.GenBank(self.lmga18)
+        gb = seqpoet.GenBank(self.lmga18)
         locus = gb.get_locus_from_name('LMGA18_Contig_10')[1]
         gbf = locus.features_at_location(Location('301'))[0]
         next = locus.next_downstream(gbf)
         assert str(next.location) == '3180..3404'
 
     def test_next_downstream_last(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_102_c')[0]
         gbf = locus.features_at_location(Location('9765'))[0]
         next = locus.next_downstream(gbf)
         assert next is None
 
     def test_next_downstream_complement(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_101_c')[0]
         gbf = locus.features_at_location(Location('7664'))[0]
         next = locus.next_downstream(gbf)
@@ -138,28 +138,28 @@ def test_next_downstream_complement(self):
         assert str(next.location) == 'complement(2752..5457)'
 
     def test_next_upstream(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_106_c')[0]
         gbf = locus.features_at_location(Location('754'))[0]
         next = locus.next_upstream(gbf)
         assert str(next.location) == '58..747'
 
     def test_next_upstream_duplicate_loci(self):
-        gb = propex.GenBank(self.lmga18)
+        gb = seqpoet.GenBank(self.lmga18)
         locus = gb.get_locus_from_name('LMGA18_Contig_10')[1]
         gbf = locus.features_at_location(Location('3180'))[0]
         next = locus.next_upstream(gbf)
         assert str(next.location) == '301..1245'
 
     def test_next_upstream_last(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_106_c')[0]
         gbf = locus.features_at_location(Location('58'))[0]
         next = locus.next_upstream(gbf)
         assert next is None
 
     def test_next_upstream_complement(self):
-        gb = propex.GenBank(self.lmg718)
+        gb = seqpoet.GenBank(self.lmg718)
         locus = gb.get_locus_from_name('718_Contig_106_c')[0]
         gbf = locus.features_at_location(Location('7161'))[0]
         next = locus.next_upstream(gbf)
@@ -173,7 +173,7 @@ class TestGenBankFeature:
 
     def test_qualifier_names(self):
         f = {'name': 'lalala'}
-        gbf = propex.GenBankFeature('testlocus', 'CDS', '123..679', f)
+        gbf = seqpoet.GenBankFeature('testlocus', 'CDS', '123..679', f)
         assert gbf.get_qualifier('name') == f['name'], \
             'wrong name: {0}'.format(gbf.get_qualifier('name'))
 
@@ -193,7 +193,7 @@ def test_parse_feature(self):
                      LKITYNQNVKTDFSKELLSRQDHDIFRHQTTVGPHRDDLQFFINEINVADFGSQGQQR
                      TVTLSIKLAEIDLIFEETGEYPILLLDDVMSELDNHRQLDLIETSLGKTQTFITTTTL
                      DHLKNLPENLSIFHVTDGTIEKEKE"'''
-        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
 
         assert gbf.feature_type == 'CDS'
         gbf_gene = gbf.get_qualifier('gene')
@@ -227,7 +227,7 @@ def test_multiple_qualifiers(self):
                      /function="RNA; Ribosomal and stable RNAs"
                      /db_xref="ASAP:ABE-0001579"
                      /db_xref="EcoGene:EG30027"'''
-        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
         func = gbf.get_qualifier('function')
         assert len(func) == 5
 
@@ -239,7 +239,7 @@ def test_feature_join_location(self):
                      /inference="similar to AA sequence:UniProtKB:Q9RVE0"
                      /codon_start=1
                      /transl_table=11'''
-        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
 
         assert gbf.feature_type == 'CDS'
         assert gbf.get_qualifier('gene') == 'recF'
@@ -250,7 +250,7 @@ def test_multiline_location(self):
                      1295140..1295322))
                      /gene="insZ"
                      /locus_tag="b4573"'''
-        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
 
         assert gbf.feature_type == 'CDS'
         assert isinstance(gbf.location, JoinLocation)
@@ -276,7 +276,7 @@ def test_empty_qualifiers(self):
                      /locus_tag=
                      /note
                      /random=""'''
-        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
 
         assert gbf.get_qualifier('locus_tag') == ''
         assert gbf.get_qualifier('note') is None
@@ -286,22 +286,22 @@ def test_empty_qualifiers(self):
     def test_missing_qualifier(self):
         feature = '''     CDS             complement(52625..53704)
                      /gene="recF"'''
-        gbf = propex.GenBankFeature.from_string('testlocus', feature)
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
         gbf.get_qualifier('locus_tag')
 
     def test_empty_qualifiers(self):
-        gbf = propex.GenBankFeature('testlocus', 'CDS', '123..679')
+        gbf = seqpoet.GenBankFeature('testlocus', 'CDS', '123..679')
         assert isinstance(gbf.qualifiers, list)
         assert len(gbf.qualifiers) == 0
 
     def test_equality(self):
-        gbf1 = propex.GenBankFeature('testlocus', 'CDS',
+        gbf1 = seqpoet.GenBankFeature('testlocus', 'CDS',
             Location('123..679'), {'name': 'randomname'})
-        gbf2 = propex.GenBankFeature('testlocus', 'CDS',
+        gbf2 = seqpoet.GenBankFeature('testlocus', 'CDS',
             Location('123..679'), {'name': 'randomname'})
-        gbf3 = propex.GenBankFeature('testlocus', 'CDS',
+        gbf3 = seqpoet.GenBankFeature('testlocus', 'CDS',
             Location('123..679'), {'name': 'otherrandomname'})
-        gbf4 = propex.GenBankFeature('testlocus', 'CDS',
+        gbf4 = seqpoet.GenBankFeature('testlocus', 'CDS',
             Location('120..679'), {'name': 'randomname'})
 
         assert gbf1 == gbf2
@@ -443,7 +443,7 @@ def test_overlap(self):
         assert not loc4.overlaps(loc3)
         assert not loc1.overlaps(loc5)
 
-    @raises(propex.genbank.LocationError)
+    @raises(seqpoet.genbank.LocationError)
     def test_invalid_location(self):
         loc = Location('123..noloc')
 
@@ -488,15 +488,15 @@ def test_instance(self):
         assert isinstance(self.jloc1, JoinLocation)
         assert isinstance(self.jloc2, JoinLocation)
 
-    @raises(propex.genbank.LocationError)
+    @raises(seqpoet.genbank.LocationError)
     def test_invalid_location(self):
         jloc = JoinLocation('join(1..200,300..400')
 
-    @raises(propex.genbank.LocationError)
+    @raises(seqpoet.genbank.LocationError)
     def test_invalid_strands(self):
         jloc = JoinLocation('join(complement(100),200)')
 
-    @raises(propex.genbank.LocationError)
+    @raises(seqpoet.genbank.LocationError)
     def test_messed_up_location(self):
         jloc = JoinLocation('complement(join(687..700,800..900,1000..1100))mRNA            <687..>3158')
 
diff --git a/propex/tests/test_search.py b/seqpoet/tests/test_search.py
similarity index 94%
rename from propex/tests/test_search.py
rename to seqpoet/tests/test_search.py
index a985c9b..e646d1c 100644
--- a/propex/tests/test_search.py
+++ b/seqpoet/tests/test_search.py
@@ -4,10 +4,10 @@
 from nose.tools import raises
 from nose.plugins.skip import SkipTest
 
-from propex.search import search, hamming_distance
-from propex import Sequence
-from propex import GenBank
-from propex.genbank import Location
+from seqpoet.search import search, hamming_distance
+from seqpoet import Sequence
+from seqpoet import GenBank
+from seqpoet.genbank import Location
 
 class TestHammingDistance:
 
diff --git a/propex/tests/test_sequence.py b/seqpoet/tests/test_sequence.py
similarity index 72%
rename from propex/tests/test_sequence.py
rename to seqpoet/tests/test_sequence.py
index 3fcbed7..33e38fe 100644
--- a/propex/tests/test_sequence.py
+++ b/seqpoet/tests/test_sequence.py
@@ -3,7 +3,7 @@
 
 from nose.tools import raises
 
-import propex
+import seqpoet
 
 class TestSequence:
 
@@ -12,42 +12,42 @@ def setup(self):
         self.illegal = 'agagcatgcacthisisnotcorrect'
 
     def test_sequence_length(self):
-        s = propex.Sequence(self.seq1)
+        s = seqpoet.Sequence(self.seq1)
         assert len(s) == len(self.seq1)
 
     def test_casing(self):
-        s = propex.Sequence(self.seq1)
+        s = seqpoet.Sequence(self.seq1)
         assert re.match('^[acgt]+$', str(s))
 
     def test_reverse_complement(self):
-        s = propex.Sequence(self.seq1)
-        s2 = propex.Sequence('acct')
+        s = seqpoet.Sequence(self.seq1)
+        s2 = seqpoet.Sequence('acct')
         assert s.revcomp() == 'tatgtgtctctattctgtgtatgt', \
             '"{0}" is not "tatgtgtctctattctgtgtatgt"'.format(s.revcomp().seq)
         assert s2.revcomp() == 'aggt', \
             '"{0}" is not "aggt"'.format(s2.revcomp().seq)
 
     def test_str(self):
-        s = propex.Sequence(self.seq1)
+        s = seqpoet.Sequence(self.seq1)
         assert str(s) == self.seq1.lower()
 
     def test_repr(self):
-        s = propex.Sequence(self.seq1)
+        s = seqpoet.Sequence(self.seq1)
         assert repr(s) == '<Sequence: acata...>'
         assert repr(s.revcomp()) == '<Sequence: tatgt...>'
 
     def test_indexing(self):
-        s = propex.Sequence(self.seq1)
+        s = seqpoet.Sequence(self.seq1)
         assert s[4] == 'a'
         assert s[:5] == 'acata'
         assert s[-6:] == 'cacata'
         assert s[4:8] == 'acac'
 
     def test_equality(self):
-        s = propex.Sequence(self.seq1)
+        s = seqpoet.Sequence(self.seq1)
         assert s == self.seq1.lower()
-        assert s[:3] == propex.Sequence(self.seq1[:3])
+        assert s[:3] == seqpoet.Sequence(self.seq1[:3])
 
     @raises(ValueError)
     def test_illegal_characters(self):
-        s = propex.Sequence(self.illegal)
+        s = seqpoet.Sequence(self.illegal)
diff --git a/setup.py b/setup.py
index 028d288..0cc2cf5 100644
--- a/setup.py
+++ b/setup.py
@@ -8,12 +8,12 @@
 
 if sys.version_info[0] < 2 or \
         sys.version_info[0] == 2 and sys.version_info[1] < 7:
-    sys.stderr.write('Error in propex setup\n')
-    sys.stderr.write('You need at least version 2.7 of Python to use propex\n')
+    sys.stderr.write('Error in seqpoet setup\n')
+    sys.stderr.write('You need at least version 2.7 of Python to use seqpoet\n')
     sys.exit(1)
 
 if sys.version_info[0] >= 3:
-    sys.stderr.write('Error in propex setup\n')
+    sys.stderr.write('Error in seqpoet setup\n')
     sys.stderr.write('This package only works with Python 2 at the moment\n')
     sys.stderr.write('Please use Python 2.x, x >= 7\n')
     sys.exit(1)
@@ -33,20 +33,20 @@ def find_version(*file_paths):
         return version_match.group(1)
     raise RuntimeError("Unable to find version string.")
 
-setup(name='propex',
-    version=find_version('propex/__init__.py'),
+setup(name='seqpoet',
+    version=find_version('seqpoet/__init__.py'),
     description='In silico PCR and operon extraction',
-    url='https://github.com/maehler/propex',
+    url='https://github.com/maehler/seqpoet',
     author='Niklas Mähler',
     author_email='niklas.mahler@gmail.com',
     maintainer='Niklas Mähler',
     maintainer_email='niklas.mahler@gmail.com',
     license='MIT',
-    packages=['propex'],
+    packages=['seqpoet'],
     zip_safe=False,
     test_suite='nose.collector',
     tests_require=['nose'],
-    scripts=['bin/propex'],
+    scripts=['bin/seqpoet'],
     classifiers=[
         'Development Status :: 3 - Alpha',
         'Intended Audience :: Science/Research',

From e00ea2851dfdbf8e2d731f93f02ef2f31c134432 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 19 Apr 2015 19:39:31 +0200
Subject: [PATCH 23/40] Reverse complement results, close #6

---
 bin/seqpoet | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/bin/seqpoet b/bin/seqpoet
index 5528e67..7e7e798 100644
--- a/bin/seqpoet
+++ b/bin/seqpoet
@@ -203,7 +203,7 @@ def match_primer(primers, seqs, mismatches=2,
 
     return matches
 
-def find_operon(matches, seqs, max_distance=500):
+def find_operon(matches, seqs, max_distance=500, no_revcomp=False):
     match_operon = []
     for m in matches:
         gb = seqs[m['filename']]
@@ -243,7 +243,9 @@ def find_operon(matches, seqs, max_distance=500):
 
         operon_seq = locus.seq[min_start:max_end]
 
-        # Reverse complement matches on minus-strand?
+        # Reverse complement matches on minus-strand
+        if not no_revcomp and m['strand'] == '-':
+            operon_seq = operon_seq.revcomp()
 
         match_operon.append({
             'filename': m['filename'],
@@ -258,7 +260,7 @@ def find_operon(matches, seqs, max_distance=500):
 
     return match_operon
 
-def write_fasta(matches, filename=sys.stdout):
+def write_fasta(matches, filename=sys.stdout, no_revcomp=False):
     if isinstance(filename, file):
         f = filename
         close = False
@@ -267,6 +269,8 @@ def write_fasta(matches, filename=sys.stdout):
         close = True
 
     for m in matches:
+        if not no_revcomp and m['strand'] == '-':
+            m['seq'] = m['seq'].revcomp()
         m['filename'] = os.path.basename(m['filename'])
         s = seqpoet.fasta.FastaRecord(m['seq'],
             '{filename}:{seqname}:{hitstart}:{hitend}:{length}:{strand}' \
@@ -307,6 +311,10 @@ def parse_args():
         'to consider (default: %(default)d)'), type=int, default=3000,
         metavar='N')
 
+    parser.add_argument('--no-revcomp', help=('don\'t reverse complement '
+        'results on the minus strand (default: do reverse complementation)'),
+        action='store_true')
+
     parser.add_argument('-o', '--out', help='file for output (default: stdout)',
         default=sys.stdout)
 
@@ -376,12 +384,13 @@ def main():
 
     # In silico PCR results
     if is_primer and args.pcr:
-        write_fasta(matches, filename=args.out)
+        write_fasta(matches, filename=args.out, no_revcomp=args.no_revcomp)
         exit(0)
 
     # Operon extraction
     print('Looking for operons', file=sys.stderr)
-    match_features = find_operon(matches, seqs, max_distance=args.max_distance)
+    match_features = find_operon(matches, seqs, max_distance=args.max_distance,
+        no_revcomp=args.no_revcomp)
 
     if len(match_features) == 0:
         print('WARNING: no operons found', file=sys.stderr)

From d7bd6efb86e81620ef211d05b7df096d6c84de69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 19:37:00 +0200
Subject: [PATCH 24/40] Fix minimal feature bug

---
 seqpoet/genbank.py            | 2 ++
 seqpoet/tests/test_genbank.py | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py
index 9f17a3b..7c867e5 100644
--- a/seqpoet/genbank.py
+++ b/seqpoet/genbank.py
@@ -339,6 +339,8 @@ def from_string(cls, locus, feature_string):
         """
         lines = [x.strip() for x in feature_string.splitlines()]
         ftype, location = lines[0].strip().split()
+        if len(lines) == 1:
+            return cls(locus, ftype, parse_location(location), {})
         # Multiline location string
         i = 1
         line = lines[i]
diff --git a/seqpoet/tests/test_genbank.py b/seqpoet/tests/test_genbank.py
index d9f6593..6f8a46c 100644
--- a/seqpoet/tests/test_genbank.py
+++ b/seqpoet/tests/test_genbank.py
@@ -282,6 +282,13 @@ def test_empty_qualifiers(self):
         assert gbf.get_qualifier('note') is None
         assert gbf.get_qualifier('random') == ''
 
+    def test_minimal_feature(self):
+        feature = '     CDS             complement(52625..53704)'
+        gbf = seqpoet.GenBankFeature.from_string('testlocus', feature)
+
+        assert gbf.feature_type == 'CDS'
+        assert str(gbf.location) == 'complement(52625..53704)'
+
     @raises(KeyError)
     def test_missing_qualifier(self):
         feature = '''     CDS             complement(52625..53704)

From db82ca0d1db4591e22da117fdd9c25cf3138c4c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 20:50:15 +0200
Subject: [PATCH 25/40] Forgot to update name in docs

---
 docs/Makefile                    |  8 ++++----
 docs/conf.py                     | 18 +++++++++---------
 docs/index.rst                   | 20 ++++++++++----------
 docs/installation.rst            |  6 +++---
 docs/make.bat                    |  4 ++--
 docs/{propex.rst => seqpoet.rst} | 30 +++++++++++++++---------------
 6 files changed, 43 insertions(+), 43 deletions(-)
 rename docs/{propex.rst => seqpoet.rst} (57%)

diff --git a/docs/Makefile b/docs/Makefile
index 48244a5..bdae658 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -87,9 +87,9 @@ qthelp:
 	@echo
 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/propex.qhcp"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/seqpoet.qhcp"
 	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/propex.qhc"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/seqpoet.qhc"
 
 applehelp:
 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@@ -104,8 +104,8 @@ devhelp:
 	@echo
 	@echo "Build finished."
 	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/propex"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/propex"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/seqpoet"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/seqpoet"
 	@echo "# devhelp"
 
 epub:
diff --git a/docs/conf.py b/docs/conf.py
index f4e4ba7..a13fad9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
-# propex documentation build configuration file, created by
-# sphinx-quickstart on Sat Mar 14 20:54:34 2015.
+# seqpoet documentation build configuration file, created by
+# sphinx-quickstart on Tue Apr 21 20:33:27 2015.
 #
 # This file is execfile()d with the current directory set to its
 # containing dir.
@@ -48,7 +48,7 @@
 master_doc = 'index'
 
 # General information about the project.
-project = u'propex'
+project = u'seqpoet'
 copyright = u'2015, Niklas Mähler'
 author = u'Niklas Mähler'
 
@@ -59,7 +59,7 @@
 # The short X.Y version.
 import pkg_resources
 try:
-    release = pkg_resources.get_distribution('propex').version
+    release = pkg_resources.get_distribution('seqpoet').version
 except pkg_resources.DistributionNotFound:
     print 'To build the documentation, The distribution information of sandman'
     print 'Has to be available.  Either install the package into your'
@@ -211,7 +211,7 @@
 #html_search_scorer = 'scorer.js'
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'propexdoc'
+htmlhelp_basename = 'seqpoetdoc'
 
 # -- Options for LaTeX output ---------------------------------------------
 
@@ -233,7 +233,7 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  (master_doc, 'propex.tex', u'propex Documentation',
+  (master_doc, 'seqpoet.tex', u'seqpoet Documentation',
    u'Author', 'manual'),
 ]
 
@@ -263,7 +263,7 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'propex', u'propex Documentation',
+    (master_doc, 'seqpoet', u'seqpoet Documentation',
      [author], 1)
 ]
 
@@ -277,8 +277,8 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  (master_doc, 'propex', u'propex Documentation',
-   author, 'propex', 'One line description of project.',
+  (master_doc, 'seqpoet', u'seqpoet Documentation',
+   author, 'seqpoet', 'One line description of project.',
    'Miscellaneous'),
 ]
 
diff --git a/docs/index.rst b/docs/index.rst
index d49a044..21216f4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,17 +1,17 @@
-.. propex documentation master file, created by
-   sphinx-quickstart on Sat Mar 14 20:54:34 2015.
+.. seqpoet documentation master file, created by
+   sphinx-quickstart on Tue Apr 21 20:33:27 2015.
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-propex: Prokaryotic Operon Extractor
-====================================
+seqpoet: *In silico* PCR and operon extraction for genomic assemblies
+=====================================================================
 
-The main purpose of propex is to provide a simple interface for `in silico`
-PCR and operon extraction in prokaryotes. The secondary purpose of propex is
-to be a Python package that can be used for handling sequence data in the form
-of FASTA and GenBank files.
+The main purpose of seqpoet is to provide a simple interface for `in silico`
+PCR and operon extraction for genomic assemblies. The secondary purpose of
+seqpoet is to be a Python package that can be used for handling sequence
+data in the form of FASTA and GenBank files.
 
-Source code is hosted on GitHub: https://github.com/maehler/propex.
+Source code is hosted on GitHub: https://github.com/maehler/seqpoet.
 
 Contents:
 
@@ -21,7 +21,7 @@ Contents:
    installation
    insilico_pcr
    operon_extraction
-   propex
+   seqpoet
 
 
 Indices and tables
diff --git a/docs/installation.rst b/docs/installation.rst
index e9bf325..7a78f42 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -1,11 +1,11 @@
 Installation
 ============
 
-Currently, the easiest way of installing propex is to clone the GitHub
+Currently, the easiest way of installing seqpoet is to clone the GitHub
 repository and install it manually::
 
-    > git clone https://github.com/maehler/propex.git
-    > cd propex
+    > git clone https://github.com/maehler/seqpoet.git
+    > cd seqpoet
     > python setup.py install
 
 Eventually the package will be submitted to `PyPI <https://pypi.python.org/pypi>`_.
diff --git a/docs/make.bat b/docs/make.bat
index 38b06ba..abe8a02 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -127,9 +127,9 @@ if "%1" == "qthelp" (
 	echo.
 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
 .qhcp project file in %BUILDDIR%/qthelp, like this:
-	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\propex.qhcp
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\seqpoet.qhcp
 	echo.To view the help file:
-	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\propex.ghc
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\seqpoet.ghc
 	goto end
 )
 
diff --git a/docs/propex.rst b/docs/seqpoet.rst
similarity index 57%
rename from docs/propex.rst
rename to docs/seqpoet.rst
index a4f75e1..5ac4df9 100644
--- a/docs/propex.rst
+++ b/docs/seqpoet.rst
@@ -1,37 +1,37 @@
-propex package documentation
-============================
+seqpoet package
+===============
 
 Submodules
 ----------
 
-propex.fasta module
--------------------
+seqpoet.fasta module
+--------------------
 
-.. automodule:: propex.fasta
+.. automodule:: seqpoet.fasta
     :members:
     :undoc-members:
     :show-inheritance:
 
-propex.genbank module
----------------------
+seqpoet.genbank module
+----------------------
 
-.. automodule:: propex.genbank
+.. automodule:: seqpoet.genbank
     :members:
     :undoc-members:
     :show-inheritance:
 
-propex.search module
---------------------
+seqpoet.search module
+---------------------
 
-.. automodule:: propex.search
+.. automodule:: seqpoet.search
     :members:
     :undoc-members:
     :show-inheritance:
 
-propex.sequence module
-----------------------
+seqpoet.sequence module
+-----------------------
 
-.. automodule:: propex.sequence
+.. automodule:: seqpoet.sequence
     :members:
     :undoc-members:
     :show-inheritance:
@@ -40,7 +40,7 @@ propex.sequence module
 Module contents
 ---------------
 
-.. automodule:: propex
+.. automodule:: seqpoet
     :members:
     :undoc-members:
     :show-inheritance:

From bf85818eb100a45983d606157cd4f707e99f7d0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 21:31:16 +0200
Subject: [PATCH 26/40] Fix neighbor bug

Before, the search could fall over the edge and start from the
beginning, resulting in an infinite loop.
---
 seqpoet/genbank.py            |  2 ++
 seqpoet/tests/test_genbank.py | 36 ++++++++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py
index 7c867e5..9f0a6a9 100644
--- a/seqpoet/genbank.py
+++ b/seqpoet/genbank.py
@@ -528,6 +528,8 @@ def _neighbor(self, feature, downstream=True):
                 findex -= 1
             else:
                 findex += 1
+            if findex >= len(self.features[ftype]) or findex < 0:
+                return None
 
         return self.features[ftype][findex]
 
diff --git a/seqpoet/tests/test_genbank.py b/seqpoet/tests/test_genbank.py
index 6f8a46c..5b4c8d4 100644
--- a/seqpoet/tests/test_genbank.py
+++ b/seqpoet/tests/test_genbank.py
@@ -1,6 +1,7 @@
 from nose.tools import raises
 from nose.plugins.skip import SkipTest
 import os
+import tempfile
 
 import seqpoet
 from seqpoet.genbank import Location, JoinLocation
@@ -18,7 +19,7 @@ def test_sequence_length(self):
     def test_mRNA(self):
         assert len(self.gb[0].features['mRNA']) == 3
 
-    def test_next_downstream(self):
+    def test_neighbors(self):
         locus = self.gb[0]
         gbf = locus.features['mRNA'][0]
         assert gbf is not None
@@ -27,6 +28,39 @@ def test_next_downstream(self):
         assert next is not None
         assert str(next.location) == '<687..>3158'
 
+        # Weird issue of alternating results when selecting next
+        # downstream
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.write('''LOCUS testlocus 5758 bp  DNA linear  12-APR-2015
+FEATURES            Location/qualifiers
+    source          1..5758
+    CDS             7..693
+    CDS             697..3303
+    CDS             complement(3381..4166)
+    CDS             complement(4167..5516)
+    ORIGIN
+//''')
+        temp.close()
+        gbfile = temp.name
+
+        gb = seqpoet.GenBank(gbfile)
+
+        locus = gb[0]
+
+        gbf = locus.features_at_location(Location('4170'))[0]
+        assert gbf.location.is_complement
+        assert str(gbf.location) == 'complement(4167..5516)'
+
+        ds = locus.next_downstream(gbf)
+        assert ds.location.is_complement
+        assert str(ds.location) == 'complement(3381..4166)'
+
+        ds = locus.next_downstream(ds)
+        assert ds is None, 'should be None, found feature at {0}' \
+            .format(ds.location)
+
+        os.unlink(gbfile)
+
     def test_header(self):
         header = self.gb[0].header
 

From 3752b8fd9b01339820217a3e11f2f2977301441c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 21:48:17 +0200
Subject: [PATCH 27/40] Fix for very short headers

---
 seqpoet/genbank.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py
index 9f0a6a9..4650326 100644
--- a/seqpoet/genbank.py
+++ b/seqpoet/genbank.py
@@ -655,7 +655,10 @@ def _parse_header(self, hstring):
         }
 
         last_key = None
-        line = header_lines.next()
+        try:
+            line = header_lines.next()
+        except StopIteration:
+            return head_data
         while True:
             if line[0] != ' ':
                 key = line[:11].strip()

From e44167500a0575d9fa6a456f126a596aad4dd5c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 21:53:24 +0200
Subject: [PATCH 28/40] Add upstream/downstream extension arguments, close #4

---
 bin/seqpoet | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/bin/seqpoet b/bin/seqpoet
index 7e7e798..48cc8a8 100644
--- a/bin/seqpoet
+++ b/bin/seqpoet
@@ -203,12 +203,20 @@ def match_primer(primers, seqs, mismatches=2,
 
     return matches
 
-def find_operon(matches, seqs, max_distance=500, no_revcomp=False):
+def find_operon(matches, seqs, max_distance=500, no_revcomp=False,
+        extend_downstream=0, extend_upstream=0):
     match_operon = []
     for m in matches:
         gb = seqs[m['filename']]
         locus = gb[m['seqindex']]
-        location = seqpoet.genbank.Location.from_int(m['hitstart'], m['hitend'])
+        if m['strand'] == '+':
+            location = seqpoet.genbank.Location.from_int(
+                max(1, m['hitstart'] - extend_upstream),
+                m['hitend'] + extend_downstream)
+        else:
+            location = seqpoet.genbank.Location.from_int(
+                max(1, m['hitstart'] - extend_downstream),
+                m['hitend'] + extend_upstream)
         features = locus.features_at_location(location)
         if len(features) == 0:
             print('WARNING: no gene for match in locus {0}'.format(m['seqname']),
@@ -315,6 +323,14 @@ def parse_args():
         'results on the minus strand (default: do reverse complementation)'),
         action='store_true')
 
+    parser.add_argument('--downstream', help=('extend probe/primer match '
+        '%(metavar)s bases downstream for operon finding (default: '
+        '%(default)s)'), metavar='N', default=0, type=int)
+
+    parser.add_argument('--upstream', help=('extend probe/primer match '
+        '%(metavar)s bases upstream for operon finding (default: '
+        '%(default)s)'), metavar='N', default=0, type=int)
+
     parser.add_argument('-o', '--out', help='file for output (default: stdout)',
         default=sys.stdout)
 
@@ -338,7 +354,8 @@ def parse_args():
         if not os.path.exists(os.path.dirname(args.out)):
             parser.error('file or directory not found: {}'.format(args.out))
 
-    # Mismatches, distance and max/min product length should be integers >= 0
+    # Mismatches, distance, max/min product length and upstream/downstream
+    # should be integers >= 0
     if args.mismatches < 0:
         parser.error('mismatches must not be negative')
     if args.max_distance < 0:
@@ -347,6 +364,10 @@ def parse_args():
         parser.error('minimum product length must not be negative')
     if args.max_product < 0:
         parser.error('maximum product length must not be negative')
+    if args.downstream < 0:
+        parser.error('downstream extension must not be negative')
+    if args.upstream < 0:
+        parser.error('upstream extension must not be negative')
 
     return args
 
@@ -390,7 +411,8 @@ def main():
     # Operon extraction
     print('Looking for operons', file=sys.stderr)
     match_features = find_operon(matches, seqs, max_distance=args.max_distance,
-        no_revcomp=args.no_revcomp)
+        no_revcomp=args.no_revcomp, extend_downstream=args.downstream,
+        extend_upstream=args.upstream)
 
     if len(match_features) == 0:
         print('WARNING: no operons found', file=sys.stderr)

From cc07bc29fc0408bd23b6755719221354bfa23c82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 21:55:08 +0200
Subject: [PATCH 29/40] Ignore compiled script

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index de16f90..0125fc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@
 /.coverage
 /docs/_build
 /data
-/bin/propexc
+/bin/seqpoetc

From d563fb0a9cd4e3d0bce0559cd1a26ba15c5f1a73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Tue, 21 Apr 2015 21:55:32 +0200
Subject: [PATCH 30/40] Add __repr__ for GenBankFeature

---
 seqpoet/genbank.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py
index 4650326..4620062 100644
--- a/seqpoet/genbank.py
+++ b/seqpoet/genbank.py
@@ -391,14 +391,17 @@ def get_qualifier(self, qualifier_name):
 
         :param qualifier_name: a string representing a qualifier.
         :returns: the value of the qualifier.
-        :raises: :py:class:`KeyError` if the feature does not have a qualifier called
-                      ``qualifier_name``.
+        :raises: :py:class:`KeyError` if the feature does not have a qualifier
+                 called ``qualifier_name``.
         """
         if qualifier_name not in self.qualifiers:
             raise KeyError('{0} is not a qualifier for {1}'
                 .format(qualifier_name, self))
         return self.qualifiers[qualifier_name]
 
+    def __repr__(self):
+        return '<GenBankFeature on {0} at {1}>'.format(self.locus, self.location)
+
 class GenBankLocus(object):
 
     """Represent a GenBank locus.

From e52e800870026b0f6a310c20b8ee35c8c8a91a2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Thu, 23 Apr 2015 18:39:57 +0200
Subject: [PATCH 31/40] Fix operon finding bug

A bug in the operon extraction could in some cases result in
features from the opposite strand being returned.
---
 bin/seqpoet                  |  6 ++-
 seqpoet/tests/test_script.py | 95 ++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 seqpoet/tests/test_script.py

diff --git a/bin/seqpoet b/bin/seqpoet
index 48cc8a8..2029700 100644
--- a/bin/seqpoet
+++ b/bin/seqpoet
@@ -217,11 +217,15 @@ def find_operon(matches, seqs, max_distance=500, no_revcomp=False,
             location = seqpoet.genbank.Location.from_int(
                 max(1, m['hitstart'] - extend_downstream),
                 m['hitend'] + extend_upstream)
-        features = locus.features_at_location(location)
+
+        features = filter(lambda x: x.location.is_complement == \
+            (m['strand'] == '-'),  locus.features_at_location(location))
+
         if len(features) == 0:
             print('WARNING: no gene for match in locus {0}'.format(m['seqname']),
                 file=sys.stderr)
             continue
+
         operon_genes = []
         for f in features:
             # Find upstream genes
diff --git a/seqpoet/tests/test_script.py b/seqpoet/tests/test_script.py
new file mode 100644
index 0000000..3147ccb
--- /dev/null
+++ b/seqpoet/tests/test_script.py
@@ -0,0 +1,95 @@
+import imp
+import os
+
+from nose.plugins.skip import SkipTest
+
+import seqpoet
+
+currentdir = os.path.dirname(__file__)
+rootdir = os.path.dirname(os.path.dirname(currentdir))
+bindir = os.path.join(rootdir, 'bin')
+
+seqpoet_script = imp.load_source('seqpoet_script',
+	os.path.join(bindir, 'seqpoet'))
+
+class TestFindOperon:
+
+	def setup(self):
+		gb_dir = ('/Users/niklasm/Dropbox/operon_extractor/data_genbank')
+		gb_fname = os.path.join(gb_dir, 'LMG718-cremoris.gb')
+		if not os.path.exists(gb_fname):
+			raise SkipTest
+
+		gb = seqpoet.genbank.GenBank(gb_fname)
+		self.seqs = {
+			gb_fname: gb
+		}
+		self.matches = [{
+			'filename': ('/Users/niklasm/Dropbox/operon_extractor/'
+				'data_genbank/LMG718-cremoris.gb'),
+			'hitend': 3360,
+			'hitstart': 3311,
+			'length': 50,
+			'seq': seqpoet.sequence.Sequence(
+				'aattttactgatagctttttaaaaaataaaaaaaattactgacagaaatt'),
+			'seqindex': 61,
+			'seqname': '718_Contig_156_c',
+			'strand': '+'
+		}]
+		self.minus_matches = [{
+			'filename': '/Users/niklasm/Dropbox/operon_extractor/data_genbank/LMG718-cremoris.gb',
+			'hitend': 3360,
+			'hitstart': 3311,
+			'length': 50,
+			'seq': seqpoet.sequence.Sequence(
+				'aatttctgtcagtaattttttttattttttaaaaagctatcagtaaaatt'),
+			'seqindex': 61,
+			'seqname': '718_Contig_156_c',
+			'strand': '-'
+		}]
+
+	def test_operon_find(self):
+		res = seqpoet_script.find_operon(self.matches, self.seqs,
+			max_distance=500, no_revcomp=False, extend_downstream=0,
+			extend_upstream=0)
+		assert len(res) == 0
+
+	def test_operon_find_extend_upstream(self):
+		res = seqpoet_script.find_operon(self.matches, self.seqs,
+			max_distance=500, no_revcomp=False, extend_downstream=0,
+			extend_upstream=10)
+		assert len(res) == 1, 'expected 1 result, got {0}'.format(len(res))
+		assert len(res[0]['operon']) == 2
+
+		operon_len = len(res[0]['seq'])
+		assert operon_len == 3296, 'length is {0}'.format(operon_len)
+
+	def test_operon_find_extend_downstream(self):
+		res = seqpoet_script.find_operon(self.matches, self.seqs,
+			max_distance=500, no_revcomp=False, extend_downstream=100,
+			extend_upstream=0)
+		assert len(res) == 0, 'expected no results, got {0}'.format(len(res))
+
+	def test_revcomp_operon_find(self):
+		res = seqpoet_script.find_operon(self.matches, self.seqs,
+			max_distance=500, no_revcomp=False, extend_downstream=0,
+			extend_upstream=0)
+		assert len(res) == 0
+
+	def test_revcomp_operon_find_extend_downstream(self):
+		res = seqpoet_script.find_operon(self.minus_matches, self.seqs,
+			max_distance=500, no_revcomp=False, extend_downstream=100,
+			extend_upstream=0)
+		assert len(res) == 0
+
+	def test_revcomp_operon_find_extend_upstream(self):
+		res = seqpoet_script.find_operon(self.minus_matches, self.seqs,
+			max_distance=500, no_revcomp=False, extend_downstream=0,
+			extend_upstream=100)
+		assert len(res) == 1
+		assert len(res[0]['operon']) == 2
+
+		assert all(x.location.is_complement for x in res[0]['operon'])
+
+		operon_len = len(res[0]['seq'])
+		assert operon_len == 2135, 'length is {0}'.format(operon_len)

From 6201fd939967be0f2bf6ef6b4d977d2a44bca284 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sat, 25 Apr 2015 18:47:12 +0200
Subject: [PATCH 32/40] No side scrolling in tables

---
 docs/_static/style.css      | 9 +++++++++
 docs/_templates/layout.html | 3 +++
 2 files changed, 12 insertions(+)
 create mode 100644 docs/_static/style.css
 create mode 100644 docs/_templates/layout.html

diff --git a/docs/_static/style.css b/docs/_static/style.css
new file mode 100644
index 0000000..52b598f
--- /dev/null
+++ b/docs/_static/style.css
@@ -0,0 +1,9 @@
+.wy-table-responsive table td, .wy-table-responsive table th {
+    white-space: normal;
+}
+
+.wy-table-responsive {
+    margin-bottom: 24px;
+    max-width: 100%;
+    overflow: visible;
+}
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
new file mode 100644
index 0000000..75c902f
--- /dev/null
+++ b/docs/_templates/layout.html
@@ -0,0 +1,3 @@
+{# layout.html #}
+{% extends "!layout.html" %}
+{% set css_files = css_files + ['_static/style.css'] %}

From c521bef07ba4110a716334b7a747dfe7ab4f6bff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sat, 25 Apr 2015 18:49:09 +0200
Subject: [PATCH 33/40] Correct theme handling on readthedocs

---
 docs/conf.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index a13fad9..36c517e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -119,7 +119,16 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
+
+# on_rtd is whether we are on readthedocs.org
+import os
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+
+if not on_rtd:  # only import and set the theme if we're building docs locally
+    import sphinx_rtd_theme
+    html_theme = 'sphinx_rtd_theme'
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the

From 2d1f8cc62ff5862d72b0dba54a038b97ef8a8de7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sat, 25 Apr 2015 21:30:11 +0200
Subject: [PATCH 34/40] Add command line docs

In addition to the docs, I changed the meta variables for the
optional arguments to be (in my opinion) clearer.
---
 bin/seqpoet           | 14 +++++++-------
 docs/command_line.rst | 40 ++++++++++++++++++++++++++++++++++++++++
 docs/index.rst        |  1 +
 3 files changed, 48 insertions(+), 7 deletions(-)
 create mode 100644 docs/command_line.rst

diff --git a/bin/seqpoet b/bin/seqpoet
index 2029700..6e41f7b 100644
--- a/bin/seqpoet
+++ b/bin/seqpoet
@@ -309,19 +309,19 @@ def parse_args():
     parser.add_argument('-m', '--mismatches', help=('the maximum number of '
         'mismatches allowed when aligning probe/primer to the genome '
         '(default: %(default)d)'),
-        type=int, default=2, metavar='N')
+        type=int, default=2, metavar='int')
 
     parser.add_argument('-d', '--max-distance', help=('the maximum intergenic '
         'distance allowed when assembling operons (default: %(default)d)'),
-        type=int, default=500, metavar='N')
+        type=int, default=500, metavar='int')
 
     parser.add_argument('--min-product', help=('minimum PCR product length '
         'to consider (default: %(default)d)'), type=int, default=0,
-        metavar='N')
+        metavar='int')
 
     parser.add_argument('--max-product', help=('maximum PCR product length '
         'to consider (default: %(default)d)'), type=int, default=3000,
-        metavar='N')
+        metavar='int')
 
     parser.add_argument('--no-revcomp', help=('don\'t reverse complement '
         'results on the minus strand (default: do reverse complementation)'),
@@ -329,14 +329,14 @@ def parse_args():
 
     parser.add_argument('--downstream', help=('extend probe/primer match '
         '%(metavar)s bases downstream for operon finding (default: '
-        '%(default)s)'), metavar='N', default=0, type=int)
+        '%(default)s)'), metavar='int', default=0, type=int)
 
     parser.add_argument('--upstream', help=('extend probe/primer match '
         '%(metavar)s bases upstream for operon finding (default: '
-        '%(default)s)'), metavar='N', default=0, type=int)
+        '%(default)s)'), metavar='int', default=0, type=int)
 
     parser.add_argument('-o', '--out', help='file for output (default: stdout)',
-        default=sys.stdout)
+        default=sys.stdout, metavar='file')
 
     parser.add_argument('--version', help=('print version and exit'),
         action='version', version='%(prog)s v{0}'.format(seqpoet.__version__))
diff --git a/docs/command_line.rst b/docs/command_line.rst
new file mode 100644
index 0000000..e18f67a
--- /dev/null
+++ b/docs/command_line.rst
@@ -0,0 +1,40 @@
+Command line arguments
+======================
+
+::
+
+    seqpoet [options] genomedir probe
+
+Mandatory arguments
+-------------------
+
+=============   =======================================================
+genomedir       directory containing the genome files to use (FASTA or
+                GenBank format) or a single GenBank or FASTA file
+probe           file containing either a single sequence (probe) or a
+                pair of sequences (primer pair; one sequence per line)
+=============   =======================================================
+
+Optional arguments
+------------------
+
+-h, --help            show this help message and exit
+--pcr                 only perform in silico PCR. Requires that the probe
+                      file contains a primer pair (default: perform operon
+                      extraction)
+-m int, --mismatches int
+                      the maximum number of mismatches allowed when aligning
+                      probe/primer to the genome (default: 2)
+-d int, --max-distance int
+                      the maximum intergenic distance allowed when
+                      assembling operons (default: 500)
+--min-product int     minimum PCR product length to consider (default: 0)
+--max-product int     maximum PCR product length to consider (default: 3000)
+--no-revcomp          don't reverse complement results on the minus strand
+                      (default: do reverse complementation)
+--downstream int      extend probe/primer match int bases downstream for
+                      operon finding (default: 0)
+--upstream int        extend probe/primer match int bases upstream for
+                      operon finding (default: 0)
+-o file, --out file   file for output (default: stdout)
+--version             print version and exit
diff --git a/docs/index.rst b/docs/index.rst
index 21216f4..5b3d04a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,6 +19,7 @@ Contents:
    :maxdepth: 4
 
    installation
+   command_line
    insilico_pcr
    operon_extraction
    seqpoet

From e076d96fd11d74c9e515881fc5e3a3d0ffdd67a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sat, 25 Apr 2015 21:54:40 +0200
Subject: [PATCH 35/40] Add quick start page

---
 docs/index.rst      |  1 +
 docs/quickstart.rst | 58 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 docs/quickstart.rst

diff --git a/docs/index.rst b/docs/index.rst
index 5b3d04a..cf7c54a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,6 +19,7 @@ Contents:
    :maxdepth: 4
 
    installation
+   quickstart
    command_line
    insilico_pcr
    operon_extraction
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
new file mode 100644
index 0000000..f03523c
--- /dev/null
+++ b/docs/quickstart.rst
@@ -0,0 +1,58 @@
+Quick start
+===========
+
+Operon extraction
+-----------------
+
+.. code-block:: bash
+
+	seqpoet --out output.fa input_directory probe.txt
+	seqpoet --out output.fa input_directory primers.txt
+	seqpoet --out output.fa input.gb probe.txt
+	seqpoet --out output.fa input.gb primers.txt
+
+The file ``input.gb`` should be a valid GenBank file (possibly with
+multiple loci) and ``probe.txt`` should contain either a single nucleotide
+sequence (*i.e.* a probe) or two nucleotide sequences (*i.e.* a primer pair).
+Instead of supplying a single file, a directory of sequence files can be used
+as the first argument.
+
+Annotations are needed for the operon extraction, and currently GenBank
+is the only supported format for this. The FASTA file ``output.fa`` will
+contain the extracted sequences. If the ``--out`` argument is not supplied,
+the results are written to stdout.
+
+*In silico* PCR
+---------------
+
+.. code-block:: bash
+
+	seqpoet --pcr --out output.fa input.gb primers.txt
+	seqpoet --pcr --out output.fa input.fa primers.txt
+
+
+For *in silico* PCR, only primer pairs are supported, but the sequence input
+can be either FASTA or GenBank. The FASTA file ``output.fa`` will contain the
+predicted PCR products. If the ``--out`` argument is not supplied,
+the results are written to stdout.
+
+Output
+------
+
+The output from both operon extraction and *in silico* PCR will be a FASTA
+file. The header line for each result sequence is a colon-separated string
+and will look something like this:
+
+::
+
+	>input.gb:locus:3451:3812:28:+
+
+- ``input.gb``: the original file that the sequence originates from
+- ``locus``: the name of the sequence in the original file, either from a FASTA
+  header or a GenBank locus name
+- ``3451``: the position in the original sequence of the first nucleotide in
+  the result sequence
+- ``3812``: the position in the original sequence of the last nucleotide in the
+  result sequence
+- ``28``: the length of the original match
+- ``+``: the sequence was found on the plus strand (otherwise ``-``)

From 3483110370c41203dbb15bb29396b5ba83f41b47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sat, 25 Apr 2015 22:09:32 +0200
Subject: [PATCH 36/40] Use quick start for examples

---
 docs/index.rst             | 2 --
 docs/insilico_pcr.rst      | 2 --
 docs/operon_extraction.rst | 2 --
 3 files changed, 6 deletions(-)
 delete mode 100644 docs/insilico_pcr.rst
 delete mode 100644 docs/operon_extraction.rst

diff --git a/docs/index.rst b/docs/index.rst
index cf7c54a..3b65f63 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,8 +21,6 @@ Contents:
    installation
    quickstart
    command_line
-   insilico_pcr
-   operon_extraction
    seqpoet
 
 
diff --git a/docs/insilico_pcr.rst b/docs/insilico_pcr.rst
deleted file mode 100644
index 7626b11..0000000
--- a/docs/insilico_pcr.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-`In silico` PCR
-===============
diff --git a/docs/operon_extraction.rst b/docs/operon_extraction.rst
deleted file mode 100644
index ebf4371..0000000
--- a/docs/operon_extraction.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Operon extraction
-=================

From eb2b8b159076c7d88e4aca5cbbbfbac59a8da3c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sat, 25 Apr 2015 22:13:39 +0200
Subject: [PATCH 37/40] Add requirements

---
 docs/installation.rst | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index 7a78f42..17d59b6 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -1,5 +1,7 @@
-Installation
-============
+Installation and requirements
+=============================
+
+The only requirement is Python 2.7. Unfortunately, Python 3 is not supported.
 
 Currently, the easiest way of installing seqpoet is to clone the GitHub
 repository and install it manually::
@@ -8,4 +10,5 @@ repository and install it manually::
     > cd seqpoet
     > python setup.py install
 
-Eventually the package will be submitted to `PyPI <https://pypi.python.org/pypi>`_.
+Eventually the package will be submitted to
+`PyPI <https://pypi.python.org/pypi>`_.

From 16c92a20dbb206cbf17f2f053c31d9b55edb32bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 26 Apr 2015 10:20:21 +0200
Subject: [PATCH 38/40] Doc formatting

---
 seqpoet/fasta.py    | 14 +++++++++-----
 seqpoet/genbank.py  | 34 +++++++++++++++++++---------------
 seqpoet/sequence.py |  4 ++++
 3 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/seqpoet/fasta.py b/seqpoet/fasta.py
index eb0d50e..500d9fa 100644
--- a/seqpoet/fasta.py
+++ b/seqpoet/fasta.py
@@ -1,5 +1,9 @@
+#-*- encoding: utf-8 -*-
 """The FASTA module contains classes for accessing FASTA files and
 FASTA index files.
+
+.. module:: fasta
+.. moduleauthor:: Niklas Mähler <niklas.mahler@gmail.com>
 """
 
 import collections
@@ -34,11 +38,11 @@ def parse_index(self):
             an OrderedDict with sequence names (headers) as keys and
             dicts as values. The value dicts have the following members:
 
-            name:    the sequence name (FASTA header line)
-            length:  sequence length
-            offset:  the byte offset of the first base of the sequence
-            nbase:   number of bases per line of sequence
-            linelen: number of bytes per line of sequence
+            - name:    the sequence name (FASTA header line)
+            - length:  sequence length
+            - offset:  the byte offset of the first base of the sequence
+            - nbase:   number of bases per line of sequence
+            - linelen: number of bytes per line of sequence
         :raises:
             ValueError if the file cannot be parsed, if the file contains
             duplicated headers or if the file is empty.
diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py
index 4620062..d3108b1 100644
--- a/seqpoet/genbank.py
+++ b/seqpoet/genbank.py
@@ -154,11 +154,12 @@ class Location(object):
     of the bases between two positions.
 
     **Class attributes:**
-        * **locstring:** the string representation of the location.
-        * **loctype:** the type of the location.
-        * **start:** the start position (0-based, including).
-        * **end:** the end position (0-based, including).
-        * **is_complement:** boolean indicating whether the position represents
+
+        - **locstring:** the string representation of the location.
+        - **loctype:** the type of the location.
+        - **start:** the start position (0-based, including).
+        - **end:** the end position (0-based, including).
+        - **is_complement:** boolean indicating whether the position represents
           the complement of the sequence.
 
     :param locstring: a GenBank location string.
@@ -293,10 +294,11 @@ class GenBankFeature(object):
     """Represent a GenBank feature.
 
     **Class attributes:**
-        * **feature_type**: a string with the feature key.
-        * **location**: a Location object representing the location of
-                        the feature.
-        * **qualifiers**: a dictionary of qualifiers of the feature.
+
+        - **feature_type**: a string with the feature key.
+        - **location**: a Location object representing the location of
+          the feature.
+        - **qualifiers**: a dictionary of qualifiers of the feature.
 
     :param locus: the name of the locus that the feature belongs to.
     :param feature_type: name of the feature.
@@ -407,9 +409,10 @@ class GenBankLocus(object):
     """Represent a GenBank locus.
 
     **Class attributes:**
-        * **name:** locus name.
-        * **seq:** a Sequence object with the sequence of the locus.
-        * **features:** a dictionary containing the features of the locus.
+
+        - **name:** locus name.
+        - **seq:** a Sequence object with the sequence of the locus.
+        - **features:** a dictionary containing the features of the locus.
 
     :param name: the name of the locus.
     :param seq: a Sequence object representing the sequence of the locus.
@@ -543,9 +546,10 @@ class GenBank(object):
 
     """Represent a GenBank file.
 
-    Class attributes:
-        * filename: the filename of the GenBank file.
-        * index: a list of dictionaries representing an index of the file.
+    **Class attributes:**
+
+        - filename: the filename of the GenBank file.
+        - index: a list of dictionaries representing an index of the file.
 
     :param fname: filename of the GenBank file.
     :raises: :py:exc:`.ParsingError` if parsing fails.
diff --git a/seqpoet/sequence.py b/seqpoet/sequence.py
index ad43b85..fc62217 100644
--- a/seqpoet/sequence.py
+++ b/seqpoet/sequence.py
@@ -1,4 +1,8 @@
+#-*- encoding: utf-8 -*-
 """Classes and functions for representing DNA sequences.
+
+.. module:: sequence
+.. moduleauthor:: Niklas Mähler <niklas.mahler@gmail.com>
 """
 
 import re

From 7e7d36f9337fc662de293f66fc6b0fdad6018b93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 26 Apr 2015 10:38:10 +0200
Subject: [PATCH 39/40] Add link to quickstart, close #8

---
 docs/index.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/index.rst b/docs/index.rst
index 3b65f63..ba90982 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,6 +11,8 @@ PCR and operon extraction for genomic assemblies. The secondary purpose of
 seqpoet is to be a Python package that can be used for handling sequence
 data in the form of FASTA and GenBank files.
 
+To get started quickly, take a look at the guide in `Quick start <quickstart.html>`_.
+
 Source code is hosted on GitHub: https://github.com/maehler/seqpoet.
 
 Contents:

From 78ee5416df9e481629748f2379b05f926409c4a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= <niklas.mahler@nmbu.no>
Date: Sun, 26 Apr 2015 10:40:59 +0200
Subject: [PATCH 40/40] Version bump

---
 seqpoet/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/seqpoet/__init__.py b/seqpoet/__init__.py
index 7b4ce77..1ead78f 100644
--- a/seqpoet/__init__.py
+++ b/seqpoet/__init__.py
@@ -3,4 +3,4 @@
 from sequence import Sequence
 import search
 
-__version__ = '0.2.0'
+__version__ = '0.3.0'